# Synthetic Label Generation

with `deepseek-r1-distill-qwen-7b`


## Hackernews data

https://huggingface.co/datasets/julien040/hacker-news-posts


In [1]:
import os

import polars as pl
from huggingface_hub import snapshot_download

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # turn on HF_TRANSFER

files = snapshot_download(
    repo_id="julien040/hacker-news-posts",
    allow_patterns=["*.parquet"],
    repo_type="dataset",
)

df = pl.scan_parquet(files)
df.head(1).collect()

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

id,title,url,score,time,comments,author
i32,str,str,i32,i32,i32,str
3404047,"""Copyright Office Seeks To Make…","""http://www.techdirt.com/articl…",31,1325175114,2,"""nextparadigms"""


In [2]:
# First, let's check our data
print("Min score:", df.select(pl.col("score")).min().collect().item())
print("Max score:", df.select(pl.col("score")).max().collect().item())

# Then create the distribution
score_distribution = (
    df.select(pl.col("score"))
    .with_columns(
        pl.col("score")
        .cut(
            breaks=[300, 500, 1000],
            labels=[
                "300-500",
                "501-1000",
                "1001-2000",
                "2000+",
            ],
        )
        .alias("score_range")
    )
    .group_by("score_range")
    .len()
    .sort("score_range")
)

score_distribution.collect()

Min score: -1
Max score: 6015


score_range,len
cat,u32
"""300-500""",3977967
"""501-1000""",21990
"""1001-2000""",9347
"""2000+""",1653


In [3]:
df = df.filter(pl.col("score") >= 300).collect()

examples = df.sample(n=10, seed=100).select("title").to_series().to_list()
examples

['Microsoft no longer signs Windows drivers for Process Hacker',
 "Nasa's Voyager 2 probe 'leaves the Solar System'",
 'Dart language',
 'Please stop asking how to find a technical co-founder.',
 'GitJournal: Mobile first Markdown notes synchronized with Git',
 'M1 Mac owners are experiencing high SSD writes over short periods of time',
 'The Website Obesity Crisis',
 'I converted my demoscene font collection to PNG and put it on GitHub',
 'Learn how to unleash the full potential of the type system of TypeScript',
 'Who wrote this shit?']

## structured generation


In [4]:
from enum import Enum
from typing import Dict, Union
from pydantic import BaseModel, constr


class HackerNewsLabel(str, Enum):
    DEV = "dev"  # Programming, software development
    WEB = "web"  # Web technologies, browsers, frameworks
    AI_ML = "ai_ml"  # AI, ML, data science
    INFRA = "infra"  # Infrastructure, cloud, DevOps
    HARDWARE = "hardware"  # Hardware, electronics, devices
    SECURITY = "security"  # Security, privacy, cybersecurity
    BUSINESS = "business"  # Business, startups, company news
    CAREER = "career"  # Jobs, workplace, professional growth
    SCIENCE = "science"  # Science, research, academic papers
    TOOLS = "tools"  # Development tools, utilities
    ASK_HN = "ask_hn"  # Questions, discussions
    SHOW_HN = "show_hn"  # Project launches, demos
    POLICY = "policy"  # Tech policy, regulation, politics
    CULTURE = "culture"  # Tech culture, society impact
    UNCLEAR = "unclear"  # Vague, too short, or inappropriate content


class HackerNewsClassification(BaseModel):
    explanation: constr(min_length=40)
    label: HackerNewsLabel

## Prompt


In [5]:
def format_text_as_prompt(title: str) -> str:
    categories = """
    1. dev: Programming languages, coding practices, software development techniques
    2. web: Web development, browsers, frontend/backend frameworks, web standards
    3. ai_ml: Artificial intelligence, machine learning, data science
    4. infra: Cloud computing, DevOps, system administration, deployment
    5. hardware: Physical computing devices, components, electronics
    6. security: Cybersecurity, privacy, vulnerabilities, authentication
    7. business: Startups, companies, funding, acquisitions, industry news
    8. career: Job seeking, workplace discussions, professional development
    9. science: Research, space exploration, physics, biology, academic papers
    10. tools: Development tools, utilities, software applications
    11. ask_hn: Questions, advice requests, community discussions
    12. show_hn: Project launches, personal creations, demos
    13. policy: Legal issues, regulations, industry standards
    14. culture: Tech industry trends, social impact, community issues
    15. unclear: Vague titles, insufficient context, inappropriate content
    """

    return f"""Look at the title for the following HackerNews post. Assess what category this post belongs to.

Title: "{title}"

Your role is to classify this HackerNews post title into exactly one category. You should choose out of the following categories: 

Categories: {categories}

Return your reasoning and the label you've chosen as a JSON object like this:
```json
{{"explanation": "2-3 clear sentences explaining why this specific category is the best fit",
    "label": "dev" | "web" | "ai_ml" | "infra" | "hardware" | "security" | "business" | "career" | "science" | "tools" | "ask_hn" | "show_hn" | "policy" | "culture" | "unclear"
}}
```
"""


## Starting the server with LM Studio


https://huggingface.co/lmstudio-community/DeepSeek-R1-Distill-Qwen-7B-GGUF


In [6]:
from openai import OpenAI

client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
client.models.list()

SyncPage[Model](data=[Model(id='deepseek-r1-distill-qwen-7b', created=None, object='model', owned_by='organization_owner'), Model(id='text-embedding-nomic-embed-text-v1.5', created=None, object='model', owned_by='organization_owner')], object='list')

## Generating Labels


In [7]:
messages = [
    {"role": "user", "content": format_text_as_prompt(examples[0])},
]


response = client.beta.chat.completions.parse(
    model="deepseek-r1-distill-qwen-7b",
    messages=messages,
    temperature=0.7,
    response_format=HackerNewsClassification,
)

In [8]:
HackerNewsClassification.model_validate_json(response.choices[0].message.content)

HackerNewsClassification(explanation='... explanation ...”, “label”: ... }<think></think><think>Alright, I need to determine the category for the given Hacker News post title: ', label=<HackerNewsLabel.SECURITY: 'security'>)

In [9]:
def predict_label(
    title: str, model: str = "deepseek-r1-distill-qwen-1.5b", client=client
) -> HackerNewsClassification | None:
    try:
        prompt = format_text_as_prompt(title)
        messages = [
            {"role": "user", "content": prompt},
        ]
        response = client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            temperature=0.7,
            response_format=HackerNewsClassification,
        )
        return HackerNewsClassification.model_validate_json(
            response.choices[0].message.content
        )
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

In [10]:
from rich import print as rich_print

structured_results = []
for example in examples:
    prediction = predict_label(example)
    structured_results.append(prediction)
    rich_print(example)
    rich_print(prediction)
    rich_print("---")

## Room to Think


In [11]:
def predict_label_without_structured(
    title: str, model: str = "deepseek-r1-distill-qwen-1.5b", client=client
) -> str:
    try:
        prompt = format_text_as_prompt(title)
        messages = [
            {"role": "user", "content": prompt},
        ]
        response = client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            temperature=0.7,
        )
        return response.choices[0].message.content

    except Exception as e:
        print(f"Error occurred: {e}")
        return None

In [None]:
from rich import print as rich_print

results = []
for example in examples:
    prediction = predict_label_without_structured(example)
    results.append(prediction)
    rich_print(example)
    rich_print(prediction)
    rich_print("---")

KeyboardInterrupt: 

In [None]:
import contextlib
import re
import json

JSON_PATTERN = re.compile(r"```json\n(.*?)```", re.DOTALL)
DIRECT_JSON_PATTERN = re.compile(r"\{[^}]*\}", re.DOTALL)


def extract_json_from_text(text: str) -> tuple[str, dict | None]:
    if match := JSON_PATTERN.search(text):
        json_results = match.group(1)
        with contextlib.suppress(json.JSONDecodeError):
            return text, json.loads(json_results)
    if match := DIRECT_JSON_PATTERN.search(text):
        json_text = match.group(0)
        with contextlib.suppress(json.JSONDecodeError):
            return text, json.loads(json_text)
    return text, None


extract_json_from_text(results[0])