# Synthetic Label Generation

with `deepseek-r1-distill-qwen-7b`


## Hackernews data

https://huggingface.co/datasets/julien040/hacker-news-posts


In [1]:
import os

import polars as pl
from huggingface_hub import snapshot_download

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # turn on HF_TRANSFER

files = snapshot_download(
    repo_id="julien040/hacker-news-posts",
    allow_patterns=["*.parquet"],
    repo_type="dataset",
)

df = pl.scan_parquet(files)
df.head(1).collect()

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

id,title,url,score,time,comments,author
i32,str,str,i32,i32,i32,str
3404047,"""Copyright Office Seeks To Make…","""http://www.techdirt.com/articl…",31,1325175114,2,"""nextparadigms"""


In [2]:
# First, let's check our data
print("Min score:", df.select(pl.col("score")).min().collect().item())
print("Max score:", df.select(pl.col("score")).max().collect().item())

# Then create the distribution
score_distribution = (
    df.select(pl.col("score"))
    .with_columns(
        pl.col("score")
        .cut(
            breaks=[300, 500, 1000, 2000, 5000],
            labels=[
                "0-300",
                "301-500",
                "501-1000",
                "1001-2000",
                "2001-5000",
                "5000+"
            ],
        )
        .alias("score_range")
    )
    .group_by("score_range")
    .len()
    .sort("score_range")
)

score_distribution.collect()

Min score: -1
Max score: 6015


score_range,len
cat,u32
"""0-300""",3977967
"""301-500""",21990
"""501-1000""",9347
"""1001-2000""",1488
"""2001-5000""",163
"""5000+""",2


In [3]:
df = df.filter(
    (pl.col("score") >= 500) & ~pl.col("title").str.contains("Show HN|Ask HN")
).collect()

examples = df.sample(n=10, seed=500).select("title").to_series().to_list()
examples

['Steve Jobs has passed away.',
 "Sellers printing counterfeit books and selling under Amazon's brand",
 'What does the ??!??! operator do in C?',
 'React.js Introduction for People Who Know Just Enough JQuery',
 'Email a Dumpster Fire',
 'A Billionaire Mathematician’s Life of Ferocious Curiosity',
 "NSA shares raw intelligence including Americans' data with Israel",
 'Washington Post Is First Paper to Call for Prosecution of Its Own Source',
 'Uber Fires Anthony Levandowski',
 'AlphaGo Zero: Learning from scratch']

## structured generation


In [4]:
from enum import Enum
from typing import Dict, Union
from pydantic import BaseModel, constr


class HackerNewsLabel(str, Enum):
    DEV = "dev"  # Programming languages, coding practices, software development techniques
    WEB = "web"  # Web development, browsers, frontend/backend frameworks, web standards
    AI_ML = "ai_ml"  # Artificial intelligence, machine learning, data science
    SECURITY = "security"  # Cybersecurity, privacy, vulnerabilities, authentication
    BUSINESS = "business"  # Startups, companies, funding, acquisitions, industry news
    CAREER = "career"  # Job seeking, workplace discussions, professional development
    SCIENCE = "science"  # Research, space exploration, physics, biology, academic papers
    TOOLS = "tools"  # Development tools, utilities, software applications
    CULTURE = "culture"  # Tech industry trends, social impact, community issues
    TECH_NEWS = "tech_news"  # General technology news and updates


class HackerNewsClassification(BaseModel):
    explanation: constr(min_length=40)
    label: HackerNewsLabel

## Prompt


In [5]:
def format_text_as_prompt(title: str) -> str:
    categories = """
    1. dev: Programming languages, coding practices, software development techniques  
    2. web: Web development, browsers, frontend/backend frameworks, web standards  
    3. ai_ml: Artificial intelligence, machine learning, data science  
    4. security: Cybersecurity, privacy, vulnerabilities, authentication  
    5. business: Startups, companies, funding, acquisitions, industry news  
    6. career: Job seeking, workplace discussions, professional development  
    7. science: Research, space exploration, physics, biology, academic papers  
    8. tools: Development tools, utilities, software applications  
    9. culture: Tech industry trends, social impact, community issues  
    10. tech_news: General technology news and updates  
    """

    return f"""Look at the title for the following HackerNews post. Assess what category this post belongs to.

Title: "{title}"

Your role is to classify this HackerNews post title into exactly one category. You should choose out of the following categories:

Categories: {categories}

Return your reasoning and the label you've chosen as a JSON object like this:
```json
{{"explanation": "2-3 clear sentences explaining why this specific category is the best fit",
    "label": "dev" | "web" | "ai_ml" | "security" | "business" | "career" | "science" | "tools" | "culture" | "tech_news"
}}
```
"""

## Starting the server with LM Studio


https://huggingface.co/lmstudio-community/DeepSeek-R1-Distill-Qwen-7B-GGUF


In [6]:
from openai import OpenAI

client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
client.models.list()

SyncPage[Model](data=[Model(id='deepseek-r1-distill-qwen-7b@q8_0', created=None, object='model', owned_by='organization_owner'), Model(id='deepseek-r1-distill-qwen-7b@q6_k', created=None, object='model', owned_by='organization_owner'), Model(id='deepseek-r1-distill-llama-8b', created=None, object='model', owned_by='organization_owner'), Model(id='text-embedding-nomic-embed-text-v1.5', created=None, object='model', owned_by='organization_owner')], object='list')

## Generating Labels


In [7]:
messages = [
    {"role": "user", "content": format_text_as_prompt(examples[0])},
]


response = client.beta.chat.completions.parse(
    model="deepseek-r1-distill-qwen-7b",
    messages=messages,
    temperature=0.7,
    response_format=HackerNewsClassification,
)

In [8]:
HackerNewsClassification.model_validate_json(response.choices[0].message.content)

HackerNewsClassification(explanation="... explanation ... | label: ... }<think>Alright, so I've got this task here where I need to classify a HackerNews post based on its title. The title is ", label=<HackerNewsLabel.DEV: 'dev'>)

In [9]:
def predict_label(
    title: str, model: str = "deepseek-r1-distill-qwen-1.5b", client=client
) -> HackerNewsClassification | None:
    try:
        prompt = format_text_as_prompt(title)
        messages = [
            {"role": "user", "content": prompt},
        ]
        response = client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            temperature=0.3,
            response_format=HackerNewsClassification,
        )
        return HackerNewsClassification.model_validate_json(
            response.choices[0].message.content
        )
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

In [10]:
from rich import print as rich_print

structured_results = []
for example in examples:
    prediction = predict_label(example)
    structured_results.append(prediction)
    rich_print(example)
    rich_print(prediction)
    rich_print("---")

## Room to Think


In [11]:
def predict_label_without_structured(
    title: str, model: str = "deepseek-r1-distill-qwen-1.5b", client=client
) -> str:
    try:
        prompt = format_text_as_prompt(title)
        messages = [
            {"role": "user", "content": prompt},
        ]
        response = client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            temperature=0.7,
        )
        return response.choices[0].message.content

    except Exception as e:
        print(f"Error occurred: {e}")
        return None

In [12]:
from rich import print as rich_print

results = []
for example in examples:
    prediction = predict_label_without_structured(example)
    results.append(prediction)
    rich_print(example)
    rich_print(prediction)
    rich_print("---")

In [13]:
import contextlib
import re
import json

JSON_PATTERN = re.compile(r"```json\n(.*?)```", re.DOTALL)
DIRECT_JSON_PATTERN = re.compile(r"\{[^}]*\}", re.DOTALL)


def extract_json_from_text(text: str) -> tuple[str, dict | None]:
    if match := JSON_PATTERN.search(text):
        json_results = match.group(1)
        with contextlib.suppress(json.JSONDecodeError):
            return text, json.loads(json_results)
    if match := DIRECT_JSON_PATTERN.search(text):
        json_text = match.group(0)
        with contextlib.suppress(json.JSONDecodeError):
            return text, json.loads(json_text)
    return text, None


extract_json_from_text(results[0])

('<think>\nAlright, I need to figure out which category this Hacker News post belongs to based on the title: "Steve Jobs has passed away." The available categories are dev, web, ai_ml, security, business, career, science, tools, culture, and tech_news.\n\nFirst, I\'ll read through each category to see how they fit. \n\n- **dev**: This is for programming topics like languages or practices.\n- **web**: Related to web development or frameworks.\n- **ai_ml**: Artificial intelligence and machine learning.\n- **security**: Cybersecurity stuff.\n- **business**: Startups, funding, etc.\n- **career**: Job-related discussions.\n- **science**: Research areas like physics or biology.\n- **tools**: Development tools or software applications.\n- **culture**: Tech trends or community issues.\n- **tech_news**: General tech updates.\n\nThe title is about Steve Jobs passing away. Steve Jobs was a significant figure in the tech industry, particularly known for Apple. However, the title itself doesn\'t di

## Generate dataset

In [44]:
# Filter DataFrame
filtered_df = df.filter(
    (pl.col("score") >= 300) & ~pl.col("title").str.contains("Show HN|Ask HN") & (pl.col("title").str.len_chars() >= 10)
)

# Get stats on filtered title lengths
filtered_title_lengths = filtered_df["title"].str.len_chars()
filtered_title_length_stats = filtered_title_lengths.describe()
print(f"Filtered title length stats:\n{filtered_title_length_stats}")

# Get filtered score distribution
filtered_score_stats = filtered_df["score"].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
print(f"\nFiltered score distribution:\n{filtered_score_stats}")

# Write filtered DataFrame to Parquet file
filtered_df = filtered_df.sample(n=4000, seed=42)
filtered_df.write_parquet("../data/hackernews_filtered.parquet")

Filtered title length stats:
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ value     │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 4891.0    │
│ null_count ┆ 0.0       │
│ mean       ┆ 46.191576 │
│ std        ┆ 19.263831 │
│ min        ┆ 10.0      │
│ 25%        ┆ 30.0      │
│ 50%        ┆ 46.0      │
│ 75%        ┆ 62.0      │
│ max        ┆ 86.0      │
└────────────┴───────────┘

Filtered score distribution:
shape: (12, 2)
┌────────────┬────────────┐
│ statistic  ┆ value      │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 4891.0     │
│ null_count ┆ 0.0        │
│ mean       ┆ 775.144142 │
│ std        ┆ 351.60203  │
│ min        ┆ 500.0      │
│ …          ┆ …          │
│ 75%        ┆ 842.0      │
│ 90%        ┆ 1170.0     │
│ 95%        ┆ 1423.0     │
│ 99%        ┆ 2228.0     │
│ max        ┆ 5771.0     │
└────────────┴────────────┘
