### Import Required Packages

In [1]:
import os
from huggingface_hub import InferenceClient
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### Initialize the Inference Client

In [2]:
# Shared Hugging Face inference client, routed through the "nscale" provider.
# The API token is read from the HF_TOKEN environment variable; if the
# variable is not set, the os.environ lookup raises KeyError right here.
client = InferenceClient(api_key=os.environ["HF_TOKEN"], provider="nscale")

### Few-shot Examples

In [3]:
# Prompt preamble: the task description, the four allowed labels
# (Ad, Rant, Irrelevant, Valid) and one worked example per label.
# create_batch_prompt() places this text at the start of every batch prompt.
FEW_SHOT_EXAMPLES = """
You are a system that classifies Google location reviews into one of four categories:
- Ad: Promotional or advertisement content.
- Rant: Angry or exaggerated complaints, often with excessive punctuation or all-caps.
- Irrelevant: Not related to the location being reviewed.
- Valid: A genuine and relevant review about the location.

Examples:
Review: "Best pizza in town! Fresh ingredients and great service."
Label: Valid

Review: "BUY ONE GET ONE FREE! Come to my shop now, limited offer!"
Label: Ad

Review: "THIS PLACE IS THE WORST!!! NEVER COMING BACK. HORRIBLE SERVICE!!!!!"
Label: Rant

Review: "I think the government is doing a terrible job with taxes."
Label: Irrelevant
"""

### Batch Classification Prompt

In [4]:
def create_batch_prompt(reviews):
    """Build the classification prompt for one batch of reviews.

    The prompt is FEW_SHOT_EXAMPLES, followed by the reviews numbered from 1
    ("Review 1: ...", "Review 2: ..."), followed by the expected output format.

    Args:
        reviews: Iterable of review texts. Each item is converted with str().

    Returns:
        The complete prompt as a single string.
    """
    numbered = []
    for position, review in enumerate(reviews, start=1):
        # Collapse newlines and runs of whitespace so every review occupies
        # exactly one "Review N:" line; a multi-line review would otherwise
        # blur the boundary between consecutive reviews.
        single_line = " ".join(str(review).split())
        numbered.append(f"Review {position}: {single_line}")
    reviews_list = "\n".join(numbered)

    # Assembled from column-0 pieces on purpose: an indented triple-quoted
    # f-string would leak the function's indentation into the prompt (and
    # into the "Output format" lines the reply is expected to imitate).
    return (
        f"{FEW_SHOT_EXAMPLES}\n"
        "Now classify the following reviews:\n"
        f"{reviews_list}\n"
        "\n"
        "Output format:\n"
        "Review 1: <Label>\n"
        "Review 2: <Label>\n"
        "...\n"
    )

### Classify Reviews in Batches

In [5]:
def classify_batch(reviews):
    """Classify one batch of reviews with the chat model.

    Builds the batch prompt with create_batch_prompt(), sends it as a single
    user message through the module-level ``client`` with temperature 0, and
    returns the content of the first returned choice.

    Args:
        reviews: The review texts for this batch.

    Returns:
        The raw reply content of the first choice, unparsed.
    """
    chat_messages = [{"role": "user", "content": create_batch_prompt(reviews)}]
    completion = client.chat.completions.create(
        model="Qwen/Qwen3-4B-Instruct-2507",
        messages=chat_messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

### Label Combined Dataset

In [6]:
def label_reviews(df, text_col="review_text", batch_size=100, max_reviews=1000):
    labeled = []
    reviews = df[text_col].astype(str).tolist()[:max_reviews]
    
    for i in range(0, len(reviews), batch_size):
        batch = reviews[i:i+batch_size]
        output = classify_batch(batch)

        # Parse output: expects "Review 1: Valid" style
        for line in output.splitlines():
            if line.strip() and line.startswith("Review"):
                try:
                    idx, label = line.split(":", 1)
                    labeled.append(label.strip())
                except:
                    labeled.append("Unknown")
        
        print(f"Processed {i+len(batch)} / {len(reviews)}")
    
    df = df.iloc[:max_reviews].copy()
    df["label"] = labeled
    return df

### Label Subset of Cleaned Combined Data (first 1000 reviews)

In [None]:
# Load the cleaned combined reviews, label them with label_reviews() (using its
# default limits) and write the labelled frame to CSV.
data_path = "../data/clean/cleaned_combined_reviews.csv"
output_path = "../data/label/qwen_labelled_combined_reviews.csv"

df = pd.read_csv(data_path)
cleaned_df = df.copy()  # separate copy of the frame as loaded

# Create the output folder before the labelling run: discovering a missing
# directory only at to_csv() time would throw away every classification call.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

labeled_df = label_reviews(df)
labeled_df.to_csv(output_path, index=False)

Processed 100 / 1000
Processed 200 / 1000
Processed 300 / 1000
Processed 400 / 1000
Processed 500 / 1000
Processed 600 / 1000
Processed 700 / 1000
Processed 800 / 1000
Processed 900 / 1000
Processed 1000 / 1000


In [38]:
# Preview the first ten labelled rows to sanity-check the assigned labels.
labeled_df.head(10)

Unnamed: 0,user_name,review_text,rating,label
0,Amber Thibeault,Andrea is amazing. Our dog loves her and she a...,5,Irrelevant
1,Esther,Andrea does a wonderful job with our wild Pr...,5,Irrelevant
2,Bob Barrett,Never called back,1,Rant
3,Luz Quiles,They don't answer the phones,3,Rant
4,Tim Sanderson,Limited information on the website,3,Irrelevant
5,Ellen Nastir,Leigh-Ann is an incredibly creative facilitato...,5,Valid
6,Jinnie Lee Schmid,Leigh Ann Rodgers is THE undisputed expert in ...,5,Valid
7,Wanda Walker,Leigh Ann Rodgers is a great collaborator and ...,5,Valid
8,Heather Clarke,I really appreciate all the wisdom and experie...,5,Valid
9,Cheryle Maurer,Leigh Ann’s masterful facilitation and engagem...,5,Valid
