In [1]:
# Install third-party packages into the active kernel's environment.
# NOTE(review): versions are unpinned, so a later re-run may resolve different releases.
%pip install pandas numpy matplotlib seaborn httpx tqdm scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [47]:
import ast
import json
import os
import warnings

import httpx
import pandas as pd
from tqdm import tqdm

In [None]:
# Load the cleaned review dataset (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('../data/cleaned_reviews_noempty.csv')
df.head()

Unnamed: 0,store_name,rating,review,reviewer_name
0,49 SEATS,5,wowowow great vibes and food!! super eccentric...,Hannah Eva
1,49 SEATS,4,We had the classic pasta and fish n chips with...,S dssp
2,49 SEATS,5,Its an amazing restaurant with good vibes,Sanjith
3,49 SEATS,5,great atmosphere,Vivian L
4,49 SEATS,5,Great atmosphere and vibes!,Jayden


In [49]:
def classify_reviews_batch_long_prompt(
    df,
    api_key,
    model_name="google/gemma-3-12b-it:free",
    output_path="multitask_predictions_batch.csv",
    timeout=200,
):
    """Classify a batch of reviews with a single few-shot chat-completion request.

    Every row of ``df`` (all columns) is serialised to JSON and embedded in one
    prompt that asks the model to label each review as Ad, Irr, Rant or Val and
    to answer with a JSON list of one-hot dictionaries. The request is posted to
    the OpenRouter chat-completions endpoint.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews to classify; each row becomes one record in the prompt.
    api_key : str
        Bearer token sent in the ``Authorization`` header.
    model_name : str, optional
        Model identifier placed in the request payload.
    output_path : str or None, optional
        Where the predictions are written as CSV (without the index). Pass
        ``None`` to skip writing the file.
    timeout : float, optional
        Timeout, in seconds, passed to the HTTP request.

    Returns
    -------
    pandas.DataFrame
        One row per dictionary returned by the model. The prompt requests the
        keys ``review, val_pred, ad_pred, irrelevant_pred, rant_pred``, but the
        columns are whatever the model actually returned. For an empty ``df``
        an empty frame with those five columns is returned and no request is made.

    Raises
    ------
    httpx.HTTPStatusError
        If the endpoint answers with an error status (``raise_for_status``).
    json.JSONDecodeError
        If no JSON list can be extracted from the model reply.

    Warns
    -----
    RuntimeWarning
        If the number of predictions differs from the number of reviews sent,
        e.g. because the model truncated its answer.
    """
    expected_columns = ["review", "val_pred", "ad_pred", "irrelevant_pred", "rant_pred"]

    # convert dataframe to list of dicts
    reviews_list = df.to_dict(orient="records")

    # Nothing to classify: avoid spending an API call on an empty batch.
    if not reviews_list:
        multitask_df = pd.DataFrame(columns=expected_columns)
        if output_path is not None:
            multitask_df.to_csv(output_path, index=False)
        return multitask_df

    headers = {
        "Authorization": f"Bearer {api_key}",
        "X-Title": "Batch Review Classification",
        "Content-Type": "application/json"
    }

    # build batch prompt; default=str keeps non-JSON-native cell values
    # (e.g. timestamps) from aborting serialisation
    prompt = f"""
    You are a review moderator for Google location reviews.
    Classify each review below into one category: Ad, Irr, Rant, or Val.

    - Ad: Contains promotions, ads, or links (e.g., "Best pizza! Visit www.pizzapromo.com").
    - Irr: Irrelevant to the location (e.g., "I love my new phone, but this place is too noisy.").
    - Rant: Complaints without evidence of a visit (e.g., "Never been here, but I heard it’s terrible.").
    - Val: Valid, relevant review about the location, with evidence of visit, and compliant with policies.
    
    Examples:

    Review: "Amazing service! Come to our store for 20% off via link: promo.com"
    Metadata: "Rating: 5 | Store: 49 SEATS | Reviewer: Hannah Eva"
    Category: Ad

    Review: "Check out our new cafe opening next week, free samples available!"
    Metadata: "Rating: 5 | Store: 49 SEATS | Reviewer: Tom Tan"
    Category: Ad

    Review: "This restaurant is the worst, hate the food without even trying it."
    Metadata: "Rating: 1 | Store: 49 SEATS | Reviewer: Jane Lim"
    Category: Rant

    Review: "Never been here, but someone told me the service is bad."
    Metadata: "Rating: 2 | Store: 49 SEATS | Reviewer: Alex Foo"
    Category: Rant

    Review: "Great food, friendly staff, visited last week."
    Metadata: "Rating: 4 | Store: 49 SEATS | Reviewer: Emma Wong"
    Category: Val

    Review: "Loved the ambiance and coffee, came here for brunch on Saturday."
    Metadata: "Rating: 5 | Store: 49 SEATS | Reviewer: Yeong KX"
    Category: Val

    Review: "Bought a new car yesterday, but this café is okay."
    Metadata: "Rating: 3 | Store: 49 SEATS | Reviewer: Kai Lee"
    Category: Irr

    Review: "I just got a new phone, nothing to do with this restaurant."
    Metadata: "Rating: 5 | Store: 49 SEATS | Reviewer: Hannah Eva"
    Category: Irr
    For each review, provide 1 for the predicted category, 0 for others.
    
    Return ONLY a JSON list of dictionaries, each with keys:
    review, val_pred, ad_pred, irrelevant_pred, rant_pred.

    Reviews to classify:
    {json.dumps(reviews_list, indent=2, default=str)}
    """

    data = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    }

    with httpx.Client() as client:
        resp = client.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=timeout
        )
        resp.raise_for_status()
        result = resp.json()
        output = result["choices"][0]["message"]["content"]

    preds = _parse_prediction_list(output)

    # The model may silently drop or truncate rows; surface that instead of
    # returning a shorter frame without comment.
    if len(preds) != len(reviews_list):
        warnings.warn(
            f"Model returned {len(preds)} predictions for {len(reviews_list)} reviews; "
            "the reply may have been truncated. Consider a smaller batch.",
            RuntimeWarning,
            stacklevel=2,
        )

    multitask_df = pd.DataFrame(preds)
    if output_path is not None:
        multitask_df.to_csv(output_path, index=False)
    return multitask_df


def _parse_prediction_list(output):
    """Return the JSON list contained in the raw model reply ``output``."""
    if not isinstance(output, str):
        raise json.JSONDecodeError("Model reply contained no text content", "", 0)

    try:
        preds = json.loads(output)
    except json.JSONDecodeError:
        preds = None
    if isinstance(preds, list):
        return preds

    # Fallback: the list is wrapped in extra text (markdown fences, prose, or an
    # enclosing object) - take the outermost [...] span.
    start = output.find("[")
    end = output.rfind("]")
    if start == -1 or end < start:
        raise json.JSONDecodeError("No JSON list found in model reply", output, 0)
    return json.loads(output[start:end + 1])

In [None]:
# Read the OpenRouter API key from the environment so the secret never lives in
# the notebook; falls back to an empty string when the variable is unset.
api = os.environ.get("OPENROUTER_API_KEY", "")

In [38]:
# Run the long-prompt classifier on the first 20 reviews only.
df1 = df.head(20)
pred1_longprompt = classify_reviews_batch_long_prompt(df=df1, api_key=api)

In [39]:
# Number of predictions returned by the model.
# NOTE(review): 20 reviews were submitted above but the recorded output is 12,
# so the model reply did not cover every review in that run.
len(pred1_longprompt)

12

In [40]:
# Display the long-prompt predictions for inspection.
pred1_longprompt

Unnamed: 0,review,val_pred,ad_pred,irrelevant_pred,rant_pred
0,wowowow great vibes and food!! super eccentric...,1,0,0,0
1,We had the classic pasta and fish n chips with...,1,0,0,0
2,Its an amazing restaurant with good vibes,1,0,0,0
3,great atmosphere,1,0,0,0
4,Great atmosphere and vibes!,1,0,0,0
5,Wonderful food and service!,1,0,0,0
6,"The atmosphere and service are excellent, enjo...",1,0,0,0
7,"Resturant price, hawker center quality (the bl...",0,0,0,1
8,Always enjoy my meal there. Take note it may b...,1,0,0,0
9,"Atmosphere is great, food is awesome",1,0,0,0


In [52]:
# Load the noisy review set and preview the first rows.
# NOTE(review): this file is read from the working directory, whereas the
# cleaned reviews are read from ../data - confirm the intended location.
df_noise = pd.read_csv('noisy_reviews.csv')
df_noise.head()

Unnamed: 0,store_name,rating,review,reviewer_name
0,49 Seats,3,This place is overhyped! I never went but hear...,John Doe
1,49 Seats,5,Didn't visit but sounds cool.,Liam Garcia
2,49 Seats,4,Didn't eat here but saw a billboard nearby. Fo...,Laura Evans
3,49 Seats,4,Today is a good day for sleeping.,Samuel Cook
4,Colony,3,Never went but my cousin said it's okay.,Michael Hall


In [None]:
# Split the first 50 noisy reviews into five consecutive batches of 10 rows.
df1_noise, df2_noise, df3_noise, df4_noise, df5_noise = (
    df_noise[offset:offset + 10] for offset in range(0, 50, 10)
)

In [54]:
# Classify the first noisy batch. The previous call targeted
# `classify_reviews_batch`, which is not defined in this notebook and fails on a
# fresh kernel; use the classifier that is defined above instead.
pred1_noise = classify_reviews_batch_long_prompt(df1_noise, api)

In [55]:
# Display the predictions for the first noisy batch.
pred1_noise

Unnamed: 0,review,val_pred,ad_pred,irrelevant_pred,rant_pred
0,This place is overhyped! I never went but hear...,0,0,0,1
1,Didn't visit but sounds cool.,0,0,0,1
2,Didn't eat here but saw a billboard nearby. Fo...,0,1,0,0
3,Today is a good day for sleeping.,0,0,1,0
4,Never went but my cousin said it's okay.,0,0,0,1
5,Didn't eat here but the vibe seems cool. Check...,0,1,0,0
6,Haven't been but seems fine. Follow my Instagr...,0,1,0,0
7,I prefer cloudy weather.,0,0,1,0
8,Why do people like this place? Never went but ...,0,0,0,1
9,Didn't visit but looks nice. Listen to my podc...,0,1,0,0


In [66]:
# Classify the fifth noisy batch. The previous call targeted
# `classify_reviews_batch`, which is not defined in this notebook and fails on a
# fresh kernel; use the classifier that is defined above instead.
pred5_noise = classify_reviews_batch_long_prompt(df5_noise, api)

In [67]:
# Display the predictions for the fifth noisy batch.
# NOTE(review): in the recorded output several rows have both ad_pred and
# rant_pred set to 1, so the labels are not strictly one-hot - verify before
# treating them as single-class predictions.
pred5_noise

Unnamed: 0,review,val_pred,ad_pred,irrelevant_pred,rant_pred
0,Didn't visit but my friend said it's meh. Shop...,0,1,0,1
1,Haven't been but seems okay. Buy my phone at h...,0,1,0,1
2,I love collecting rare coins.,0,0,1,0
3,"Not sure since I didn't eat there, but seems p...",0,0,1,0
4,Haven't been but I bet it's decent. Listen to ...,0,1,0,1
5,Didn't eat here but sounds nice. Buy my art at...,0,1,0,1
6,My favorite color is blue.,0,0,1,0
7,Haven't gone but probably bad.,0,0,0,1
8,Never tried it but looks interesting. Get my w...,0,1,0,1
9,I prefer tea over coffee.,0,0,1,0


In [53]:
def export_df_to_csv(df, filename):
    """Write ``df`` to ``filename`` in CSV format, leaving out the index column."""
    df.to_csv(path_or_buf=filename, index=False)

In [68]:
# Persist the fifth batch's predictions; the file is read back from disk below.
# NOTE(review): the matching exports for pred1..pred4 are not visible in this
# notebook - confirm those CSV files exist before re-running.
export_df_to_csv(pred5_noise, "pred5_noise.csv")

In [69]:
# Reload the five per-batch prediction files (pred1_noise.csv .. pred5_noise.csv)
# from the working directory, in batch order.
pred1_noise, pred2_noise, pred3_noise, pred4_noise, pred5_noise = (
    pd.read_csv(f"pred{batch_no}_noise.csv") for batch_no in range(1, 6)
)

In [70]:
# Concatenate the five per-batch noise prediction frames (pred1_noise..pred5_noise)
# into a single frame; ignore_index=True renumbers the rows from 0.
dfs = [
    pred1_noise, pred2_noise, pred3_noise, pred4_noise, pred5_noise,
]

noise_preds = pd.concat(dfs, ignore_index=True)
noise_preds

Unnamed: 0,review,val_pred,ad_pred,irrelevant_pred,rant_pred
0,This place is overhyped! I never went but hear...,0,0,0,1
1,Didn't visit but sounds cool.,0,0,0,1
2,Didn't eat here but saw a billboard nearby. Fo...,0,1,0,0
3,Today is a good day for sleeping.,0,0,1,0
4,Never went but my cousin said it's okay.,0,0,0,1
5,Didn't eat here but the vibe seems cool. Check...,0,1,0,0
6,Haven't been but seems fine. Follow my Instagr...,0,1,0,0
7,I prefer cloudy weather.,0,0,1,0
8,Why do people like this place? Never went but ...,0,0,0,1
9,Didn't visit but looks nice. Listen to my podc...,0,1,0,0


In [71]:
# Write the combined noise predictions to a single CSV file (index omitted).
export_df_to_csv(noise_preds, "noise_prediction.csv")