In [1]:
import json
import random
import re
from typing import Dict, List, Optional

import pandas as pd
from tqdm import tqdm


In [5]:
# NOTE: absolute, machine-specific location of the Yelp CSV export;
# point this at your own copy before running the notebook elsewhere.
YELP_CSV_PATH = r"C:\Users\adity\CustomerPulse\data\yelp_csv_2.csv"

df = pd.read_csv(YELP_CSV_PATH)
df

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5
...,...,...
9995,First visit...Had lunch here today - used my G...,3
9996,Should be called house of deliciousness!\n\nI ...,4
9997,I recently visited Olive and Ivy for business ...,4
9998,My nephew just moved to Scottsdale recently so...,2


In [7]:
# Rename the raw columns so it is obvious which one holds the review body
# and which one holds the ground-truth rating.
column_renames = {"text": "review_text", "stars": "actual_stars"}
df = df.rename(columns=column_renames)

In [11]:
# Keep only the two columns the evaluation needs (this drops the CSV's index column).
kept_columns = ["review_text", "actual_stars"]
df = df[kept_columns]

In [15]:
# Class balance of the ground-truth ratings, most frequent first.
star_counts = df["actual_stars"].value_counts()
print(star_counts)


actual_stars
4    3526
5    3337
3    1461
2     927
1     749
Name: count, dtype: int64


In [17]:
SAMPLE_SIZE = 200
N_STAR_CLASSES = 5   # ratings run from 1 to 5
SAMPLE_SEED = 42

# Equal quota per rating so rare classes (1-2 stars) are not drowned out.
per_class = SAMPLE_SIZE // N_STAR_CLASSES

# Iterate the groups directly instead of using DataFrameGroupBy.apply: newer
# pandas versions warn when the applied function operates on the grouping
# column, and iterating keeps "actual_stars" in every sampled frame.
class_samples = [
    group.sample(min(len(group), per_class), random_state=SAMPLE_SEED)
    for _, group in df.groupby("actual_stars")
]

# Shuffle with frac=1 rather than a fixed n: a fixed n raises when some class
# has fewer than `per_class` rows, while frac=1 shuffles whatever was drawn.
# When every class fills its quota the result is the same rows in the same order.
sampled_df = (
    pd.concat(class_samples)
      .sample(frac=1, random_state=SAMPLE_SEED)
      .reset_index(drop=True)
)

print(sampled_df.shape)
sampled_df.head()


(200, 2)


  .apply(lambda x: x.sample(min(len(x), SAMPLE_SIZE // 5), random_state=42))


Unnamed: 0,review_text,actual_stars
0,"I like to call this ""My Basha's"". On the po...",3
1,This place was the worst place to EVER live! I...,1
2,I dont understand how a second restaurant was ...,1
3,"Awesome Donuts! \n\nThe only thing is, I was e...",4
4,Our visit to Scottsdale brought us into Founta...,4


In [19]:
def prompt_v1(review_text: str) -> str:
    """Build the direct prompt: task statement, JSON format, then the review.

    Args:
        review_text: The review body; it is inserted between triple
            double-quotes at the end of the prompt.

    Returns:
        The full prompt text, starting and ending with a newline.
    """
    # Doubled braces are literal braces once str.format has run.
    template_lines = [
        "",
        "You are given a Yelp customer review.",
        "",
        "Task:",
        "Predict the star rating (1 to 5) that the reviewer would give.",
        "",
        "Return your answer strictly in the following JSON format:",
        "{{",
        '  "predicted_stars": <integer between 1 and 5>,',
        '  "explanation": "<brief reasoning>"',
        "}}",
        "",
        "Review:",
        '"""{review_text}"""',
        "",
    ]
    return "\n".join(template_lines).format(review_text=review_text)


In [21]:
def prompt_v2(review_text: str) -> str:
    """Build the rubric-based prompt: persona, per-star rubric, JSON format, review.

    Args:
        review_text: The review body; it is inserted between triple
            double-quotes at the end of the prompt.

    Returns:
        The full prompt text, starting and ending with a newline.
    """
    rubric = "\n".join([
        "- 1 star: Extremely negative, strong complaints, urges others to avoid",
        "- 2 stars: Mostly negative, some positives but disappointing overall",
        "- 3 stars: Mixed or neutral experience, average or inconsistent",
        "- 4 stars: Mostly positive, minor complaints",
        "- 5 stars: Extremely positive, enthusiastic praise",
    ])
    json_shape = "\n".join([
        "{",
        '  "predicted_stars": <integer>,',
        '  "explanation": "<1–2 sentence justification>"',
        "}",
    ])
    quoted_review = f'"""{review_text}"""'

    return (
        "\nYou are an expert sentiment analyst for Yelp reviews.\n\n"
        "Use the following rubric:\n"
        + rubric
        + "\n\nReturn ONLY valid JSON:\n"
        + json_shape
        + "\n\nReview:\n"
        + quoted_review
        + "\n"
    )


In [23]:
def prompt_v3(review_text: str) -> str:
    """Build the reasoning-guided prompt: three analysis steps, JSON-only output.

    Args:
        review_text: The review body; it is inserted between triple
            double-quotes at the end of the prompt.

    Returns:
        The full prompt text, starting and ending with a newline.
    """
    # Each inner list is one paragraph; paragraphs are separated by a blank line.
    sections = [
        ["You are a Yelp rating classifier."],
        [
            "Step 1: Analyze sentiment, tone, and complaints/praise.",
            "Step 2: Decide the best star rating from 1 to 5.",
            "Step 3: Summarize reasoning briefly.",
        ],
        [
            "IMPORTANT:",
            "- Do NOT show step-by-step reasoning.",
            "- Output ONLY valid JSON.",
        ],
        [
            "Format:",
            "{",
            '  "predicted_stars": <integer between 1 and 5>,',
            '  "explanation": "<concise reasoning>"',
            "}",
        ],
        ["Review:", f'"""{review_text}"""'],
    ]
    body = "\n\n".join("\n".join(section) for section in sections)
    return "\n" + body + "\n"


In [25]:
import requests
import json

OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL_NAME = "llama3.2"


def call_llm(prompt: str) -> str:
    """POST one prompt to OLLAMA_URL and return the generated text.

    Args:
        prompt: The complete prompt to send to MODEL_NAME.

    Returns:
        The value of the "response" field in the endpoint's JSON reply.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.RequestException: On connection problems or when the
            120-second timeout elapses.
    """
    sampling_options = {"temperature": 0.2, "top_p": 0.9}
    request_body = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,  # ask for a single JSON reply instead of a token stream
        "options": sampling_options,
    }

    reply = requests.post(OLLAMA_URL, json=request_body, timeout=120)
    reply.raise_for_status()
    return reply.json()["response"]


In [27]:
# Smoke test: ask the model to echo a fixed JSON object to confirm the
# endpoint is reachable before starting the long evaluation runs.
test_prompt = (
    "\n"
    "Return ONLY JSON:\n"
    "{\n"
    '  "predicted_stars": 5,\n'
    '  "explanation": "Test"\n'
    "}\n"
)

smoke_reply = call_llm(test_prompt)
print(smoke_reply)


{"predicted_stars":5,"explanation":"Test"}


In [29]:
def parse_json(response: str) -> Optional[Dict]:
    """Extract the first decodable JSON object embedded in a model reply.

    The reply is scanned left to right; at every "{" a JSON decode is
    attempted and the first object that decodes is returned. Unlike a greedy
    first-brace-to-last-brace match, this tolerates stray braces in the prose
    before or after the JSON. Replies the greedy match handled still yield
    the same object.

    Caveat: if the outermost object is malformed (for example truncated),
    a well-formed object nested inside it may be returned instead.

    Args:
        response: Raw text returned by the model.

    Returns:
        The decoded object as a dict, or None when `response` is not a
        string or contains no decodable JSON object.
    """
    if not isinstance(response, str):
        return None

    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the object, ignoring trailing text.
            parsed, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            start = response.find("{", start + 1)
            continue
        return parsed
    return None


In [31]:
def _coerce_stars(value):
    """Normalize a model-supplied rating to an int, or None if it is not a whole number."""
    if isinstance(value, bool):
        # bool is a subclass of int; True/False is not a rating.
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        return int(value) if value.is_integer() else None
    if isinstance(value, str):
        try:
            return int(value.strip())
        except ValueError:
            return None
    return None


def evaluate_prompt(prompt_fn, df: pd.DataFrame) -> pd.DataFrame:
    """Run one prompt variant over every review and collect the predictions.

    Args:
        prompt_fn: Callable that turns a review text into a prompt string.
        df: Frame with "review_text" and "actual_stars" columns.

    Returns:
        One row per input row with the columns:
        - "actual_stars": the ground-truth rating.
        - "predicted_stars": the model's rating as an int, or None when it is
          missing or not a whole number. Ratings such as "4" or 4.0 are
          normalized so they compare equal to the integer label.
        - "json_valid": whether a JSON object could be parsed from the reply.
        - "request_error": description of the exception raised by the LLM
          call, or None. Lets request failures be told apart from replies
          that simply were not valid JSON.
    """
    columns = ["actual_stars", "predicted_stars", "json_valid", "request_error"]
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        prompt = prompt_fn(row["review_text"])

        parsed = None
        request_error = None
        try:
            response = call_llm(prompt)
        except Exception as exc:
            # Best effort: one failed request must not abort a long run,
            # but keep the reason instead of silently dropping it.
            request_error = f"{type(exc).__name__}: {exc}"
        else:
            parsed = parse_json(response)

        predicted = (
            _coerce_stars(parsed.get("predicted_stars"))
            if isinstance(parsed, dict)
            else None
        )

        results.append({
            "actual_stars": row["actual_stars"],
            "predicted_stars": predicted,
            "json_valid": parsed is not None,
            "request_error": request_error,
        })

    # Explicit columns keep the schema stable even when `df` is empty.
    return pd.DataFrame(results, columns=columns)


In [33]:
def compute_metrics(results: pd.DataFrame) -> Dict:
    """Summarize one evaluation run as percentages.

    Args:
        results: Frame with "actual_stars", "predicted_stars" and a boolean
            "json_valid" column.

    Returns:
        A dict with:
        - "accuracy": share of rows with valid JSON whose prediction equals
          the actual rating (0 when no row has valid JSON).
        - "json_validity": share of all rows that have valid JSON.
        Both are percentages rounded to two decimals.
    """
    is_valid = results["json_valid"]
    parsed_rows = results[is_valid]

    # Accuracy is measured only over rows where a prediction could be parsed.
    if len(parsed_rows) > 0:
        hits = parsed_rows["actual_stars"] == parsed_rows["predicted_stars"]
        accuracy = hits.mean()
    else:
        accuracy = 0

    validity_rate = is_valid.mean()

    return {
        "accuracy": round(accuracy * 100, 2),
        "json_validity": round(validity_rate * 100, 2),
    }


In [37]:
# Score the same `sampled_df` with each prompt variant so the runs are comparable.
# Every call makes one blocking LLM request per row, so this cell is slow: the
# recorded progress bars show roughly 33, 13 and 12 minutes for the three runs.
# The runs are kept as separate statements so finished results stay assigned
# if a later run fails.
results_v1 = evaluate_prompt(prompt_v1, sampled_df)
results_v2 = evaluate_prompt(prompt_v2, sampled_df)
results_v3 = evaluate_prompt(prompt_v3, sampled_df)

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [32:33<00:00,  9.77s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [13:02<00:00,  3.91s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [12:20<00:00,  3.70s/it]


In [38]:
# One row per prompt variant: its label plus the accuracy and JSON-validity
# percentages computed from that variant's results.
labelled_runs = [
    ("Direct", results_v1),
    ("Rubric-Based", results_v2),
    ("Reasoning-Guided", results_v3),
]

comparison = pd.DataFrame(
    [{"Prompt": label, **compute_metrics(run)} for label, run in labelled_runs]
)

comparison


Unnamed: 0,Prompt,accuracy,json_validity
0,Direct,51.06,23.5
1,Rubric-Based,56.78,99.5
2,Reasoning-Guided,56.99,93.0
