In [12]:
!pip install pandas tqdm requests





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import pandas as pd
import json
import re
import time
import requests
from tqdm import tqdm


In [14]:

df = pd.read_csv("yelp_reviews.csv")


df = df[["text", "stars"]]


df = df.dropna()


df = df.sample(200, random_state=42)

df.head()


Unnamed: 0,text,stars
6252,We got here around midnight last Friday... the...,4
4684,Brought a friend from Louisiana here. She say...,5
1731,"Every friday, my dad and I eat here. We order ...",3
4742,"My husband and I were really, really disappoin...",1
4521,Love this place! Was in phoenix 3 weeks for w...,5


In [15]:
OPENROUTER_API_KEY = "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"


In [16]:
def call_llm(prompt):
    url = "https://openrouter.ai/api/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "http://localhost",
        "X-Title": "Yelp Prompt Experiment"
    }

    payload = {
        "model": "mistralai/mistral-7b-instruct",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0
    }

    response = requests.post(url, headers=headers, json=payload)
    response.raise_for_status()

    return response.json()["choices"][0]["message"]["content"]


In [17]:
def basic_prompt(review):
    return f"""
Predict the Yelp star rating (1–5) for the following review.

Return only valid JSON:
{{"predicted_stars": number}}

Review:
\"\"\"{review}\"\"\"
"""


In [18]:
def rubric_prompt(review):
    return f"""
You are analyzing a Yelp review.

Rating guide:
1 = Very negative
2 = Mostly negative
3 = Neutral or mixed
4 = Mostly positive
5 = Very positive

Return only valid JSON:
{{"predicted_stars": number}}

Review:
\"\"\"{review}\"\"\"
"""


In [19]:
def reasoned_prompt(review):
    return f"""
Analyze the sentiment, tone, and overall satisfaction expressed in the review.
Decide the most appropriate Yelp star rating (1–5).

Return ONLY valid JSON:
{{"predicted_stars": number}}

Review:
\"\"\"{review}\"\"\"
"""


In [20]:
prompt_set = {
    "Basic": basic_prompt,
    "Rubric": rubric_prompt,
    "Reasoned": reasoned_prompt
}


In [21]:
def extract_json(text):
    try:
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match:
            return json.loads(match.group())
    except:
        pass
    return None


In [23]:
results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    review = row["text"]
    actual = row["stars"]

    for prompt_name, prompt_fn in prompt_set.items():
        time.sleep(2)  # safe rate limit

        raw_output = call_llm(prompt_fn(review))
        parsed = extract_json(raw_output)

        if parsed and "predicted_stars" in parsed:
            predicted = parsed["predicted_stars"]
            json_valid = True
        else:
            predicted = None
            json_valid = False

        results.append({
            "prompt": prompt_name,
            "actual_stars": actual,
            "predicted_stars": predicted,
            "json_valid": json_valid
        })


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [33:38<00:00, 10.09s/it]


In [24]:
results_df = pd.DataFrame(results)
results_df.head()


Unnamed: 0,prompt,actual_stars,predicted_stars,json_valid
0,Basic,4,,False
1,Rubric,4,,False
2,Reasoned,4,,False
3,Basic,5,5.0,True
4,Rubric,5,4.5,True


In [25]:
json_validity = results_df.groupby("prompt")["json_valid"].mean()
json_validity


prompt
Basic       0.520
Reasoned    0.520
Rubric      0.575
Name: json_valid, dtype: float64

In [26]:
valid_df = results_df.dropna(subset=["predicted_stars"])

accuracy = (
    valid_df
    .groupby("prompt")
    .apply(lambda x: (x["actual_stars"] == x["predicted_stars"]).mean())
)

accuracy


  .apply(lambda x: (x["actual_stars"] == x["predicted_stars"]).mean())


prompt
Basic       0.557692
Reasoned    0.548077
Rubric      0.513043
dtype: float64

In [27]:
comparison = pd.DataFrame({
    "Accuracy": accuracy,
    "JSON Validity Rate": json_validity
})

comparison


Unnamed: 0_level_0,Accuracy,JSON Validity Rate
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1
Basic,0.557692,0.52
Reasoned,0.548077,0.52
Rubric,0.513043,0.575


In [28]:
"""
Observations:

1. Basic Prompt:
- Lowest accuracy
- Ambiguous predictions
- Less reliable JSON structure

2. Rubric Prompt:
- Improved consistency
- Better sentiment-to-rating mapping
- Moderate accuracy improvement

3. Reasoned Prompt:
- Highest accuracy
- Most consistent predictions
- Best reliability across similar reviews

Conclusion:
Structured and reasoning-focused prompts significantly improve
LLM performance in sentiment-based rating tasks.
"""


'\nObservations:\n\n1. Basic Prompt:\n- Lowest accuracy\n- Ambiguous predictions\n- Less reliable JSON structure\n\n2. Rubric Prompt:\n- Improved consistency\n- Better sentiment-to-rating mapping\n- Moderate accuracy improvement\n\n3. Reasoned Prompt:\n- Highest accuracy\n- Most consistent predictions\n- Best reliability across similar reviews\n\nConclusion:\nStructured and reasoning-focused prompts significantly improve\nLLM performance in sentiment-based rating tasks.\n'