## Import the libs

In [8]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import subprocess


## Loading the data

In [9]:
df = pd.read_csv("data/train.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,label,text
0,0,4,dr. goldberg offers everything i look for in a...
1,1,1,"Unfortunately, the frustration of being Dr. Go..."
2,2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,3,Got a letter in the mail last week that said D...
4,4,0,I don't know what Dr. Goldberg was like before...


## Renaming the columns

In [10]:
df = df.rename(columns={
    "text": "review_text",
    "label": "actual_stars"
})

df = df[["review_text", "actual_stars"]]
df.head()


Unnamed: 0,review_text,actual_stars
0,dr. goldberg offers everything i look for in a...,4
1,"Unfortunately, the frustration of being Dr. Go...",1
2,Been going to Dr. Goldberg for over 10 years. ...,3
3,Got a letter in the mail last week that said D...,3
4,I don't know what Dr. Goldberg was like before...,0


## Call the model

In [12]:
def call_ollama(prompt, model="llama3.1:8b"):
    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt,
        text=True,
        capture_output=True
    )
    return result.stdout.strip()


In [18]:
def parse_json_response(text):
    try:
        return json.loads(text), True
    except json.JSONDecodeError:
        return None, False


## Prompt 1

In [15]:
BASELINE_PROMPT = """
You are given a Yelp restaurant review.
Predict the star rating from 1 to 5.

Review:
"{review_text}"

Return your answer as JSON with:
- predicted_stars
- explanation
"""


In [16]:
def run_baseline_prompt(review_text):
    prompt = BASELINE_PROMPT.format(review_text=review_text)
    return call_ollama(prompt)


In [19]:
baseline_results = []

for _, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    raw_output = run_baseline_prompt(row["review_text"])
    parsed_json, is_valid = parse_json_response(raw_output)

    predicted_stars = None
    explanation = None

    if is_valid:
        predicted_stars = parsed_json.get("predicted_stars")
        explanation = parsed_json.get("explanation")

    baseline_results.append({
        "review_text": row["review_text"],
        "actual_stars": row["actual_stars"],
        "predicted_stars": predicted_stars,
        "json_valid": is_valid,
        "explanation": explanation
    })


100%|█████████████████████████████████████████| 200/200 [34:17<00:00, 10.29s/it]


In [20]:
baseline_df = pd.DataFrame(baseline_results)

baseline_accuracy = (
    baseline_df["predicted_stars"] == baseline_df["actual_stars"]
).mean()

baseline_json_validity = baseline_df["json_valid"].mean()

baseline_accuracy, baseline_json_validity


(np.float64(0.01), np.float64(0.045))

## Prompt 2

In [24]:
PROMPT_V2 = """
You are a strict JSON-generating assistant.

Your task is to read a Yelp restaurant review and predict the star rating from 1 to 5.

Rules:
- You MUST return valid JSON only.
- Do NOT include any text outside the JSON.
- "predicted_stars" must be an integer from 1 to 5.
- "explanation" must be a short sentence explaining the rating.

JSON format:
{{
  "predicted_stars": <int>,
  "explanation": "<string>"
}}

Review:
"{review_text}"
"""


In [25]:
def run_prompt_v2(review_text):
    prompt = PROMPT_V2.format(review_text=review_text)
    return call_ollama(prompt)


In [27]:
v2_results = []

for _, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    raw_output = run_prompt_v2(row["review_text"])
    parsed_json, is_valid = parse_json_response(raw_output)

    predicted_stars = None
    explanation = None

    if is_valid:
        predicted_stars = parsed_json.get("predicted_stars")
        explanation = parsed_json.get("explanation")

    v2_results.append({
        "review_text": row["review_text"],
        "actual_stars": row["actual_stars"],
        "predicted_stars": predicted_stars,
        "json_valid": is_valid,
        "explanation": explanation
    })


100%|█████████████████████████████████████████| 200/200 [10:56<00:00,  3.28s/it]


In [28]:
v2_df = pd.DataFrame(v2_results)

v2_accuracy = (
    v2_df["predicted_stars"] == v2_df["actual_stars"]
).mean()

v2_json_validity = v2_df["json_valid"].mean()

v2_accuracy, v2_json_validity


(np.float64(0.14), np.float64(1.0))

## Prompt 3

In [32]:
PROMPT_V3 = """
You are an expert sentiment analyst for restaurant reviews.

Your task is to read a Yelp restaurant review and predict the star rating from 1 to 5.

Rating guidelines:
- 1 star: Very negative experience, strong complaints, not recommended.
- 2 stars: Mostly negative, significant issues, disappointment.
- 3 stars: Mixed or neutral experience, average or inconsistent.
- 4 stars: Mostly positive, good experience, minor issues.
- 5 stars: Very positive experience, highly satisfied, strongly recommended.

You MUST follow these rules:
- Return valid JSON only.
- Do NOT include any text outside the JSON.
- "predicted_stars" must be an integer from 1 to 5.
- "explanation" must briefly justify the rating.

Example 1:
Review: "The food was cold and the service was rude. I will not be coming back."
Output:
{{
  "predicted_stars": 1,
  "explanation": "The review describes a very negative experience with poor food quality and service."
}}

Example 2:
Review: "The food tasted great and the staff were friendly, but the wait time was a bit long."
Output:
{{
  "predicted_stars": 4,
  "explanation": "The experience was mostly positive despite a minor issue with wait time."
}}

Now analyze the following review.

Review:
"{review_text}"

Output JSON:
"""


In [33]:
def run_prompt_v3(review_text):
    prompt = PROMPT_V3.format(review_text=review_text)
    return call_ollama(prompt)


In [34]:
print(run_prompt_v3(df_sample.loc[0, "review_text"]))


{
  "predicted_stars": 2,
  "explanation": "The reviewer had mixed feelings about the experience, mentioning decent taste but poor variety and service."
}


In [35]:
v3_results = []

for _, row in tqdm(df_sample.iterrows(), total=len(df_sample)):
    raw_output = run_prompt_v3(row["review_text"])
    parsed_json, is_valid = parse_json_response(raw_output)

    predicted_stars = None
    explanation = None

    if is_valid:
        predicted_stars = parsed_json.get("predicted_stars")
        explanation = parsed_json.get("explanation")

    v3_results.append({
        "review_text": row["review_text"],
        "actual_stars": row["actual_stars"],
        "predicted_stars": predicted_stars,
        "json_valid": is_valid,
        "explanation": explanation
    })


100%|█████████████████████████████████████████| 200/200 [14:18<00:00,  4.29s/it]


In [36]:
v3_df = pd.DataFrame(v3_results)

v3_accuracy = (
    v3_df["predicted_stars"] == v3_df["actual_stars"]
).mean()

v3_json_validity = v3_df["json_valid"].mean()

v3_accuracy, v3_json_validity

(np.float64(0.135), np.float64(1.0))

## Comparison

In [39]:
comparison_df = pd.DataFrame({
    "Prompt Version": ["Baseline (V1)", "Strict JSON (V2)", "Examples + Guidelines (V3)"],
    "Accuracy": [baseline_accuracy, v2_accuracy, v3_accuracy],
    "JSON Validity": [baseline_json_validity, v2_json_validity, v3_json_validity]
})

comparison_df.style.format({
    "Accuracy": "{:.2%}",
    "JSON Validity": "{:.2%}"
})


Unnamed: 0,Prompt Version,Accuracy,JSON Validity
0,Baseline (V1),1.00%,4.50%
1,Strict JSON (V2),14.00%,100.00%
2,Examples + Guidelines (V3),13.50%,100.00%


In [40]:
def mean_absolute_star_distance(df):
    valid_df = df.dropna(subset=["predicted_stars"])
    return np.mean(
        np.abs(valid_df["predicted_stars"] - valid_df["actual_stars"])
    )


## Explanation

1. Baseline Prompt (V1)
The baseline prompt was intentionally minimal, resulting in extremely low accuracy (1%) and poor JSON validity (4%). This demonstrated that unconstrained prompts are unreliable for structured prediction tasks.
2. Prompt Version 2 (Strict JSON)
Introducing explicit role instructions and a strict JSON schema led to a dramatic improvement in JSON validity (100%) and a substantial increase in accuracy (~14%). This shows that output constraints are critical when using LLMs for structured prediction.
3. Prompt Version 3 (Examples + Guidelines)
Adding rating guidelines and few-shot examples maintained perfect JSON validity but did not significantly improve accuracy. This suggests that while examples improve consistency and explanation quality, they may also introduce bias depending on example selection. This highlights a trade-off between calibration and generalization.

## Extra: calculated mean square star distance 

In [41]:
baseline_distance = mean_absolute_star_distance(baseline_df)
v2_distance = mean_absolute_star_distance(v2_df)
v3_distance = mean_absolute_star_distance(v3_df)

baseline_distance, v2_distance, v3_distance


(np.float64(1.2222222222222223), np.float64(1.05), np.float64(1.11))

In [45]:
comparison_df = pd.DataFrame({
    "Prompt Version": [
        "Baseline (V1)",
        "Strict JSON (V2)",
        "Examples + Guidelines (V3)"
    ],
    "Accuracy": [
        baseline_accuracy,
        v2_accuracy,
        v3_accuracy
    ],
    "JSON Validity": [
        baseline_json_validity,
        v2_json_validity,
        v3_json_validity
    ],
    "Mean Absolute Star Distance": [
        baseline_distance,
        v2_distance,
        v3_distance
    ]
})

comparison_df.style.format({
    "Accuracy": "{:.2%}",
    "JSON Validity": "{:.2%}",
    "Mean Absolute Star Distance": "{:.2f}"
})


Unnamed: 0,Prompt Version,Accuracy,JSON Validity,Mean Absolute Star Distance
0,Baseline (V1),1.00%,4.50%,1.22
1,Strict JSON (V2),14.00%,100.00%,1.05
2,Examples + Guidelines (V3),13.50%,100.00%,1.11


| Prompt                         | What the numbers *mean*                                                                                                                                                 |
| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Baseline (V1)**              | Very low accuracy and JSON validity → confirms that unconstrained prompts are unreliable. Mean distance 1.22 indicates near-random guessing.                            |
| **Strict JSON (V2)**           | Massive jump in JSON validity (4.5% → 100%) and accuracy (1% → 14%). Mean distance drops to 1.05, showing improved calibration.                                         |
| **Examples + Guidelines (V3)** | Accuracy stays similar, JSON validity remains perfect, but mean distance slightly worsens (1.11). This suggests examples improved consistency but introduced mild bias. |
