In [19]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from sklearn.metrics import accuracy_score

load_dotenv()


True

In [21]:
import os
import json

# On-disk location of the response cache, relative to the working directory.
CACHE_PATH = "llm_cache.json"

# Start with an empty in-memory cache; it is replaced below when a readable
# cache file already exists.
LLM_CACHE = {}

if os.path.exists(CACHE_PATH):
    with open(CACHE_PATH, "r") as cache_file:
        try:
            LLM_CACHE = json.load(cache_file)
        except json.JSONDecodeError:
            # An unparsable file is ignored rather than aborting the notebook.
            print("Cache file corrupted. Reinitializing cache.")
            LLM_CACHE = {}


In [22]:
import google.generativeai as genai

# Pull the API key from the process environment (load_dotenv() was called in
# the first cell, so values from a .env file are visible here).
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Fail fast when the key is missing or empty instead of failing on the first
# API call.
if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY not found in .env")

genai.configure(api_key=GEMINI_API_KEY)

# Shared model handle used by every LLM call in this notebook.
model = genai.GenerativeModel(model_name="gemini-flash-lite-latest")


In [23]:
# Load the review data; the path is relative to the notebook's working directory.
df = pd.read_csv("data/yelp.csv")

# Preview the first rows as a quick sanity check of the load.
df.head()


Unnamed: 0,text,stars
0,"A nice place to shop, but I wouldn't want to l...",3
1,The cheesy breadsticks were delicious. Everyt...,1
2,"Ok, so I have to share my morning experience. ...",1
3,"Two times a year, you can meander down Mill Av...",4
4,Older restrant but very good food,4


In [24]:
# Show the column labels of the loaded frame.
df.columns


Index(['text', 'stars'], dtype='object')

In [47]:
# Number of reviews drawn for the evaluation run.
SAMPLE_SIZE = 60

# Draw a fixed-seed sample, keep only the review text and its label, and
# renumber the rows from 0.
df_sample = (
    df.sample(SAMPLE_SIZE, random_state=42)[['text', 'stars']]
    .reset_index(drop=True)
)
df_sample.head()


Unnamed: 0,text,stars
0,I think everyone has said most everything that...,5
1,"This Chipotle Rocks! \nToday, I express my jo...",5
2,The only reason 1 star can be given is the fla...,1
3,I only ask for two things when I'm flying. To ...,1
4,"Ok, they should be happy that I am understandi...",3


In [48]:
import time

def call_llm(prompt, cache_key, delay_seconds=6.5):
    """Return the model's text response for `prompt`, memoized under `cache_key`.

    On a cache hit the stored text is returned immediately (no API call, no
    sleep). On a miss the prompt is sent to the module-level `model` with
    temperature 0, the response text is stored in `LLM_CACHE`, the whole cache
    is persisted to `CACHE_PATH`, and the function pauses before returning.

    Args:
        prompt: Prompt string passed to `model.generate_content`.
        cache_key: Key under which the response text is stored and looked up.
        delay_seconds: Pause after a real API call (default 6.5, the value that
            was previously hard-coded) — presumably to stay under the API rate
            limit; confirm against the quota in use.

    Returns:
        The response text, either from the cache or freshly generated.
    """
    if cache_key in LLM_CACHE:
        return LLM_CACHE[cache_key]

    response = model.generate_content(
        prompt,
        generation_config={"temperature": 0}
    )

    text = response.text

    LLM_CACHE[cache_key] = text

    # Persist via a temporary file and swap it in with os.replace so an
    # interruption mid-write cannot leave a truncated cache file behind; the
    # loader treats an unparsable cache as empty, which would throw away every
    # previously stored response.
    tmp_path = f"{CACHE_PATH}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(LLM_CACHE, f, indent=2)
    os.replace(tmp_path, CACHE_PATH)

    time.sleep(delay_seconds)

    return text


## Prompt Version 1: Basic Classification

This prompt directly asks the model to classify each review in the batch into a star rating.
It requests a JSON array with `predicted_stars` and `explanation`, but gives no example schema and places no explicit constraints on the values.


In [49]:
def prompt_v1_batch(reviews):
    """Build the baseline (V1) prompt for a batch of reviews.

    Each review is placed on its own line, prefixed with its 1-based position
    in `reviews`, and embedded in a minimal instruction asking for a JSON array
    of `predicted_stars` / `explanation` entries.

    Args:
        reviews: Iterable of review texts.

    Returns:
        The complete prompt string.
    """
    numbered_lines = (
        f"{position}. {review}"
        for position, review in enumerate(reviews, start=1)
    )
    numbered_reviews = "\n".join(numbered_lines)

    return f"""
Classify each Yelp review below into a star rating from 1 to 5.

Reviews:
{numbered_reviews}

Return the output as a JSON array.
Each element should contain:
- predicted_stars
- explanation

Output only JSON.
"""


## Prompt Version 2: Explicit Constraints & Schema

Changes:
- Explicitly restrict star values to integers 1–5
- Emphasize strict JSON output
- Reduce verbosity


In [50]:
def prompt_v2_batch(reviews):
    """Build the V2 prompt: explicit output rules plus a JSON format example.

    Reviews are listed one per line with a 1-based number, preceded by rules
    that restrict the rating to an integer 1–5 and require a JSON array in
    input order, and followed by a sample of the expected JSON shape.

    Args:
        reviews: Iterable of review texts.

    Returns:
        The complete prompt string.
    """
    numbered_lines = (
        f"{position}. {review}"
        for position, review in enumerate(reviews, start=1)
    )
    numbered_reviews = "\n".join(numbered_lines)

    return f"""
You are a sentiment analysis system.

For EACH review below, predict a Yelp star rating (1–5).

Rules:
- Output MUST be valid JSON
- Output MUST be an array
- Order must match input order
- Each item must contain:
  - predicted_stars (integer 1–5)
  - explanation (short)

Reviews:
{numbered_reviews}

Output JSON ONLY in this format:
[
  {{
    "predicted_stars": 5,
    "explanation": "reason"
  }}
]
"""


## Prompt Version 3: Few-Shot Learning

Changes:
- Provide examples
- Improve consistency and calibration
- Reduce hallucination


In [51]:
def prompt_v3_batch(reviews):
    """Build the V3 few-shot prompt for a batch of reviews.

    The prompt opens with two labelled example reviews (a 5 and a 1), states
    the output rules, lists the input reviews one per line with a 1-based
    number, and ends with a sample of the expected JSON shape.

    Args:
        reviews: Iterable of review texts.

    Returns:
        The complete prompt string.
    """
    numbered_lines = (
        f"{position}. {review}"
        for position, review in enumerate(reviews, start=1)
    )
    numbered_reviews = "\n".join(numbered_lines)

    return f"""
You are an expert Yelp review classifier.

Examples:
Review: "Amazing food and friendly staff."
Rating: 5

Review: "The service was slow and the food was cold."
Rating: 1

Now classify EACH review below.

Rules:
- Ratings must be integers from 1 to 5
- Output MUST be valid JSON
- Output MUST be an array
- Order MUST match input order
- No text outside JSON

Reviews:
{numbered_reviews}

Output JSON format:
[
  {{
    "predicted_stars": 5,
    "explanation": "Short justification"
  }}
]
"""


In [52]:
def batch_reviews(reviews, batch_size=10):
    """Yield consecutive slices of `reviews`, each at most `batch_size` long.

    Args:
        reviews: Sliceable sequence with a length.
        batch_size: Step between slice starts and maximum slice length.

    Yields:
        `reviews[start:start + batch_size]` for start = 0, batch_size, ...
    """
    starts = range(0, len(reviews), batch_size)
    yield from (reviews[start:start + batch_size] for start in starts)


In [53]:
def parse_response(response):
    """Parse a single-object JSON response into a star rating.

    The response must decode to a JSON object whose "predicted_stars" value is
    convertible with `int()` and lies in 1..5. Anything else — undecodable
    text, a JSON array (the shape the batch prompts in this notebook ask for),
    a missing key, a non-numeric or out-of-range value — is reported as
    invalid.

    Args:
        response: Raw response text.

    Returns:
        `(stars, True)` for a valid rating, otherwise `(None, False)`.
    """
    try:
        data = json.loads(response)
        stars = int(data["predicted_stars"])
    except (ValueError, TypeError, KeyError, OverflowError):
        # ValueError covers json.JSONDecodeError and int("abc"); TypeError
        # covers non-string input and non-dict payloads; OverflowError covers
        # int(inf). A bare `except:` would also have swallowed
        # KeyboardInterrupt and SystemExit.
        return None, False

    if 1 <= stars <= 5:
        return stars, True
    return None, False


In [57]:
def evaluate_prompt_batch(prompt_fn, df, prompt_name, batch_size=10):
    """Score one prompt template against labelled reviews, batch by batch.

    Reviews from `df["text"]` are sent to the LLM in batches; each response is
    expected to be a JSON array with one object per review, in input order,
    each holding an integer-convertible "predicted_stars" in 1..5.

    Every review contributes exactly one entry to the validity tally. A review
    is counted as invalid when its batch response is not decodable JSON, is
    not an array, has no item at the review's position, or has an item without
    a usable rating. Items beyond the batch length are ignored so they can
    never be matched against another batch's labels.

    Args:
        prompt_fn: Callable mapping a list of review texts to a prompt string.
        df: DataFrame with "text" and "stars" columns.
        prompt_name: Label used as the prefix of the cache keys.
        batch_size: Number of reviews per LLM call.

    Returns:
        `(accuracy, json_validity)`: accuracy is computed over the reviews
        that received a valid prediction (0.0 if there were none);
        json_validity is the fraction of all reviews that received a valid
        prediction (0.0 for an empty frame).
    """
    reviews = df["text"].tolist()
    actuals = df["stars"].tolist()

    y_true = []
    y_pred = []
    valids = []

    offset = 0  # position in `actuals` of the current batch's first review
    for batch_idx, batch in enumerate(batch_reviews(reviews, batch_size)):
        # NOTE(review): the key encodes only the prompt name and the batch
        # position, not the reviews themselves. call_llm returns any cached
        # entry for the key, so a different sample or batch_size will reuse
        # responses generated for other reviews unless the cache is cleared.
        cache_key = f"{prompt_name}_batch_{batch_idx}"
        output = call_llm(prompt_fn(batch), cache_key)

        try:
            parsed = json.loads(output)
        except json.JSONDecodeError:
            parsed = None
        # Anything other than a JSON array has no per-review items to read.
        items = parsed if isinstance(parsed, list) else []

        for i in range(len(batch)):
            star = _extract_stars(items[i]) if i < len(items) else None
            valids.append(star is not None)
            if star is not None:
                y_pred.append(star)
                y_true.append(actuals[offset + i])

        offset += len(batch)

    accuracy = accuracy_score(y_true, y_pred) if y_pred else 0.0
    json_validity = sum(valids) / len(valids) if valids else 0.0

    return accuracy, json_validity


def _extract_stars(item):
    """Return the rating in `item["predicted_stars"]` as an int in 1..5, else None."""
    try:
        star = int(item["predicted_stars"])
    except (KeyError, TypeError, ValueError, OverflowError):
        return None
    return star if 1 <= star <= 5 else None


In [58]:
# One row per prompt variant: [name, accuracy, JSON validity].
results = []

for name, fn in (
    ("Prompt_V1_Batch", prompt_v1_batch),
    ("Prompt_V2_Batch", prompt_v2_batch),
    ("Prompt_V3_Batch", prompt_v3_batch),
):
    # evaluate_prompt_batch returns the two scores as a pair; splice them in
    # after the variant name.
    results.append([name, *evaluate_prompt_batch(fn, df_sample, name, batch_size=10)])

# Tabulate the scores for side-by-side comparison.
results_df = pd.DataFrame(
    data=results,
    columns=["Prompt Version", "Accuracy", "JSON Validity"],
)

results_df


Unnamed: 0,Prompt Version,Accuracy,JSON Validity
0,Prompt_V1_Batch,0.0,0.0
1,Prompt_V2_Batch,0.0,0.0
2,Prompt_V3_Batch,0.8,1.0


In [59]:
# Rebuild the summary table from the collected rows, labelling the third
# column "JSON Validity Rate".
results_df = pd.DataFrame(
    data=results,
    columns=["Prompt Version", "Accuracy", "JSON Validity Rate"],
)

results_df


Unnamed: 0,Prompt Version,Accuracy,JSON Validity Rate
0,Prompt_V1_Batch,0.0,0.0
1,Prompt_V2_Batch,0.0,0.0
2,Prompt_V3_Batch,0.8,1.0
