In [1]:
import google.genai as genai
import pandas as pd
import json
import time
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
from dotenv import load_dotenv
import os
load_dotenv()


True

In [2]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=GEMINI_API_KEY)
print("Gemini client initialized successfully!")
print("Using model: gemini-2.5-flash")


Gemini client initialized successfully!
Using model: gemini-2.5-flash


In [3]:


df = pd.read_csv('yelp.csv') 
df_sample = df.sample(n=200, random_state=42).reset_index(drop=True)
print(f"Total reviews: {len(df_sample)}")
print(f"Columns: {df_sample.columns.tolist()}")
df_sample.head()


Total reviews: 200
Columns: ['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id', 'cool', 'useful', 'funny']


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,QVR7dsvBeg8xFt9B-vd1BA,2010-07-22,hwYVJs8Ko4PMjI19QcR57g,4,We got here around midnight last Friday... the...,review,90a6z--_CUrl84aCzZyPsg,5,5,2
1,24qSrF_XOrvaHDBy-gLIQg,2012-01-22,0mvthYPKb2ZmKhCADiKSmQ,5,Brought a friend from Louisiana here. She say...,review,9lJAj_2zCvP2jcEiRjF9oA,0,0,0
2,j0Uc-GuOe-x9_N_IK1KPpA,2009-05-09,XJHknNIecha6h0wkBSZB4w,3,"Every friday, my dad and I eat here. We order ...",review,0VfJi9Au0rVFVnPKcJpt3Q,0,0,0
3,RBiiGw8c7j-0a8nk35JO3w,2010-12-22,z6y3GRpYDqTznVe-0dn--Q,1,"My husband and I were really, really disappoin...",review,lwppVF0Yqkuwt-xaEuugqw,2,2,2
4,U8VA-RW6LYOhxR-Ygi6eDw,2011-01-17,vhWHdemMvsqVNv5zi2OMiA,5,Love this place! Was in phoenix 3 weeks for w...,review,Y2R_tlSk4lTHiLXTDsn1rg,0,1,0


In [4]:

PROMPT_1_TEMPLATE = """You are a rating prediction system. Based on the review text below, predict the star rating (1-5).

Review: "{review_text}"

Return ONLY a JSON object in this exact format:
{{"predicted_stars": <number>, "explanation": "<brief reason>"}}"""


PROMPT_2_TEMPLATE = """Analyze the following review step-by-step:

Review: "{review_text}"

Steps:
1. Identify the sentiment (positive, negative, neutral, mixed)
2. Look for specific indicators (complaints, praise, specific issues, enthusiasm level)
3. Based on these factors, determine the star rating (1-5)

Return ONLY a JSON object:
{{"predicted_stars": <number>, "explanation": "<reasoning based on sentiment and indicators>"}}"""


PROMPT_3_TEMPLATE = """You are an expert at predicting star ratings from reviews. Here are examples:

Example 1:
Review: "Absolutely amazing food! Best pizza I've ever had. Service was fantastic too."
Output: {{"predicted_stars": 5, "explanation": "Highly positive language with superlatives indicating excellent experience"}}

Example 2:
Review: "Food was okay, nothing special. Service took forever."
Output: {{"predicted_stars": 2, "explanation": "Mediocre food quality combined with poor service indicates below average experience"}}

Example 3:
Review: "Good food and decent prices. Could be better but satisfied overall."
Output: {{"predicted_stars": 4, "explanation": "Positive with minor reservations suggests good but not perfect experience"}}

Now predict for this review:
Review: "{review_text}"

Return ONLY a JSON object:
{{"predicted_stars": <number>, "explanation": "<brief reasoning>"}}"""

prompts = {
    "Prompt 1 (Basic)": PROMPT_1_TEMPLATE,
    "Prompt 2 (Chain-of-Thought)": PROMPT_2_TEMPLATE,
    "Prompt 3 (Few-Shot)": PROMPT_3_TEMPLATE
}


In [5]:
def predict_rating(review_text, prompt_template, max_retries=2):
    """Call Gemini API and return parsed JSON response"""
    prompt = prompt_template.format(review_text=review_text)
    
    for attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            response_text = response.text.strip()
            
            if "```json" in response_text:
                response_text = response_text.split("```json").split("```").strip()[1]
            elif "```" in response_text:
                response_text = response_text.split("```")[1].split("```")[0].strip()
            
            result = json.loads(response_text)
    
            if "predicted_stars" in result and "explanation" in result:
                return {
                    "predicted_stars": int(result["predicted_stars"]),
                    "explanation": result["explanation"],
                    "is_valid": True,
                    "raw_response": response_text
                }
            else:
                return {"is_valid": False, "error": "Missing required fields", "raw_response": response_text}
                
        except json.JSONDecodeError as e:
            if attempt == max_retries - 1:
                return {"is_valid": False, "error": f"JSON parse error: {str(e)}", "raw_response": response.text}
        except Exception as e:
            if attempt == max_retries - 1:
                return {"is_valid": False, "error": str(e), "raw_response": str(e)}
        
        time.sleep(1) 
    
    return {"is_valid": False, "error": "Max retries exceeded"}


In [6]:
def calculate_metrics(predictions):
    '''Calculate accuracy and other metrics from predictions'''
    valid_predictions = [p for p in predictions if p['is_valid']]
    
    if not valid_predictions:
        return {
            'accuracy': 0,
            'valid_count': 0,
            'total_count': len(predictions),
            'json_validity_rate': 0
        }
    
    actual = [p['actual_stars'] for p in valid_predictions]
    predicted = [p['predicted_stars'] for p in valid_predictions]
    
    accuracy = accuracy_score(actual, predicted)
    conf_matrix = confusion_matrix(actual, predicted, labels=[1, 2, 3, 4, 5])
    
    return {
        'accuracy': accuracy,
        'valid_count': len(valid_predictions),
        'total_count': len(predictions),
        'json_validity_rate': len(valid_predictions) / len(predictions),
        'confusion_matrix': conf_matrix,
        'actual': actual,
        'predicted': predicted
    }

def calculate_consistency(predictions):
    '''Calculate prediction consistency (std dev of errors)'''
    valid_predictions = [p for p in predictions if p['is_valid']]
    if not valid_predictions:
        return 0
    
    errors = [abs(p['predicted_stars'] - p['actual_stars']) for p in valid_predictions]
    return np.std(errors)

In [7]:
results = {}

for prompt_name, prompt_template in prompts.items():
    print(f"\n{'='*60}")
    print(f"Testing: {prompt_name}")
    print(f"{'='*60}")
    
    predictions = []
    valid_count = 0
    
    for idx, row in df_sample.iterrows():
        review_text = row['text']  # Adjust column name if needed
        actual_stars = row['stars']  # Adjust column name if needed
        
        result = predict_rating(review_text, prompt_template)
        
        result['actual_stars'] = actual_stars
        predictions.append(result)
        
        if result['is_valid']:
            valid_count += 1
        
        # Progress indicator
        if (idx + 1) % 20 == 0:
            print(f"Processed {idx + 1}/{len(df_sample)} reviews...")
        
        time.sleep(0.5)  # Rate limiting for free tier
    
    results[prompt_name] = predictions
    print(f"Completed! Valid JSON responses: {valid_count}/{len(df_sample)}")



Testing: Prompt 1 (Basic)
Processed 20/200 reviews...
Processed 40/200 reviews...
Processed 60/200 reviews...
Processed 80/200 reviews...
Processed 100/200 reviews...
Processed 120/200 reviews...
Processed 140/200 reviews...
Processed 160/200 reviews...
Processed 180/200 reviews...
Processed 200/200 reviews...
Completed! Valid JSON responses: 0/200

Testing: Prompt 2 (Chain-of-Thought)
Processed 20/200 reviews...
Processed 40/200 reviews...
Processed 60/200 reviews...
Processed 80/200 reviews...
Processed 100/200 reviews...
Processed 120/200 reviews...
Processed 140/200 reviews...
Processed 160/200 reviews...
Processed 180/200 reviews...
Processed 200/200 reviews...
Completed! Valid JSON responses: 0/200

Testing: Prompt 3 (Few-Shot)
Processed 20/200 reviews...
Processed 40/200 reviews...


KeyboardInterrupt: 

In [None]:

metrics_summary = {}

for prompt_name, predictions in results.items():
    metrics = calculate_metrics(predictions)
    consistency = calculate_consistency(predictions)
    
    metrics_summary[prompt_name] = {
        'Accuracy': f"{metrics['accuracy']:.2%}",
        'JSON Validity Rate': f"{metrics['json_validity_rate']:.2%}",
        'Valid Predictions': f"{metrics['valid_count']}/{metrics['total_count']}",
        'Consistency (Lower is Better)': f"{consistency:.3f}"
    }
    
    print(f"\n{'='*60}")
    print(f"{prompt_name} - Results")
    print(f"{'='*60}")
    print(f"Accuracy: {metrics['accuracy']:.2%}")
    print(f"JSON Validity: {metrics['json_validity_rate']:.2%}")
    print(f"Valid Predictions: {metrics['valid_count']}/{metrics['total_count']}")
    print(f"Consistency (Std Dev of Errors): {consistency:.3f}")

✓ Gemini client initialized successfully!
✓ Using model: gemini-2.5-flash


In [None]:
comparison_df = pd.DataFrame(metrics_summary).T
print("\n" + "="*80)
print("COMPARISON TABLE - All Prompts")
print("="*80)
print(comparison_df.to_string())
print("\n")

# Save results
comparison_df.to_csv('prompt_comparison_results.csv')
print("Results saved to 'prompt_comparison_results.csv'")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 10)

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Accuracy Comparison
prompt_names = list(metrics_summary.keys())
accuracies = [float(metrics_summary[p]['Accuracy'].strip('%'))/100 for p in prompt_names]
axes[0, 0].bar(range(len(prompt_names)), accuracies, color=['#3498db', '#e74c3c', '#2ecc71'])
axes[0, 0].set_xticks(range(len(prompt_names)))
axes[0, 0].set_xticklabels(prompt_names, rotation=15, ha='right')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].set_title('Accuracy Comparison Across Prompts')
axes[0, 0].set_ylim([0, 1])
for i, v in enumerate(accuracies):
    axes[0, 0].text(i, v + 0.02, f'{v:.2%}', ha='center', va='bottom', fontweight='bold')

# 2. JSON Validity Rate
validity_rates = [float(metrics_summary[p]['JSON Validity Rate'].strip('%'))/100 for p in prompt_names]
axes[0, 1].bar(range(len(prompt_names)), validity_rates, color=['#9b59b6', '#f39c12', '#1abc9c'])
axes[0, 1].set_xticks(range(len(prompt_names)))
axes[0, 1].set_xticklabels(prompt_names, rotation=15, ha='right')
axes[0, 1].set_ylabel('JSON Validity Rate')
axes[0, 1].set_title('JSON Validity Rate Comparison')
axes[0, 1].set_ylim([0, 1])
for i, v in enumerate(validity_rates):
    axes[0, 1].text(i, v + 0.02, f'{v:.2%}', ha='center', va='bottom', fontweight='bold')

# 3. Confusion Matrix for Best Performing Prompt
best_prompt = max(metrics_summary.keys(), key=lambda x: float(metrics_summary[x]['Accuracy'].strip('%')))
best_metrics = calculate_metrics(results[best_prompt])
sns.heatmap(best_metrics['confusion_matrix'], annot=True, fmt='d', cmap='Blues', 
            xticklabels=[1,2,3,4,5], yticklabels=[1,2,3,4,5], ax=axes[1, 0])
axes[1, 0].set_xlabel('Predicted Stars')
axes[1, 0].set_ylabel('Actual Stars')
axes[1, 0].set_title(f'Confusion Matrix - {best_prompt}')

# 4. Error Distribution
all_errors = []
all_labels = []
for prompt_name in prompt_names:
    valid_preds = [p for p in results[prompt_name] if p['is_valid']]
    errors = [abs(p['predicted_stars'] - p['actual_stars']) for p in valid_preds]
    all_errors.extend(errors)
    all_labels.extend([prompt_name] * len(errors))

error_df = pd.DataFrame({'Prompt': all_labels, 'Absolute Error': all_errors})
sns.boxplot(data=error_df, x='Prompt', y='Absolute Error', ax=axes[1, 1])
axes[1, 1].set_xticklabels(axes[1, 1].get_xticklabels(), rotation=15, ha='right')
axes[1, 1].set_title('Error Distribution Across Prompts')

plt.tight_layout()
plt.savefig('prompt_evaluation_results.png', dpi=300, bbox_inches='tight')
plt.show()

print("Visualization saved to 'prompt_evaluation_results.png'")

In [None]:
print("\n" + "="*80)
print("DETAILED ANALYSIS & DISCUSSION")
print("="*80)

print("\n### PROMPT DESIGN RATIONALE ###\n")

print("1. PROMPT 1 (Basic):")
print("   - Simple, direct instruction")
print("   - Minimal guidance to the model")
print("   - Tests baseline performance")
print("   - Expected: Fast but potentially less accurate")

print("\n2. PROMPT 2 (Chain-of-Thought):")
print("   - Guides model through reasoning steps")
print("   - Explicitly asks to identify sentiment and indicators")
print("   - Expected: Better reasoning, potentially higher accuracy")
print("   - Trade-off: Slightly longer processing time")

print("\n3. PROMPT 3 (Few-Shot):")
print("   - Provides concrete examples of rating patterns")
print("   - Shows model what good predictions look like")
print("   - Expected: Most consistent and accurate")
print("   - Trade-off: Longer prompt = higher token usage")

print("\n### RESULTS SUMMARY ###\n")

# Find best performing prompt for each metric
best_accuracy = max(metrics_summary.keys(), key=lambda x: float(metrics_summary[x]['Accuracy'].strip('%')))
best_validity = max(metrics_summary.keys(), key=lambda x: float(metrics_summary[x]['JSON Validity Rate'].strip('%')))
best_consistency = min(metrics_summary.keys(), key=lambda x: float(metrics_summary[x]['Consistency (Lower is Better)']))

print(f"Best Accuracy: {best_accuracy} ({metrics_summary[best_accuracy]['Accuracy']})")
print(f"Best JSON Validity: {best_validity} ({metrics_summary[best_validity]['JSON Validity Rate']})")
print(f"Best Consistency: {best_consistency} ({metrics_summary[best_consistency]['Consistency (Lower is Better)']})")

print("\n### KEY FINDINGS ###\n")
print("1. Accuracy: How well each prompt predicted the correct star rating")
print("2. JSON Validity: How reliably each prompt returned properly formatted JSON")
print("3. Consistency: How stable the predictions are (lower std dev = more reliable)")

print("\n### TRADE-OFFS ###\n")
print("- Basic Prompt: Fast, simple, but may lack nuance")
print("- Chain-of-Thought: Better reasoning, but requires more tokens")
print("- Few-Shot: Most accurate, but highest token cost and prompt complexity")

print("\n### RECOMMENDATIONS ###\n")
if best_accuracy == best_validity == best_consistency:
    print(f"✓ {best_accuracy} is the clear winner across all metrics")
else:
    print("✓ Choose based on priority:")
    print(f"  - For accuracy: {best_accuracy}")
    print(f"  - For reliability: {best_validity}")
    print(f"  - For consistency: {best_consistency}")

In [None]:
import json

with open('detailed_predictions.json', 'w') as f:
    json.dump(results, f, indent=2, default=str)

print("\nAll results saved!")
print("- prompt_comparison_results.csv")
print("- prompt_evaluation_results.png")
print("- detailed_predictions.json")