# Task 1: Rating Prediction via Prompting

This notebook implements three different prompting approaches to classify Yelp reviews into star ratings (1-5) using the Google Gemini API.

## Objectives
- Implement 3 different prompting approaches
- Evaluate accuracy, JSON validity, and consistency
- Compare results across approaches

## Setup and Imports

In [None]:
import pandas as pd
import google.generativeai as genai
import json
import os
from typing import List, Dict
import sys
from dotenv import load_dotenv

# Load environment variables
# Prefer an explicit .env under <parent of cwd>/task1; otherwise let
# python-dotenv perform its default lookup.
env_path = os.path.join(os.path.dirname(os.getcwd()), 'task1', '.env')
if os.path.exists(env_path):
    try:
        load_dotenv(env_path)
    except Exception as dotenv_error:
        # load_dotenv could not process the file: fall back to a minimal manual
        # scan for the one key this notebook needs. `Exception` (not a bare
        # `except`) keeps KeyboardInterrupt/SystemExit propagating.
        print(f"Warning: load_dotenv failed for {env_path} ({dotenv_error}); "
              "reading GEMINI_API_KEY manually")
        try:
            with open(env_path, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.startswith('GEMINI_API_KEY='):
                        os.environ['GEMINI_API_KEY'] = line.split('=', 1)[1].strip()
                        break
        except (OSError, UnicodeDecodeError) as read_error:
            # Still best-effort, but report the failure instead of hiding it.
            print(f"Warning: could not read {env_path}: {read_error}")
else:
    load_dotenv()

# Add paths for imports
# Appends the parent directory of the current working directory to sys.path
# so that the two project-local imports below can be resolved.
# NOTE(review): presumably the `prompts` and `utils` packages live in that
# parent directory — confirm; this depends on where the kernel is started.
sys.path.append(os.path.dirname(os.path.abspath('.')))
from prompts.prompt_versions import get_direct_classification_prompt, get_few_shot_prompt, get_chain_of_thought_prompt
from utils.evaluation import evaluate_approach, calculate_accuracy, calculate_json_validity_rate

## Configure Gemini API

In [None]:
# Configure Gemini API
# The key is read from the environment only (populated from .env by the setup
# cell). A literal key must never be stored in the notebook.
# NOTE(review): a key was previously hard-coded here as a fallback; treat it as
# compromised and rotate it.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail fast with an actionable message rather than calling the API with a
    # missing or shared credential.
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-pro')
print("Gemini API configured successfully")

## Load Dataset

In [None]:
def load_yelp_data(file_path: str, sample_size: int = 200, random_state: int = 42) -> pd.DataFrame:
    """
    Loads and samples the Yelp reviews dataset.

    Args:
        file_path: Path to the CSV file to read.
        sample_size: Maximum number of rows to keep. When the file holds more
            rows than this, a random sample of exactly this size is drawn and
            the index is reset; otherwise all rows are returned.
        random_state: Seed passed to ``DataFrame.sample`` so the sample is
            reproducible.

    Returns:
        The (possibly sampled) DataFrame, or an empty DataFrame when the file
        is missing, cannot be read, or lacks the 'text' / 'stars' columns.
        Errors are reported via ``print`` rather than raised.
    """
    print(f"Loading data from {file_path}...")
    try:
        df = pd.read_csv(file_path)
        print(f"Total rows in dataset: {len(df)}")

        # Verify required columns before sampling, so an unusable file is
        # rejected without first reporting a "successful" sample.
        if 'text' not in df.columns or 'stars' not in df.columns:
            print("Error: Dataset must contain 'text' and 'stars' columns")
            print(f"Available columns: {df.columns.tolist()}")
            return pd.DataFrame()

        # Sample the data
        if len(df) > sample_size:
            df = df.sample(n=sample_size, random_state=random_state).reset_index(drop=True)
            print(f"Sampled {sample_size} reviews for evaluation")
        else:
            print(f"Using all {len(df)} reviews (less than sample size)")

        print(f"Successfully loaded {len(df)} reviews")
        return df
    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        print("Please ensure yelp.csv is in the data/ directory")
        return pd.DataFrame()
    except Exception as e:
        # Any other read/parse/sampling failure is reported and mapped to an
        # empty frame, which callers check via `df.empty`.
        print(f"Error loading dataset: {e}")
        return pd.DataFrame()

# Read the evaluation sample from the CSV expected under data/ and, when rows
# were loaded, show a short preview followed by the frame's dimensions.
data_file = "data/yelp.csv"
df = load_yelp_data(data_file)

if not df.empty:
    print("\nDataset preview:", df.head(), f"\nDataset shape: {df.shape}", sep="\n")

## Prompting Approaches

We implement three different prompting strategies:

1. **Direct Classification**: Simple instruction-based prompt
2. **Few-Shot Learning**: Includes examples in the prompt
3. **Chain-of-Thought**: Step-by-step reasoning approach

### Approach 1: Direct Classification

This is the simplest approach - directly asking the model to classify the review.

In [None]:
# Build the Direct Classification prompt for one sample review and show it
# under a banner so the template text can be inspected.
example_review = "Amazing food and great service! Highly recommend."
example_prompt = get_direct_classification_prompt(example_review)
print("Example Direct Classification Prompt:", "=" * 60, example_prompt, sep="\n")

### Approach 2: Few-Shot Learning

This approach provides examples to guide the model's understanding.

In [None]:
# Build the Few-Shot prompt for the sample review defined in the Direct
# Classification example cell and show it under a banner.
example_prompt = get_few_shot_prompt(example_review)
print("Example Few-Shot Learning Prompt:", "=" * 60, example_prompt, sep="\n")

### Approach 3: Chain-of-Thought

This approach asks the model to reason through the classification step by step.

In [None]:
# Build the Chain-of-Thought prompt for the sample review defined in the
# Direct Classification example cell and show it under a banner.
example_prompt = get_chain_of_thought_prompt(example_review)
print("Example Chain-of-Thought Prompt:", "=" * 60, example_prompt, sep="\n")

## Evaluation Functions

In [None]:
def predict_rating(prompt: str) -> str:
    """
    Submit a prompt to the configured Gemini model and return its text reply.

    Any exception raised while generating content or reading the reply's
    ``text`` is printed and converted into an empty string, so callers always
    receive a ``str``.
    """
    try:
        return model.generate_content(prompt).text
    except Exception as api_error:
        print(f"Error calling API: {api_error}")
    return ""

def run_evaluation(df: pd.DataFrame, approach_name: str, prompt_func) -> Dict:
    """
    Runs evaluation for a single prompting approach.

    For every usable row, builds a prompt with ``prompt_func``, sends it via
    ``predict_rating`` and collects the raw response alongside the true rating;
    the collected lists are then scored by ``evaluate_approach``.

    Args:
        df: Reviews to evaluate; rows are read through the 'text' and 'stars'
            columns (a missing 'stars' column falls back to a rating of 3).
        approach_name: Label stored in the result under 'approach_name'.
        prompt_func: Callable taking the review text and returning the prompt.

    Returns:
        The dict produced by ``evaluate_approach`` with 'approach_name' added.

    Rows whose text is empty/whitespace/NaN, or whose rating cannot be
    converted to an integer, are skipped and not sent to the API.
    """
    print(f"\nEvaluating {approach_name} approach...")
    responses = []
    actual_ratings = []
    skipped = 0
    total = len(df)

    # A positional counter keeps progress reporting correct even when the
    # frame's index labels are not 0..n-1.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        review_text = str(row.get('text', ''))

        # Skip if review text is empty or invalid ('nan' is what str() yields
        # for a missing value).
        if not review_text.strip() or review_text == 'nan':
            skipped += 1
            continue

        # A missing or non-numeric rating cannot be scored; skip the row rather
        # than aborting the whole evaluation run.
        try:
            actual_rating = int(row.get('stars', 3))
        except (TypeError, ValueError, OverflowError):
            skipped += 1
            continue

        prompt = prompt_func(review_text)
        response = predict_rating(prompt)

        responses.append(response)
        actual_ratings.append(actual_rating)

        if position % 20 == 0:
            print(f"Processed {position}/{total} reviews...")

    if skipped:
        print(f"Skipped {skipped} reviews with missing text or rating")

    results = evaluate_approach(responses, actual_ratings)
    results['approach_name'] = approach_name
    return results

In [None]:
# Define the results list before branching so that later cells reading
# `all_results` still run (with nothing to show) when the dataset is empty.
all_results = []

if df.empty:
    print("Dataset is empty. Please ensure yelp.csv is in the data/ directory")
else:
    # Define approaches: (display name, prompt-building function)
    approaches = [
        ("Direct Classification", get_direct_classification_prompt),
        ("Few-Shot Learning", get_few_shot_prompt),
        ("Chain-of-Thought", get_chain_of_thought_prompt)
    ]

    # Run evaluations, one full pass over the dataset per approach
    for approach_name, prompt_func in approaches:
        results = run_evaluation(df, approach_name, prompt_func)
        all_results.append(results)

    print("\n" + "=" * 60)
    print("All evaluations completed!")
    print("=" * 60)

## Results and Comparison

In [None]:
# Print comparison table
table_header = f"{'Approach':<25} {'Accuracy':<12} {'JSON Validity':<15} {'Valid Predictions':<20}"
print("\n" + "=" * 60)
print("COMPARISON TABLE")
print("=" * 60)
print(table_header)
# Rule under the column titles spans the full header width.
print("-" * len(table_header))

for result in all_results:
    # Format each percentage first, then pad the finished text to the header's
    # column width, so values of different lengths stay aligned.
    accuracy_cell = f"{result['accuracy']:.2f}%"
    json_validity_cell = f"{result['json_validity_rate']:.2f}%"
    print(f"{result['approach_name']:<25} "
          f"{accuracy_cell:<12} "
          f"{json_validity_cell:<15} "
          f"{result['valid_predictions_count']:<20}")

## Discussion and Analysis

In [None]:
# Static discussion text: banner, key findings, trade-offs and recommendations.
# Entries that start with "\n" produce the blank line before each section.
discussion_lines = [
    "\n" + "=" * 60,
    "DISCUSSION",
    "=" * 60,
    "\nKey Findings:",
    "1. Direct Classification: Simple and fast, but may lack context.",
    "2. Few-Shot Learning: Provides examples to guide the model.",
    "3. Chain-of-Thought: Encourages step-by-step reasoning.",
    "\nTrade-offs:",
    "- More complex prompts may improve accuracy but increase API costs.",
    "- JSON validity is crucial for production use.",
    "- Consistency across runs indicates reliability.",
    "\nRecommendations:",
    "- For production: Choose approach with best balance of accuracy and JSON validity",
    "- For cost optimization: Consider simpler prompts if accuracy difference is minimal",
    "- For reliability: Test consistency across multiple runs",
]
print("\n".join(discussion_lines))