# TASK 1: Yelp Rating Prediction via Prompting (OpenRouter Version)

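This notebook predicts Yelp star ratings (1-5) from review text by prompting free LLMs served through OpenRouter.
It compares three prompting strategies (zero-shot, few-shot, and chain-of-thought) on accuracy, mean absolute error, and JSON validity.
You can experiment with different free models available on OpenRouter by changing `SELECTED_MODEL`.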

## 1. IMPORTS AND SETUP


In [None]:
import pandas as pd
import numpy as np
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple
from sklearn.metrics import confusion_matrix, mean_absolute_error, accuracy_score
import requests
from collections import Counter

## 2. OPENROUTER API CONFIGURATION

In [None]:
from dotenv import load_dotenv
import os

# Load environment variables (python-dotenv), so the key can live in a local .env file
load_dotenv()

# API credential read from the environment; None when the variable is not set
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

# Chat-completions endpoint that every request in this notebook is posted to
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# Other model identifiers you can try (kept for reference; copy one into SELECTED_MODEL)
# AVAILABLE_FREE_MODELS = {
#     "mistral-7b": "mistralai/mistral-7b-instruct:free",
#     "openchat-7b": "openchat/openchat-7b:free",
#     "mythomax-13b": "gryphe/mythomax-l2-13b:free",
#     "toppy-7b": "undi95/toppy-m-7b:free",
#     "cinematika-7b": "openrouter/cinematika-7b:free",
#     "gemma-7b": "google/gemma-7b-it:free",
#     "mythomist-7b": "gryphe/mythomist-7b:free",
# }

# Model identifier sent with every request; change this to experiment
SELECTED_MODEL = "mistralai/mistral-7b-instruct:free"

print(f"‚úÖ Using OpenRouter model: {SELECTED_MODEL}")

## 3. OPENROUTER API HELPER FUNCTIONS

In [None]:
def call_openrouter_api(prompt: str, temperature: float = 0.1, max_tokens: int = 300) -> str:
    """Send a single-turn chat request to OpenRouter and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Generation limit forwarded to the API.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        Exception: If no API key is configured, if the HTTP request fails
            (connection error, timeout, non-2xx status, non-JSON body), or if
            the response does not contain a string at
            ``choices[0].message.content``. The original error is chained as
            ``__cause__``.
    """
    # Fail fast with a clear message instead of sending "Bearer None" and getting an auth error
    if not OPENROUTER_API_KEY:
        raise Exception("OPENROUTER_API_KEY is not set; add it to your environment or .env file")

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }

    data = {
        "model": SELECTED_MODEL,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        response = requests.post(
            OPENROUTER_API_URL,
            headers=headers,
            json=data,
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        raise Exception(f"API request failed: {str(e)}") from e
    except ValueError as e:
        # Body was not JSON and the installed requests version reports that as a plain ValueError
        raise Exception(f"Unexpected API response format: {str(e)}") from e

    # Extract the response text; a missing key, an empty `choices` list, or a
    # non-dict payload are all treated as the same "unexpected format" error
    try:
        content = result['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError) as e:
        raise Exception(f"Unexpected API response format: {str(e)}") from e

    # Honour the declared `-> str` contract so callers can safely call .strip()
    if not isinstance(content, str):
        raise Exception("Unexpected API response format: message content is not a string")

    return content


def test_api_connection():
    """Send one small request through ``call_openrouter_api`` and report the outcome.

    Returns:
        True when the call returned without raising; False otherwise, after
        printing the error and a short list of setup tips.
    """
    print("\nüîç Testing OpenRouter API connection...")

    probe_prompt = "Say 'Hello' in JSON format: {\"message\": \"your message\"}"
    failure_tips = (
        "\nüí° Tips:",
        "1. Get free API key from: https://openrouter.ai/keys",
        "2. Update OPENROUTER_API_KEY in this notebook",
        "3. Make sure you have credits (free tier included)",
    )

    try:
        reply = call_openrouter_api(probe_prompt, temperature=0.0, max_tokens=50)
        print(f"‚úÖ API connection successful!")
        print(f"Test response: {reply}")
        return True
    except Exception as err:
        print(f"‚ùå API connection failed: {str(err)}")
        for tip_line in failure_tips:
            print(tip_line)
        return False

# Check the API once up front so configuration problems surface before any evaluation runs
connection_ok = test_api_connection()
if not connection_ok:
    print("\n‚ö†Ô∏è Please fix API configuration before continuing!")

## 4. DATA LOADING AND PREPROCESSING

In [None]:
def load_and_sample_data(filepath: str, sample_size: int = 250) -> pd.DataFrame:
    """Read the review CSV and draw a shuffled sample balanced across star ratings.

    Args:
        filepath: Path to a CSV file that has a ``stars`` column.
        sample_size: Target total number of rows. Each rating 1-5 contributes
            ``sample_size // 5`` rows, or all of its rows if it has fewer.

    Returns:
        The sampled rows in shuffled order with a fresh 0..n-1 index.
    """
    print(f"\nüìÇ Loading data from {filepath}...")
    reviews = pd.read_csv(filepath)

    print(f"Dataset shape: {reviews.shape}")
    print(f"Columns: {reviews.columns.tolist()}")
    print(f"\nRating distribution:")
    print(reviews['stars'].value_counts().sort_index())

    # Equal quota for each of the five star ratings
    quota = sample_size // 5

    def _sample_rating(rating):
        # Fixed seed keeps the draw reproducible; cap at what is available
        subset = reviews[reviews['stars'] == rating]
        return subset.sample(n=min(quota, len(subset)), random_state=42)

    balanced = pd.concat(
        [_sample_rating(rating) for rating in range(1, 6)],
        ignore_index=True,
    )

    # Shuffle so the ratings are interleaved rather than grouped
    sample_df = balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"\n‚úÖ Sampled {len(sample_df)} reviews")
    print(f"Sample rating distribution:")
    print(sample_df['stars'].value_counts().sort_index())

    return sample_df

# Load the balanced sample (update the path to point at your copy of the dataset)
df = load_and_sample_data('data/yelp.csv', sample_size=250)

# Preview up to three sampled reviews. Iterating over head(3) avoids an
# IndexError when the sample has fewer than three rows, and str() guards
# against non-string text values such as NaN.
print("\nüìù Sample reviews:")
for _, preview_row in df.head(3).iterrows():
    print(f"\nRating: {preview_row['stars']} stars")
    print(f"Review: {str(preview_row['text'])[:200]}...")

## 5. PROMPTING APPROACHES

In [None]:
class PromptApproach:
    """Base class for a prompting strategy that predicts a 1-5 star rating.

    Subclasses implement ``create_prompt``. ``predict`` sends that prompt to
    the model through ``call_openrouter_api`` and parses the JSON reply.
    """

    def __init__(self, name: str, description: str):
        """Store the strategy's display name and short description."""
        self.name = name
        self.description = description

    def create_prompt(self, review_text: str) -> str:
        """Build the full prompt for ``review_text``; subclasses must override this."""
        raise NotImplementedError

    @staticmethod
    def _failure(explanation: str) -> Dict:
        """Build the result dict used for every unsuccessful prediction."""
        return {"predicted_stars": None, "explanation": explanation, "valid_json": False}

    @staticmethod
    def _strip_code_fence(response_text: str) -> str:
        """Return the contents of the first Markdown code fence, or the text unchanged.

        If the closing fence is missing, everything after the opening fence is
        used; slicing to ``find()``'s -1 would silently drop the last character.
        """
        for fence in ('```json', '```'):
            start = response_text.find(fence)
            if start == -1:
                continue
            start += len(fence)
            end = response_text.find('```', start)
            if end == -1:
                end = len(response_text)
            return response_text[start:end].strip()
        return response_text

    @staticmethod
    def _load_json(text: str):
        """Parse ``text`` as JSON, falling back to the outermost ``{...}`` span.

        The fallback covers replies where the model wraps the JSON object in
        extra prose. Raises ``json.JSONDecodeError`` if neither attempt parses.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            first, last = text.find('{'), text.rfind('}')
            if first == -1 or last <= first:
                raise
            return json.loads(text[first:last + 1])

    @classmethod
    def _parse_response(cls, response_text: str) -> Dict:
        """Turn raw model output into a prediction dict.

        Returns:
            A dict with ``predicted_stars`` (int clamped to 1-5, or None),
            ``explanation`` (str) and ``valid_json`` (bool).
        """
        try:
            result = cls._load_json(cls._strip_code_fence(response_text.strip()))
        except json.JSONDecodeError as e:
            return cls._failure(f"JSON parsing failed: {str(e)}")

        # Validate the result: it must be an object carrying the rating key
        if not isinstance(result, dict) or 'predicted_stars' not in result:
            return cls._failure("Invalid JSON structure")

        predicted_stars = result['predicted_stars']
        # bool is a subclass of int, so reject it explicitly (true/false is not a rating)
        if isinstance(predicted_stars, bool) or not isinstance(predicted_stars, (int, float)):
            return cls._failure("predicted_stars must be a number")

        try:
            predicted_stars = int(predicted_stars)
        except (ValueError, OverflowError):
            # NaN / Infinity are accepted by json.loads but cannot become an int
            return cls._failure("predicted_stars must be a number")

        # Ensure rating is between 1-5
        predicted_stars = max(1, min(5, predicted_stars))

        return {
            "predicted_stars": predicted_stars,
            "explanation": result.get('explanation', 'No explanation provided'),
            "valid_json": True
        }

    def predict(self, review_text: str, temperature: float = 0.1) -> Dict:
        """Predict the star rating for one review.

        Args:
            review_text: The review to rate.
            temperature: Sampling temperature passed to ``call_openrouter_api``.

        Returns:
            A dict with ``predicted_stars``, ``explanation`` and ``valid_json``.
            Any error (prompt construction, API call, parsing) is reported in
            ``explanation`` with ``valid_json`` False; this method does not raise.
        """
        try:
            # Create the prompt
            prompt = self.create_prompt(review_text)

            # Verify prompt is a string
            if not isinstance(prompt, str):
                raise TypeError(f"Prompt must be a string, got {type(prompt)}")

            # Call OpenRouter API
            response_text = call_openrouter_api(prompt, temperature=temperature, max_tokens=300)
            response_text = response_text.strip()
            # Echo the raw reply so parsing failures can be inspected
            print(response_text)

            return self._parse_response(response_text)

        except Exception as e:
            return self._failure(f"Error: {str(e)}")

### Approach 1: Zero-Shot Prompt

In [None]:
class ZeroShotApproach(PromptApproach):
    """Prompting strategy that gives the model rating guidelines but no examples."""

    def __init__(self):
        """Register the strategy's display name and description."""
        super().__init__(
            name = "Zero-Shot Prompt",
            description = "Simple, direct instruction with no examples"
        )

    def create_prompt(self, review_text: str) -> str:
        """Return the zero-shot prompt with ``review_text`` substituted in.

        The template is a ``str.format`` pattern: ``{review_text}`` is the
        placeholder and the doubled braces render as the literal braces of the
        JSON skeleton shown to the model.
        """
        zero_shot_prompt = """
        You are a rating prediction system for Yelp reviews. Analyze the following review and predict the star rating (1-5 stars).

        Rating Guidelines:
        - 5 stars: Excellent, highly positive
        - 4 stars: Good, mostly positive with minor issues
        - 3 stars: Average, mixed feelings
        - 2 stars: Poor, mostly negative with some positives
        - 1 star: Terrible, extremely negative

        Review: "{review_text}"

        Return your response in this EXACT JSON format (no markdown, no extra text):
        {{
          "predicted_stars": <number between 1-5>,
          "explanation": "<brief reasoning in 1-2 sentences>"
        }}
        """

        # Fill in the placeholder; returning the raw template would send the
        # literal text "{review_text}" to the model instead of the review
        return zero_shot_prompt.format(review_text=review_text)

### Approach 2: Few-Shot Prompt

In [None]:
class FewShotApproach(PromptApproach):
    """Prompting strategy that shows one worked example per star rating before the review."""

    def __init__(self):
        """Register the strategy's display name and description."""
        super().__init__(
            name = "Few Shot With Examples",
            description = "Provides 5 examples (one for each rating) to guide the model"
        )

    def create_prompt(self, review_text: str) -> str:
        """Return the few-shot prompt with ``review_text`` substituted in.

        The template is a ``str.format`` pattern: ``{review_text}`` is the
        placeholder and the doubled braces render as the literal braces of the
        example JSON objects.
        """
        few_shot_prompt = """
        You are a rating prediction system. Learn from these examples:

        Example 1 - 5 stars:
        Review: "Absolutely amazing experience! The food was outstanding, service was impeccable, and the atmosphere was perfect. Best restaurant in town!"
        Rating: {{"predicted_stars": 5, "explanation": "Extremely positive language with multiple superlatives and no complaints"}}

        Example 2 - 4 stars:
        Review: "Really good food and nice staff. The wait was a bit long but overall a great experience. Would definitely come back."
        Rating: {{"predicted_stars": 4, "explanation": "Positive overall with one minor negative aspect mentioned"}}

        Example 3 - 3 stars:
        Review: "The food was decent but nothing special. Service was okay. It's fine for a quick meal but I wouldn't go out of my way to come here."
        Rating: {{"predicted_stars": 3, "explanation": "Neutral language with mixed sentiments, neither strongly positive nor negative"}}

        Example 4 - 2 stars:
        Review: "Pretty disappointed. The food was cold and the service was slow. A few items were good but mostly not worth the price."
        Rating: {{"predicted_stars": 2, "explanation": "Predominantly negative with slight positive mention, expressing disappointment"}}

        Example 5 - 1 star:
        Review: "Horrible experience. Rude staff, terrible food, dirty environment. Complete waste of money. Never coming back!"
        Rating: {{"predicted_stars": 1, "explanation": "Extremely negative with multiple serious complaints and no positive aspects"}}

        Now analyze this review:
        Review: "{review_text}"

        Return ONLY a JSON object in the exact same format (no markdown, no extra text):
        {{"predicted_stars": <1-5>, "explanation": "<brief reasoning>"}}
        """

        # Fill in the placeholder; returning the raw template would send the
        # literal text "{review_text}" to the model instead of the review
        return few_shot_prompt.format(review_text=review_text)

### Approach 3: Chain of Thought

In [None]:
class ChainOfThoughtApproach(PromptApproach):
    """Prompting strategy that asks the model to reason through fixed steps before rating."""

    def __init__(self):
        """Register the strategy's display name and description."""
        super().__init__(
            name = "Chain of Thought (CoT)",
            description = "Step-by-step reasoning before final prediction"
        )

    def create_prompt(self, review_text: str) -> str:
        """Return the chain-of-thought prompt with ``review_text`` substituted in.

        The template is a ``str.format`` pattern: ``{review_text}`` is the
        placeholder and the doubled braces render as the literal braces of the
        JSON skeleton shown to the model.
        """
        cot_prompt = """
        You are an expert at analyzing Yelp reviews. Use step-by-step reasoning to predict the star rating.

        Review to analyze: "{review_text}"

        Follow these steps:
        1. Identify all POSITIVE aspects mentioned (quality, service, atmosphere, value, etc.)
        2. Identify all NEGATIVE aspects or complaints
        3. Assess the overall sentiment intensity (mild, moderate, strong, extreme)
        4. Consider the language used (neutral, emotional, superlatives, etc.)
        5. Determine if there are any deal-breakers or exceptional highlights

        Based on your analysis:
        - 5 stars: Overwhelmingly positive, exceptional experience
        - 4 stars: Very positive, minor issues don't overshadow the good
        - 3 stars: Balanced or neutral, significant pros and cons
        - 2 stars: Predominantly negative, few redeeming qualities  
        - 1 star: Extremely negative, multiple serious problems

        After analyzing, provide your rating in this EXACT JSON format (no markdown):
        {{
          "predicted_stars": <number between 1-5>,
          "explanation": "<2-3 sentence explanation of your reasoning>"
        }}
        """

        # Fill in the placeholder; returning the raw template would send the
        # literal text "{review_text}" to the model instead of the review
        return cot_prompt.format(review_text=review_text)

## 6. EVALUATION FRAMEWORK

In [None]:
def evaluate_approach(approach: PromptApproach, df: pd.DataFrame, sample_size: int = None,
                      request_delay: float = 1.0) -> Dict:
    """Run one prompting approach over a set of reviews and score its predictions.

    Args:
        approach: Strategy whose ``predict`` method is called once per review.
        df: Reviews with ``text`` and ``stars`` columns.
        sample_size: If truthy, evaluate on a reproducible random subset of at
            most this many rows; otherwise use every row of ``df``.
        request_delay: Seconds to sleep after each prediction as rate limiting.
            Values of 0 or less disable the pause.

    Returns:
        A dict with ``approach_name``, ``predictions`` (DataFrame with one row
        per review) and ``metrics``. ``metrics`` is an empty dict when no
        prediction was valid. Otherwise it holds accuracy, MAE, JSON validity
        rate (%), average seconds per prediction (excluding the rate-limit
        pause), prediction counts, and per-rating accuracy.
    """
    print(f"\n{'='*70}")
    print(f"üîÑ Evaluating: {approach.name}")
    print(f"Description: {approach.description}")
    print(f"{'='*70}")

    # Use subset if specified; cap at the number of rows so sample() cannot raise
    if sample_size:
        eval_df = df.sample(n=min(sample_size, len(df)), random_state=42)
    else:
        eval_df = df
    total = len(eval_df)

    predictions = []
    prediction_seconds = 0.0  # time spent inside predict() only
    start_time = time.time()

    # Count by position: after sampling, the index labels are shuffled and
    # are not a usable progress counter
    for position, (_, row) in enumerate(eval_df.iterrows(), start=1):
        review_text = str(row['text'])  # Ensure it's a string
        actual_rating = int(row['stars'])  # Ensure it's an int

        print(f"\nüìù Processing review {position}/{total}...")
        print(f"Review preview: {review_text[:100]}...")

        # Make prediction
        call_started = time.time()
        try:
            result = approach.predict(review_text)
            print(f"‚úÖ Prediction: {result['predicted_stars']} stars")
        except Exception as e:
            print(f"‚ùå Error during prediction: {str(e)}")
            result = {
                'predicted_stars': None,
                'explanation': f"Prediction error: {str(e)}",
                'valid_json': False
            }
        prediction_seconds += time.time() - call_started

        predictions.append({
            'actual_stars': actual_rating,
            'predicted_stars': result['predicted_stars'],
            'explanation': result['explanation'],
            'review_text': review_text[:100] + '...',
            'valid_json': result['valid_json']
        })

        # Rate limiting - be nice to the API
        if request_delay > 0:
            time.sleep(request_delay)

    elapsed_time = time.time() - start_time

    # Explicit columns keep the frame well-formed even when nothing was evaluated
    pred_df = pd.DataFrame(
        predictions,
        columns=['actual_stars', 'predicted_stars', 'explanation', 'review_text', 'valid_json']
    )

    # Filter out invalid predictions for accuracy calculation
    valid_preds = pred_df[pred_df['valid_json'].astype(bool)].copy()

    if len(valid_preds) == 0:
        print("‚ùå No valid predictions generated!")
        return {
            'approach_name': approach.name,
            'predictions': pred_df,
            'metrics': {}
        }

    # The column is float-typed whenever any prediction was None; valid rows
    # hold whole numbers, so compare as integers
    actual = valid_preds['actual_stars'].astype(int)
    predicted = valid_preds['predicted_stars'].astype(int)

    accuracy = accuracy_score(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    json_validity_rate = (len(valid_preds) / len(predictions)) * 100
    # Average model latency; the fixed rate-limit pause is deliberately excluded
    avg_time_per_prediction = prediction_seconds / len(predictions)

    # Calculate per-rating accuracy (ratings with no valid predictions are omitted)
    per_rating_accuracy = {}
    for rating in range(1, 6):
        rating_mask = actual == rating
        if rating_mask.any():
            per_rating_accuracy[rating] = (predicted[rating_mask] == rating).mean()

    metrics = {
        'accuracy': accuracy,
        'mae': mae,
        'json_validity_rate': json_validity_rate,
        'avg_time_per_prediction': avg_time_per_prediction,
        'total_predictions': len(predictions),
        'valid_predictions': len(valid_preds),
        'per_rating_accuracy': per_rating_accuracy
    }

    # Print results
    print(f"\nüìä Results for {approach.name}:")
    print(f"  Accuracy: {accuracy:.2%}")
    print(f"  MAE: {mae:.3f} stars")
    print(f"  JSON Validity Rate: {json_validity_rate:.1f}%")
    print(f"  Avg Time per Prediction: {avg_time_per_prediction:.2f}s")
    print(f"  Total Time: {elapsed_time:.1f}s")

    return {
        'approach_name': approach.name,
        'predictions': pred_df,
        'metrics': metrics
    }

## 7. RUN ALL EVALUATIONS

In [None]:
# Initialize approaches
approaches = [
    ZeroShotApproach(),
    FewShotApproach(),
    ChainOfThoughtApproach()
]

# Make sure the output folder exists; to_json fails when the target
# directory is missing
os.makedirs("results", exist_ok=True)

# Evaluate all approaches
# Note: Using 200 reviews as recommended, change to None to use all 250
results = []

for approach in approaches:
    result = evaluate_approach(approach, df, sample_size=200)
    results.append(result)

    # Save predictions to JSON (file name derived from the approach name)
    filename = f"results/predictions_{approach.name.lower().replace(' ', '_').replace('-', '_')}.json"
    result['predictions'].to_json(filename, orient='records', indent=2)
    print(f"üíæ Saved predictions to {filename}")

print("\n‚úÖ All evaluations complete!")

## 8. COMPARISON

In [None]:
def create_comparison_table(results: List[Dict]) -> pd.DataFrame:
    """Summarise evaluation results as one formatted row per approach.

    Args:
        results: Dicts with ``approach_name`` and ``metrics`` keys, as
            returned by ``evaluate_approach``.

    Returns:
        A DataFrame of display-ready strings (accuracy, MAE, JSON validity,
        average time) plus the valid-prediction count. Entries whose
        ``metrics`` is empty are skipped.
    """
    table_rows = [
        {
            'Approach': entry['approach_name'],
            # Show only the part of the model id after the provider prefix
            'Model': SELECTED_MODEL.split('/')[-1],
            'Accuracy (%)': f"{entry['metrics']['accuracy']*100:.2f}",
            'MAE': f"{entry['metrics']['mae']:.3f}",
            'JSON Validity (%)': f"{entry['metrics']['json_validity_rate']:.1f}",
            'Avg Time (s)': f"{entry['metrics']['avg_time_per_prediction']:.2f}",
            'Valid Predictions': entry['metrics']['valid_predictions']
        }
        for entry in results
        if entry['metrics']
    ]

    return pd.DataFrame(table_rows)

comparison_table = create_comparison_table(results)

# Show the table framed by 80-character rules
rule = "=" * 80
print("\n" + rule)
print("üìä COMPARISON TABLE")
print(rule)
print(comparison_table.to_string(index=False))
print(rule)

# Write the same table to CSV alongside the other outputs
comparison_table.to_csv('results/comparison_results_openrouter.csv', index=False)
print("\nüíæ Saved comparison table to results/comparison_results_openrouter.csv")

## 9. VISUALIZATIONS

In [None]:
# Only approaches that produced metrics can be charted
scored_results = [r for r in results if r['metrics']]

if scored_results:
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle(f'Yelp Rating Prediction - OpenRouter ({SELECTED_MODEL.split("/")[-1]})',
                 fontsize=16, fontweight='bold')

    # Series for the three bar charts on the top row
    approach_names = [r['approach_name'] for r in scored_results]
    accuracies = [r['metrics']['accuracy']*100 for r in scored_results]
    maes = [r['metrics']['mae'] for r in scored_results]
    validity_rates = [r['metrics']['json_validity_rate'] for r in scored_results]

    bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

    # Plots 1-3: (values, y-axis label, title, y-limits or None, bar label format)
    bar_panels = [
        (accuracies, 'Accuracy (%)', 'Accuracy Comparison', [0, 100], '{:.1f}%'),
        (maes, 'Mean Absolute Error', 'MAE Comparison (Lower is Better)', None, '{:.3f}'),
        (validity_rates, 'JSON Validity Rate (%)', 'JSON Validity Rate', [0, 100], '{:.1f}%'),
    ]

    for column, (values, y_label, title, y_limits, label_format) in enumerate(bar_panels):
        ax = axes[0, column]
        bars = ax.bar(approach_names, values, color=bar_colors)
        ax.set_ylabel(y_label, fontweight='bold')
        ax.set_title(title, fontweight='bold')
        if y_limits is not None:
            ax.set_ylim(y_limits)
        # Annotate each bar with its value just above the top edge
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    label_format.format(height), ha='center', va='bottom', fontweight='bold')
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

    # Plot 4-6: Confusion Matrices (column position follows the order of `results`)
    star_labels = [1, 2, 3, 4, 5]
    for idx, result in enumerate(results):
        if not result['metrics']:
            continue

        ax = axes[1, idx]
        all_preds = result['predictions']
        valid_preds = all_preds[all_preds['valid_json'] == True]

        if len(valid_preds) == 0:
            continue

        cm = confusion_matrix(valid_preds['actual_stars'], valid_preds['predicted_stars'],
                              labels=star_labels)

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                    xticklabels=star_labels, yticklabels=star_labels)
        ax.set_xlabel('Predicted Stars', fontweight='bold')
        ax.set_ylabel('Actual Stars', fontweight='bold')
        ax.set_title(f'{result["approach_name"]} - Confusion Matrix', fontweight='bold')

    plt.tight_layout()
    plt.savefig('results/comparison_visualizations_openrouter.png', dpi=300, bbox_inches='tight')
    print("üíæ Saved visualizations to results/comparison_visualizations_openrouter.png")
    plt.show()
else:
    print("‚ö†Ô∏è No valid results to visualize")

print("\n‚úÖ Analysis complete! Check the results/ folder for all outputs.")
print(f"\nüí° Tip: Try experimenting with different models by changing SELECTED_MODEL at the top!")
print(f"   Current model: {SELECTED_MODEL}")