In [1]:
import pandas as pd
import json
import openai
from openai import OpenAI
import time
import os
from typing import Dict, List, Any

import constants # This is a Python file constants.py and contains the OpenAI API key



In [8]:
# Shared OpenAI client used by every request below; the API key is read from
# the local constants module rather than being hard-coded in this notebook.
client = OpenAI(api_key=constants.OPENAI_API_KEY)

In [12]:


def _fallback_analysis() -> Dict[str, Any]:
    """
    Return a fresh neutral placeholder analysis, used when the API call or JSON parsing fails.
    """
    return {
        "sentiment": "neutral",
        "emotions": {"happiness": 0, "sadness": 0, "anger": 0, "fear": 0, "surprise": 0, "disgust": 0, "neutral": 3},
        "aspects": [],
        "product": "N/A",
        "brand": "N/A"
    }


def _clean_review_field(value: Any) -> str:
    """
    Convert a review field to text, mapping None and NaN to an empty string.
    """
    # pandas represents empty CSV cells as float NaN; without this check the
    # prompt would contain the literal text "nan". NaN is the only float that
    # is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _strip_code_fence(raw: str) -> str:
    """
    Remove a surrounding Markdown code fence (``` or ```json) from a model reply.
    """
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        # Drop the whole opening fence line, which may carry a language tag.
        first_newline = cleaned.find("\n")
        cleaned = cleaned[first_newline + 1:] if first_newline != -1 else cleaned[3:]
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def analyze_review(title: str, text: str) -> Dict[str, Any]:
    """
    Analyze a single review using OpenAI API for sentiment, emotions, aspects, and product identification.

    Args:
        title: Review title. None/NaN is treated as an empty string.
        text: Review body. None/NaN is treated as an empty string.

    Returns:
        A dict with the keys "sentiment", "emotions", "aspects", "product"
        and "brand". If the request fails, the reply is not valid JSON, or the
        reply is not a JSON object, a neutral placeholder is returned instead
        of raising. Top-level keys missing from the reply are filled in from
        that placeholder.
    """
    title = _clean_review_field(title)
    text = _clean_review_field(text)

    prompt = f"""
    Analyze the following Amazon review and provide a comprehensive analysis in JSON format.

    Review Title: {title}
    Review Text: {text}

    Please provide the analysis in the following JSON structure:
    {{
        "sentiment": "positive/negative/neutral",
        "emotions": {{
            "happiness": 0-5,
            "sadness": 0-5,
            "anger": 0-5,
            "fear": 0-5,
            "surprise": 0-5,
            "disgust": 0-5,
            "neutral": 0-5
        }},
        "aspects": [
            {{
                "aspect": "aspect name",
                "sentiment": "positive/negative/neutral"
            }},
            {{
                "aspect": "aspect name", 
                "sentiment": "positive/negative/neutral"
            }},
            {{
                "aspect": "aspect name",
                "sentiment": "positive/negative/neutral"
            }}
        ],
        "product": "product type or N/A",
        "brand": "brand name or N/A"
    }}

    Instructions:
    1. Sentiment: Classify overall sentiment as positive, negative, or neutral
    2. Emotions: Rate each emotion on scale 0-5 (0=absent, 1=somewhat, 5=very much)
    3. Aspects: Identify up to 3 key aspects mentioned (e.g., "battery life", "screen quality", "price") and their sentiment
    4. Product: Identify the product category (e.g., "laptop", "smartphone", "headphones") or "N/A"
    5. Brand: Identify the brand name or "N/A"
    
    Return only valid JSON without any additional text or formatting.
    """

    try:
        response = client.responses.create(
            model="gpt-4o-mini",  # Using gpt-4o-mini for cost efficiency
            input=[
                {"role": "system", "content": "You are an expert at analyzing product reviews. Always respond with valid JSON only."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_output_tokens=500
        )

        # The model may wrap its answer in a Markdown fence despite the
        # instructions, so unwrap it before parsing.
        parsed = json.loads(_strip_code_fence(response.output_text))

    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return _fallback_analysis()
    except Exception as e:
        # Deliberately broad: one failed request must not abort a whole batch.
        print(f"API call error: {e}")
        return _fallback_analysis()

    # Valid JSON is not necessarily an object (it could be a list or a string).
    if not isinstance(parsed, dict):
        print(f"JSON parsing error: expected a JSON object, got {type(parsed).__name__}")
        return _fallback_analysis()

    # Guarantee the top-level keys callers index into.
    for key, default_value in _fallback_analysis().items():
        parsed.setdefault(key, default_value)
    return parsed

# Output column name -> key inside the analysis "emotions" dict.
_EMOTION_COLUMNS = (
    ("happiness", "happiness"),
    ("sadness", "sadness"),
    ("anger", "anger"),
    ("fear", "fear"),
    ("surprise", "surprise"),
    ("disgust", "disgust"),
    ("neutral_emotion", "neutral"),
)

# Number of aspect / aspect-sentiment column pairs written per review.
_MAX_ASPECTS = 3


def _flatten_analysis(analysis: Any) -> Dict[str, Any]:
    """
    Flatten one analysis dict into a single flat row of output columns.

    Missing or malformed parts are replaced by defaults ("neutral" sentiment,
    0 for emotions, "N/A" for product, brand and aspects) instead of raising.
    """
    if not isinstance(analysis, dict):
        analysis = {}
    emotions = analysis.get("emotions")
    if not isinstance(emotions, dict):
        emotions = {}
    aspects = analysis.get("aspects")
    if not isinstance(aspects, list):
        aspects = []

    # Insertion order here defines the column order of the output file.
    row = {"sentiment": analysis.get("sentiment", "neutral")}
    for column, key in _EMOTION_COLUMNS:
        row[column] = emotions.get(key, 0)
    row["product"] = analysis.get("product", "N/A")
    row["brand"] = analysis.get("brand", "N/A")

    for slot in range(_MAX_ASPECTS):
        entry = aspects[slot] if slot < len(aspects) else None
        if not isinstance(entry, dict):
            entry = {}
        row[f"aspect_{slot + 1}"] = entry.get("aspect", "N/A")
        row[f"aspect_{slot + 1}_sentiment"] = entry.get("sentiment", "N/A")
    return row


def process_reviews_file(input_file: str, output_file: str, delay_seconds: float = 0.5) -> pd.DataFrame:
    """
    Process all reviews in the CSV file and save the enhanced results.

    Args:
        input_file: Path of a CSV file with at least 'title' and 'text' columns.
        output_file: Path the combined CSV (original columns plus analysis
            columns) is written to.
        delay_seconds: Pause after each review to avoid rate limiting.
            A value of 0 or less disables the pause.

    Returns:
        The combined DataFrame that was written to output_file.

    Raises:
        ValueError: If the input lacks a 'title' or 'text' column.
    """

    # Read the original CSV file
    print("Loading reviews from CSV file...")
    df_original = pd.read_csv(input_file)

    # Validate required columns
    if 'title' not in df_original.columns or 'text' not in df_original.columns:
        raise ValueError("CSV file must contain 'title' and 'text' columns")

    total = len(df_original)
    print(f"Found {total} reviews to process...")

    # Analyze each review. The counter is positional, so the progress message
    # does not depend on the DataFrame's index labels.
    analysis_rows = []
    reviews = zip(df_original['title'], df_original['text'])
    for position, (title, text) in enumerate(reviews, start=1):
        print(f"Processing review {position}/{total}...")
        analysis_rows.append(_flatten_analysis(analyze_review(title, text)))

        # Add a small delay to avoid rate limiting
        if delay_seconds > 0:
            time.sleep(delay_seconds)

    # Convert analysis results to DataFrame
    print("Converting analysis results to DataFrame...")
    # Passing the columns explicitly keeps every analysis column present even
    # when the input has no rows, so the summary below cannot hit a KeyError.
    analysis_columns = list(_flatten_analysis({}))
    df_analysis = pd.DataFrame(analysis_rows, columns=analysis_columns)

    # Combine original DataFrame with analysis results
    print("Combining original data with analysis results...")
    df_combined = pd.concat([df_original, df_analysis], axis=1)

    # Save to new CSV file
    print(f"Saving results to {output_file}...")
    df_combined.to_csv(output_file, index=False)

    print("Analysis complete!")
    print(f"Results saved to: {output_file}")
    print(f"Total reviews processed: {len(df_combined)}")

    # Display summary statistics
    print("\n--- Analysis Summary ---")
    print("Sentiment distribution:")
    print(df_combined['sentiment'].value_counts())
    print("\nTop products identified:")
    print(df_combined['product'].value_counts().head())
    print("\nTop brands identified:")
    print(df_combined['brand'].value_counts().head())

    return df_combined

def main():
    """
    Script entry point: analyze the bundled 100-review CSV and print a preview.

    Any failure is reported on stdout instead of being raised; a missing
    input file gets its own, more specific message.
    """
    # Input and output locations, relative to the working directory
    source_csv = "Data/reviews_100.csv"
    target_csv = "Data/reviews_100_analyzed.csv"

    try:
        enriched = process_reviews_file(source_csv, target_csv)

        # Show the head of the enriched table as a quick sanity check
        print("\n--- Sample of Enhanced Data ---")
        print(enriched.head())
    except FileNotFoundError:
        print(
            f"Error: Could not find {source_csv}\n"
            "Please make sure the file exists in the current directory"
        )
    except Exception as exc:
        print(f"Error: {exc}")


if __name__ == "__main__":
    main()

Loading reviews from CSV file...
Found 100 reviews to process...
Processing review 1/100...
Processing review 2/100...
Processing review 3/100...
Processing review 4/100...
Processing review 5/100...
Processing review 6/100...
Processing review 7/100...
Processing review 8/100...
Processing review 9/100...
Processing review 10/100...
Processing review 11/100...
Processing review 12/100...
Processing review 13/100...
Processing review 14/100...
Processing review 15/100...
Processing review 16/100...
Processing review 17/100...
Processing review 18/100...
Processing review 19/100...
Processing review 20/100...
Processing review 21/100...
Processing review 22/100...
Processing review 23/100...
Processing review 24/100...
Processing review 25/100...
Processing review 26/100...
Processing review 27/100...
Processing review 28/100...
Processing review 29/100...
Processing review 30/100...
Processing review 31/100...
Processing review 32/100...
Processing review 33/100...
Processing review 34