In [5]:
"""
Step 3: Sentiment Analysis Script
--------------------------------
- Loads topic-annotated dataset from Step 2
- Uses Hugging Face sentiment analysis pipeline
- Saves results with an extra `sentiment` column

Usage:
    python scripts/sentiment_analysis.py
"""

import os
import pandas as pd
from transformers import pipeline

# Paths (relative to the current working directory)
# Input CSV read by main(); per the module docstring, the topic-annotated dataset from Step 2.
TOPIC_FILE = "data/sample/reviews_with_topics.csv"
# Output CSV written by main(): the input rows plus a `sentiment` column.
SENTIMENT_OUTPUT_FILE = "data/sample/reviews_with_sentiment.csv"


def analyze_sentiment(texts, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
    """
    Run sentiment analysis on texts using a Hugging Face model.

    Args:
        texts: A single string or an iterable of strings to classify.
        model_name: Model identifier handed to the Hugging Face
            "sentiment-analysis" pipeline.

    Returns:
        A list with one label per input text, in input order. An empty
        input yields an empty list without loading the model.

    Note:
        The label vocabulary is defined by the chosen model. The default
        SST-2 checkpoint is expected to emit only 'POSITIVE' / 'NEGATIVE'
        (no 'NEUTRAL') -- confirm against the model card before relying
        on a specific label set.
    """
    # Normalise to a concrete list so emptiness can be checked and so that
    # generators / tuples are accepted. A lone string is wrapped rather than
    # split into characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to classify: avoid constructing the pipeline, which loads
    # (and possibly downloads) the model.
    if not texts:
        return []

    classifier = pipeline("sentiment-analysis", model=model_name)
    # truncation=True keeps over-long inputs within the model's input limit.
    results = classifier(texts, truncation=True, batch_size=32)

    return [r["label"] for r in results]


def main():
    """
    Add a `sentiment` column to the topic-annotated reviews and save the result.

    Reads TOPIC_FILE, classifies every non-missing `review_text` value with
    analyze_sentiment(), and writes the augmented table to
    SENTIMENT_OUTPUT_FILE. Rows whose `review_text` is missing keep a missing
    `sentiment` instead of being classified as the literal text "nan".

    Raises:
        FileNotFoundError: If TOPIC_FILE does not exist.
        KeyError: If the loaded table has no `review_text` column.
    """
    # Load dataset with topics
    if not os.path.exists(TOPIC_FILE):
        raise FileNotFoundError(f"❌ Missing file: {TOPIC_FILE}. Run topic_modeling.py first.")

    df = pd.read_csv(TOPIC_FILE)
    print(f"✅ Loaded {len(df)} reviews with topics for sentiment analysis")

    # Fail early with a descriptive message instead of a bare KeyError later.
    if "review_text" not in df.columns:
        raise KeyError(f"Column 'review_text' not found in {TOPIC_FILE}")

    # Run sentiment analysis
    print("🔄 Running sentiment analysis...")
    # Drop missing reviews first: astype(str) would otherwise turn NaN into
    # the string "nan" and the model would score it like a real review.
    review_texts = df["review_text"].dropna().astype(str)
    labels = pd.Series(
        analyze_sentiment(review_texts.tolist()),
        index=review_texts.index,
        dtype="object",
    )
    # Index alignment leaves rows without text with a missing sentiment.
    df["sentiment"] = labels.reindex(df.index)

    # Save augmented dataset
    output_dir = os.path.dirname(SENTIMENT_OUTPUT_FILE)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(SENTIMENT_OUTPUT_FILE, index=False)
    print(f"💾 Saved sentiment results → {SENTIMENT_OUTPUT_FILE}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


✅ Loaded 5000 reviews with topics for sentiment analysis
🔄 Running sentiment analysis...


Device set to use mps:0


💾 Saved sentiment results → data/sample/reviews_with_sentiment.csv
