In [16]:
import os
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# from Scripts.config import DATA_PATHS

Sentiment Analysis

In [9]:
# File locations used by this notebook, hardcoded as relative paths.
# presumably RAW_IN is the CSV that gets read and OUT_CSV the CSV that gets
# written; the read/write calls are not in this section, so confirm.
# The two commented-out assignments are an alternative that derives the same
# names from DATA_PATHS (whose import from Scripts.config is also commented out).
# RAW_IN = f"../{DATA_PATHS['processed']}"
# OUT_CSV = f"../{DATA_PATHS['final_result']}"

RAW_IN = "data/processed/processed_reviews.csv"
OUT_CSV = "data/processed/reviews_with_sentiment_themes.csv"

In [10]:
# Register tqdm with pandas so the progress_* variants (e.g. progress_apply)
# become available on Series/DataFrame objects.
tqdm.pandas()

In [None]:
# NOTE(review): this import lives mid-notebook rather than with the other
# imports, and the cell has no execution count, so it may be an optional or
# unfinished experiment. Confirm before relying on it.
from transformers import pipeline

# Load the pre-trained sentiment analysis model
# (a Hugging Face pipeline; the checkpoint is fetched by name on first use).
# NOTE(review): sentiment_pipeline is not referenced by the functions defined
# in this section, which score text with the VADER analyzer instead.
sentiment_pipeline = pipeline(
    "sentiment-analysis", 
    model="distilbert-base-uncased-finetuned-sst-2-english"
)


In [11]:
# Module-level VADER analyzer, created once and reused; score_sentiment()
# reads this name as a global, so this cell must run before that function is called.
analyzer = SentimentIntensityAnalyzer()

In [12]:
def score_sentiment(text: str):
    """
    Score one piece of text with the module-level ``analyzer``.

    The input is passed through ``str`` first, so non-string values are
    scored by their string representation.

    Returns a ``pd.Series`` with two entries:
      * ``sentiment_compound`` - the analyzer's ``compound`` score
      * ``sentiment_label``    - ``"positive"`` when compound >= 0.05,
        ``"negative"`` when compound <= -0.05, otherwise ``"neutral"``
    """
    compound = analyzer.polarity_scores(str(text))["compound"]

    # Same cut-offs on both sides of zero; anything strictly between them is neutral.
    label = (
        "positive" if compound >= 0.05
        else "negative" if compound <= -0.05
        else "neutral"
    )

    return pd.Series({"sentiment_compound": compound, "sentiment_label": label})


Thematic Analysis

In [None]:
def extract_keywords(docs, top_n=10):
    """
    Extract top TF-IDF keywords from a list of documents.

    Parameters
    ----------
    docs : iterable
        Documents to analyse. Missing entries (``None`` / ``NaN`` / ``pd.NA``)
        are skipped; every remaining entry is converted with ``str``.
    top_n : int, default 10
        Maximum number of keywords to return.

    Returns
    -------
    list
        Unigrams and bigrams ordered by descending mean TF-IDF weight over
        the kept documents. An empty list is returned when ``docs`` is empty
        or contains only missing values.

    Raises
    ------
    ValueError
        Propagated from ``TfidfVectorizer`` when the kept documents produce
        no vocabulary (for example, when they contain only stop words).
    """
    # The vectorizer rejects NaN/None documents, so drop them up front.
    # Dropping (rather than blanking) is safe here because no per-document
    # output is returned, and it does not change the ranking of the means.
    texts = [str(doc) for doc in docs if not pd.isna(doc)]
    if not texts:
        return []

    vect = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=5000)
    X = vect.fit_transform(texts)
    features = vect.get_feature_names_out()

    # X.mean(axis=0) on a sparse matrix yields a 1 x n_features matrix;
    # flatten it to a plain 1-D array before ranking.
    avg_tfidf = np.asarray(X.mean(axis=0)).ravel()
    top_idx = np.argsort(avg_tfidf)[::-1][:top_n]
    return [features[i] for i in top_idx]

In [None]:
def cluster_themes(docs, n_clusters=4, top_n=10):
    """
    Cluster reviews using TF-IDF + KMeans.

    Parameters
    ----------
    docs : iterable
        Documents to cluster. Missing entries (``None`` / ``NaN`` / ``pd.NA``)
        are treated as empty strings; every other entry is converted with
        ``str``. No entry is dropped, so the output labels line up with
        ``docs`` position by position.
    n_clusters : int, default 4
        Number of KMeans clusters.
    top_n : int, default 10
        Maximum number of keywords reported per cluster.

    Returns
    -------
    labels
        Cluster index for each document, as returned by ``KMeans.fit_predict``.
    cluster_keywords : dict
        Maps each cluster index in ``range(n_clusters)`` to its terms ordered
        by descending centroid weight, truncated to ``top_n``.
    """
    # Blank out missing values instead of dropping them: the vectorizer
    # rejects NaN/None, and dropping would misalign labels with the input.
    texts = ["" if pd.isna(doc) else str(doc) for doc in docs]

    vect = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=5000)
    X = vect.fit_transform(texts)
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = model.fit_predict(X)

    terms = vect.get_feature_names_out()
    # argsort is ascending; reverse each row so the heaviest terms come first.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    cluster_keywords = {
        i: [terms[ind] for ind in order_centroids[i, :top_n]]
        for i in range(n_clusters)
    }
    return labels, cluster_keywords