In [1]:
import os
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
# from Scripts.config import DATA_PATHS

Sentiment Analysis

In [11]:
# Input and output CSV locations. Both are relative paths that go one level up
# from the directory the notebook runs in.
# The two commented assignments below are an alternative that builds the same
# settings from Scripts.config.DATA_PATHS; they are disabled together with the
# commented-out import of DATA_PATHS in the imports cell.
# RAW_IN = f"../{DATA_PATHS['processed']}"
# OUT_CSV = f"../{DATA_PATHS['final_result']}"

# Default input of run_analysis: the processed reviews CSV.
RAW_IN = "../data/processed/processed_reviews.csv"
# Default output of run_analysis: the reviews with sentiment and theme columns.
OUT_CSV = "../data/processed/reviews_with_sentiment_themes.csv"

In [3]:
# Register tqdm's pandas integration. run_analysis calls
# Series.progress_apply, which only exists after this line has run.
tqdm.pandas()

In [None]:
# NOTE(review): this import sits outside the notebook's import cell, and the
# cell carries no execution count, so it may never have been run.
from transformers import pipeline

# Load the pre-trained sentiment analysis model
# NOTE(review): `sentiment_pipeline` is not referenced by any other cell shown
# in this notebook; the sentiment scoring cells use the VADER `analyzer`.
# Confirm whether this cell is still needed before keeping it.
sentiment_pipeline = pipeline(
    "sentiment-analysis", 
    model="distilbert-base-uncased-finetuned-sst-2-english"
)


In [4]:
# Single VADER analyzer instance, read as a module-level name by
# score_sentiment on every call.
analyzer = SentimentIntensityAnalyzer()

In [5]:
def score_sentiment(text: str):
    """
    Score a single review with the module-level VADER ``analyzer``.

    The input is coerced with ``str()`` before scoring. The compound score is
    bucketed into a label:

    * ``>= 0.05``  -> "positive"
    * ``<= -0.05`` -> "negative"
    * otherwise    -> "neutral"

    Returns a ``pd.Series`` indexed by "sentiment_compound" and
    "sentiment_label", so that applying this function over a Series of reviews
    yields one column per key.
    """
    compound = analyzer.polarity_scores(str(text))["compound"]

    sentiment = (
        "positive" if compound >= 0.05
        else "negative" if compound <= -0.05
        else "neutral"
    )

    return pd.Series(
        [compound, sentiment],
        index=["sentiment_compound", "sentiment_label"],
    )


Thematic Analysis

In [6]:
def extract_keywords(docs, top_n=10):
    """
    Return the terms with the highest mean TF-IDF weight across ``docs``.

    A TF-IDF model over unigrams and bigrams (English stop words removed,
    vocabulary capped at 5000 features) is fitted on ``docs``. Each term's
    weight is averaged over all documents, and the ``top_n`` terms with the
    largest averages are returned in descending order of weight.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words='english',
        max_features=5000,
    )
    tfidf_matrix = vectorizer.fit_transform(docs)
    vocabulary = vectorizer.get_feature_names_out()

    # The column-wise mean of the sparse matrix is a 1 x n_features matrix;
    # flatten it to a plain 1-D array before ranking.
    mean_weights = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
    ranked_desc = np.argsort(mean_weights)[::-1]

    return [vocabulary[term_idx] for term_idx in ranked_desc[:top_n]]

In [7]:
def cluster_themes(docs, n_clusters=4):
    """
    Cluster reviews using TF-IDF + KMeans.

    Parameters
    ----------
    docs : iterable of str
        Review texts to cluster.
    n_clusters : int, default 4
        Requested number of clusters. If there are fewer documents than
        requested clusters, the number of clusters is reduced to the number
        of documents, because KMeans cannot fit more clusters than samples.

    Returns
    -------
    labels : array
        Cluster index assigned to each document, in input order.
    cluster_keywords : dict
        Maps each fitted cluster index to the (up to) ten terms with the
        largest centroid weights, in descending order of weight.

    Notes
    -----
    Errors raised by TfidfVectorizer (for example when no vocabulary can be
    built from ``docs``) are not caught here and propagate to the caller.
    """
    vect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=5000)
    X = vect.fit_transform(docs)

    # KMeans raises ValueError when n_samples < n_clusters, which happens for
    # small groups of reviews. Cap the cluster count at the document count.
    n_clusters = min(n_clusters, X.shape[0])

    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = model.fit_predict(X)

    terms = vect.get_feature_names_out()
    # For every centroid, feature indices sorted from largest to smallest weight.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    cluster_keywords = {
        cid: [terms[ind] for ind in order_centroids[cid, :10]]
        for cid in range(n_clusters)
    }
    return labels, cluster_keywords

Human-Readable Theme Naming

In [15]:
def assign_theme_names(cluster_keywords):
    """
    Translate each cluster's top keywords into a human-readable theme name.

    ``cluster_keywords`` maps a cluster id to a list of keyword strings.
    Keywords are lower-cased and compared as whole strings against the trigger
    words of each rule; a multi-word keyword matches only if it equals a
    trigger exactly. Rules are tried in the order listed and the first rule
    with a matching trigger wins. A cluster that matches no rule is named
    "Other".

    Returns a dict mapping every cluster id to its theme name.
    """
    # Ordered (theme, trigger words) rules; earlier entries take precedence.
    rules = (
        ("Account Access Issues", ("login", "password", "access")),
        ("Transaction Performance", ("slow", "loading", "lag", "speed", "transfer")),
        ("User Interface & Experience", ("ui", "design", "interface", "screen")),
        ("Customer Support", ("support", "help", "customer")),
        ("Feature Requests", ("feature", "request", "fingerprint")),
    )

    names = {}
    for cluster_id, top_terms in cluster_keywords.items():
        lowered = {term.lower() for term in top_terms}
        names[cluster_id] = next(
            (theme for theme, triggers in rules if not lowered.isdisjoint(triggers)),
            "Other",
        )
    return names

In [16]:
# Main analysis entry point (this cell only defines it; a later cell calls it)
def run_analysis(input_csv=RAW_IN, output_csv=OUT_CSV, n_clusters=4):
    """
    Annotate reviews with sentiment and theme labels and save the result.

    Steps:
      1. Read ``input_csv`` into a DataFrame. It must contain the columns
         "review_text" and "bank".
      2. Score every review with ``score_sentiment``, adding the columns
         "sentiment_compound" and "sentiment_label".
      3. For each bank separately, cluster its reviews with ``cluster_themes``
         and name the clusters with ``assign_theme_names``, writing the result
         to the "theme_label" column.
      4. Write the annotated frame to ``output_csv`` (without the index),
         creating the parent directory if needed.

    Parameters
    ----------
    input_csv : str
        Path of the CSV to read.
    output_csv : str
        Path of the CSV to write.
    n_clusters : int, default 4
        Number of clusters requested per bank.

    Returns
    -------
    pandas.DataFrame
        The annotated reviews.
    """
    df = pd.read_csv(input_csv)

    # --- Sentiment ---
    # progress_apply is only available once tqdm.pandas() has been called.
    df[["sentiment_compound", "sentiment_label"]] = df["review_text"].progress_apply(score_sentiment)

    # --- Themes ---
    for bank in df['bank'].unique():
        bank_mask = df['bank'] == bank
        docs = df.loc[bank_mask, "review_text"].astype(str).tolist()
        # A missing bank value compares unequal to everything, including
        # itself, so its mask selects no rows; skip it instead of clustering
        # an empty list.
        if len(docs) == 0:
            continue

        labels, cluster_keywords = cluster_themes(docs, n_clusters=n_clusters)
        cluster_map = assign_theme_names(cluster_keywords)
        df.loc[bank_mask, "theme_label"] = [cluster_map[lbl] for lbl in labels]

    # --- Save ---
    # os.path.dirname returns "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_csv, index=False)
    print(f"Saved annotated reviews with sentiment & themes -> {output_csv}")
    return df

In [20]:
# Execute the pipeline with the default input/output paths and cluster count,
# then show the annotated frame as this cell's output.
df_final = run_analysis()
df_final

100%|██████████| 968/968 [00:00<00:00, 6670.37it/s]


Saved annotated reviews with sentiment & themes -> ../data/processed/reviews_with_sentiment_themes.csv


Unnamed: 0,review_text,rating,date,bank,source,sentiment_compound,sentiment_label,theme_label
0,CBE ይለያል።,5,2025-11-29,CBE,google_play,0.0000,neutral,Other
1,it's special for me,5,2025-11-29,CBE,google_play,0.4019,positive,Other
2,Make it user friendly.,2,2025-11-29,CBE,google_play,0.4939,positive,Other
3,maaliif daddafee install gaafata,3,2025-11-28,CBE,google_play,0.0000,neutral,Other
4,good app,5,2025-11-28,CBE,google_play,0.4404,positive,Other
...,...,...,...,...,...,...,...,...
963,"To be honest, best banking and lifestyle app i...",5,2025-05-12,DASHEN,google_play,0.8176,positive,Other
964,"A must have, seamless, all in one digital plat...",5,2025-05-12,DASHEN,google_play,0.0000,neutral,Transaction Performance
965,Amazing app super easy to use and best design....,5,2025-05-09,DASHEN,google_play,0.9623,positive,Other
966,its the best ever,5,2025-05-09,DASHEN,google_play,0.6369,positive,Transaction Performance


In [25]:
# Tally the reviews per sentiment label in a single pass over the column.
# Labels that never occur are reported as 0.
# NOTE(review): "postivie_count" is a misspelling of "positive_count"; the name
# is left unchanged here in case other cells reference it.
label_totals = (
    df_final["sentiment_label"]
    .value_counts()
    .reindex(["positive", "neutral", "negative"], fill_value=0)
)
postivie_count = label_totals["positive"]
neutral_count = label_totals["neutral"]
negative_count = label_totals["negative"]

print("Positive Count:", postivie_count)
print("Neutral count:", neutral_count)
print("Negative count:", negative_count)


Positive Count: 471
Neutral count: 339
Negative count: 158
