In [2]:
import pandas as pd

# Read the cleaned review export, then show its (rows, columns) shape and the
# first rows as a quick sanity check.
df = pd.read_csv("../data/play_reviews_clean.csv")
print(df.shape, df.head(), sep="\n")

(1200, 12)
                              review_id  \
0  28f229b5-0026-41b9-a1eb-b76e74736f63   
1  68d8daea-db47-4e23-a692-755173dea983   
2  ee0dbb0e-4eb0-47b5-9874-c37877493f99   
3  5112423d-e618-44ba-ba49-62677cb76cd6   
4  bcb34681-1dd4-4781-b400-4393bb10b1d9   

                                              review  rating        date bank  \
0                             Make it user friendly.       2  2025-11-29  CBE   
1                   maaliif daddafee install gaafata       3  2025-11-28  CBE   
2                                           good app       5  2025-11-28  CBE   
3  This application is very important and advanta...       5  2025-11-27  CBE   
4                          why didn't work this app?       1  2025-11-27  CBE   

        source                                      content_clean lang  \
0  google_play                             Make it user friendly.   en   
1  google_play                   maaliif daddafee install gaafata   so   
2  google_play       

In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# One analyzer instance, created once and reused by get_sentiment for every review.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Score one review with the shared VADER analyzer and label the result.

    Parameters
    ----------
    text : str
        Raw review text. A value that is not a string (for example the NaN
        pandas uses for an empty cell) is treated as carrying no sentiment
        and is never handed to the analyzer, which expects a string.

    Returns
    -------
    pandas.Series
        Two positional elements: the label ('positive', 'negative' or
        'neutral') followed by the compound score as a float.

    Notes
    -----
    NOTE(review): the input data has a `lang` column with non-English values;
    every review is scored the same way regardless of language -- confirm
    that this is intended.
    """
    if not isinstance(text, str):
        # Missing review: nothing to score, report a neutral 0.0.
        return pd.Series(['neutral', 0.0])

    score = analyzer.polarity_scores(text)['compound']
    # Compound scores within (-0.05, 0.05) are considered neutral.
    if score >= 0.05:
        label = 'positive'
    elif score <= -0.05:
        label = 'negative'
    else:
        label = 'neutral'
    return pd.Series([label, score])

# get_sentiment returns a two-element Series (label, score) for each review, so
# the applied result has two columns; they are stored as the two new columns.
df[['sentiment_label', 'sentiment_score']] = df['review'].apply(get_sentiment)


In [4]:
# Mean compound sentiment score for every (bank, star rating) pair, with the
# grouping keys kept as ordinary columns rather than as the index.
agg = (
    df.groupby(['bank', 'rating'], as_index=False)['sentiment_score']
      .mean()
)
print(agg)


      bank  rating  sentiment_score
0      BOA       1        -0.174759
1      BOA       2         0.107793
2      BOA       3         0.187350
3      BOA       4         0.351683
4      BOA       5         0.344843
5      CBE       1        -0.087782
6      CBE       2         0.076877
7      CBE       3         0.150213
8      CBE       4         0.304397
9      CBE       5         0.367248
10  Dashen       1        -0.155278
11  Dashen       2        -0.057728
12  Dashen       3         0.269823
13  Dashen       4         0.366700
14  Dashen       5         0.464930


In [5]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the small English pipeline with the parser and NER components disabled;
# preprocess below only reads token.lemma_, token.is_alpha and token.is_stop.
# NOTE(review): the data's `lang` column contains non-English values (e.g. 'so'),
# yet this English model is run on every review -- confirm that is intended.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess(text):
    """Normalise one review into a space-separated string of lemmas.

    The text is lower-cased, run through the module-level `nlp` pipeline, and
    reduced to the lemmas of tokens that are purely alphabetic and are not
    stop words.

    Parameters
    ----------
    text : str
        Raw review text. A value that is not a string (for example the NaN
        pandas uses for an empty cell) yields an empty string instead of
        failing on `.lower()`.

    Returns
    -------
    str
        The kept lemmas joined by single spaces; "" when nothing is kept.
    """
    if not isinstance(text, str):
        # Missing review: return an empty document so downstream
        # vectorisation still receives a string.
        return ""

    doc = nlp(text.lower())
    lemmas = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return " ".join(lemmas)

# Lemmatised, stop-word-free version of each review, used for keyword extraction.
df['review_clean'] = df['review'].map(preprocess)


In [6]:
# Fit TF-IDF on the preprocessed reviews with a vocabulary capped at 50 terms,
# counting both single words and two-word phrases.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50)
X = vectorizer.fit_transform(df['review_clean'])

# Terms that made it into the capped vocabulary, as reported by the vectorizer.
keywords = vectorizer.get_feature_names_out()
print("Top keywords:", keywords)


Top keywords: ['account' 'amazing' 'app' 'application' 'bad' 'bad app' 'bank' 'banking'
 'banking app' 'boa' 'cbe' 'dashen' 'dashen bank' 'easy' 'ethiopia'
 'excellent' 'experience' 'fast' 'feature' 'fix' 'good' 'good app' 'great'
 'issue' 'like' 'love' 'mobile' 'mobile banking' 'money' 'need' 'nice'
 'open' 'option' 'phone' 'problem' 'service' 'slow' 'super' 'super app'
 'thank' 'time' 'transaction' 'transfer' 'try' 'update' 'use' 'user'
 'well' 'work' 'wow']


In [7]:
# Hand-picked mapping from theme name to the keywords that signal it.
# assign_themes reports matched themes in this insertion order.
themes = {
    "Account & Access Issues": [
        "account", "open", "option",
    ],
    "Transactions & Performance": [
        "transaction", "transfer", "slow", "problem", "time",
    ],
    "UI/UX & Features": [
        "app", "application", "mobile banking", "easy", "good app",
        "nice", "experience", "use", "user", "feature",
    ],
    "Customer Support": [
        "service", "fix", "thank",
    ],
    "Overall Feedback": [
        "good", "great", "excellent", "amazing", "love",
        "wow", "bad", "bad app", "issue",
    ],
}


In [11]:
def assign_themes(text, theme_keywords=None):
    """Return the themes whose keywords occur in a review.

    Matching is case-insensitive plain substring containment, so a keyword
    also matches when it appears inside a longer word (for example "app"
    inside "happy").

    Parameters
    ----------
    text : str
        Raw review text. A value that is not a string (for example the NaN
        pandas uses for an empty cell) is reported as ["Other"] instead of
        failing on `.lower()`.
    theme_keywords : dict[str, list[str]], optional
        Mapping of theme name to its keywords. Defaults to the module-level
        `themes` dict, looked up at call time.

    Returns
    -------
    list[str]
        Matching theme names in the mapping's iteration order, or ["Other"]
        when no keyword matches.
    """
    if not isinstance(text, str):
        return ["Other"]
    if theme_keywords is None:
        theme_keywords = themes

    lowered = text.lower()  # lower-case once instead of once per keyword
    matched = [
        theme
        for theme, kws in theme_keywords.items()
        if any(kw in lowered for kw in kws)
    ]
    return matched if matched else ["Other"]

# Tag every raw review with the list of themes it matches.
df['themes'] = df['review'].map(assign_themes)


In [10]:
import os

# Make sure the output folder used by the CSV export exists (no error if it already does).
# NOTE(review): this is `data/` relative to the working directory, whereas the
# input was read from `../data/` -- confirm the two locations are meant to differ.
os.makedirs("data", exist_ok=True)  # creates 'data' folder if it doesn't exist



In [12]:
# Write the per-review sentiment and theme results, without the index column.
# Only the listed columns are exported, in this order.
# NOTE(review): `themes` holds Python lists, which CSV stores as their text
# representation -- confirm downstream readers expect that.
df.to_csv(
    "data/play_reviews_sentiment_themes.csv",
    columns=['review_id', 'review', 'sentiment_label', 'sentiment_score', 'themes', 'bank'],
    index=False,
)
