In [1]:
# Import modules from src
import sys
sys.path.append('../src')  # Add src to path

from scraper import scrape_reviews
from preprocessor import preprocess_reviews
import pandas as pd
from tqdm import tqdm

In [53]:
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
from preprocessor import get_sentiment_scores, label_sentiment

In [None]:
from analyzer import SentimentAnalyzer, ThemeAnalyzer
from visualizer import ReviewVisualizer
from database import ReviewDatabase

In [3]:
from initialize import setup_nltk

# Run this once to download necessary resources
setup_nltk()


[nltk_data] Downloading package punkt to /home/age/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/age/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/age/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/age/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/age/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
from preprocessor import preprocess_text

In [4]:
# 1. Data Collection
# Banks to scrape: short bank label -> app identifier string handed to scrape_reviews().
bank_apps = dict(
    CBE='com.combanketh.mobilebanking',
    BOA='com.boa.boaMobileBanking',
    Dashen='com.dashen.dashensuperapp',
)

In [5]:
# Fetch the reviews for every app listed in bank_apps into a single frame.
# NOTE(review): presumably a live scrape, so results can differ between runs — confirm,
# and prefer reloading the saved CSV when reproducing the analysis.
raw_reviews = scrape_reviews(bank_apps)


Scraping banks:   0%|          | 0/3 [00:00<?, ?it/s]

Scraping banks: 100%|██████████| 3/3 [00:04<00:00,  1.54s/it]


In [7]:
# Preview the first rows to confirm the scrape returned the expected columns.
raw_reviews.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,bank,language
0,42eed344-f5b7-49be-a7eb-b1c14386e20b,Tilahun Nedi,https://play-lh.googleusercontent.com/a/ACg8oc...,very niec,5.0,0.0,5.1.0,2025-06-04 11:21:53,,,5.1.0,CBE,en
1,fce00cda-d71a-486e-a4c2-7479ab7793bc,Abdurahiman Abubaker Hussein,https://play-lh.googleusercontent.com/a/ACg8oc...,best app of finance,5.0,0.0,,2025-06-04 07:38:54,,,,CBE,en
2,f543a13e-bc0f-4348-b57f-3a83eb67206c,Milky Yebassa,https://play-lh.googleusercontent.com/a/ACg8oc...,yetemeta,1.0,0.0,5.1.0,2025-06-03 21:04:44,,,5.1.0,CBE,en
3,2470d199-834a-4134-a0f9-8c684ba75491,ENGIDA KEBEDE,https://play-lh.googleusercontent.com/a-/ALV-U...,Engida Kebede Fetera,5.0,0.0,5.1.0,2025-06-03 20:20:23,,,5.1.0,CBE,en
4,4beee0dd-05ad-474c-bdff-0146723f2179,Abel Beyena,https://play-lh.googleusercontent.com/a/ACg8oc...,good,5.0,0.0,5.1.0,2025-06-03 11:32:52,,,5.1.0,CBE,en


In [8]:
# (rows, columns) of the raw scrape.
raw_reviews.shape

(1715, 13)

In [9]:
# Persist the raw scrape (index column omitted) so later cells can reload it from disk
# instead of scraping again.
raw_reviews.to_csv('../datasets/raw/reviews.csv', index=False)

In [3]:
# Reload the saved raw reviews, parsing the 'at' timestamp column as datetimes.
df = pd.read_csv('../datasets/raw/reviews.csv', parse_dates=['at'])

In [4]:
# Quick look at the first three reloaded rows.
df.head(3)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,bank,language
0,42eed344-f5b7-49be-a7eb-b1c14386e20b,Tilahun Nedi,https://play-lh.googleusercontent.com/a/ACg8oc...,very niec,5.0,0.0,5.1.0,2025-06-04 11:21:53,,,5.1.0,CBE,en
1,fce00cda-d71a-486e-a4c2-7479ab7793bc,Abdurahiman Abubaker Hussein,https://play-lh.googleusercontent.com/a/ACg8oc...,best app of finance,5.0,0.0,,2025-06-04 07:38:54,,,,CBE,en
2,f543a13e-bc0f-4348-b57f-3a83eb67206c,Milky Yebassa,https://play-lh.googleusercontent.com/a/ACg8oc...,yetemeta,1.0,0.0,5.1.0,2025-06-03 21:04:44,,,5.1.0,CBE,en


In [5]:
# preprocess_text(df['content'], language='en')

In [70]:
# 2. Data Preprocessing
# Clean the reloaded reviews with preprocess_reviews and save the result (index omitted).
# NOTE(review): the cleaned file is written under datasets/raw/ next to the raw scrape —
# confirm that location is intended.
cleaned_df = preprocess_reviews(df)
cleaned_df.to_csv('../datasets/raw/cleaned_reviews.csv', index=False)

Preprocessing text:   0%|          | 0/1437 [00:00<?, ?it/s]

Preprocessing text: 100%|██████████| 1437/1437 [00:01<00:00, 1286.40it/s]


In [7]:
# Inspect the preprocessed frame.
cleaned_df

Unnamed: 0,review,rating,date,bank,language,processed_review
0,very niec,5.0,2025-06-04,CBE,en,niec
1,best app of finance,5.0,2025-06-04,CBE,en,best app finance
2,yetemeta,1.0,2025-06-03,CBE,en,yetemeta
3,Engida Kebede Fetera,5.0,2025-06-03,CBE,en,engida kebede fetera
4,good,5.0,2025-06-03,CBE,en,good
...,...,...,...,...,...,...
1709,Faster and userfriendly,5.0,2025-01-14,Dashen,en,faster userfriendly
1711,"Waw Great and innovated,user friendly, always ...",5.0,2025-01-13,Dashen,en,waw great innovateduser friendly always one st...
1712,It's Best waww 🙏,5.0,2025-01-13,Dashen,en,best waww
1713,Always one step ahead,5.0,2025-01-13,Dashen,en,always one step ahead


In [71]:
# Score every review; the returned frame is rebound to cleaned_df.
# NOTE(review): presumably this is what adds the 'sentiment' column — verify in
# preprocessor.get_sentiment_scores.
cleaned_df = get_sentiment_scores(cleaned_df)

Calculating sentiment scores: 100%|██████████| 1437/1437 [00:00<00:00, 11376.37it/s]


In [9]:
# Inspect the frame after sentiment scoring.
cleaned_df

Unnamed: 0,review,rating,date,bank,language,processed_review,sentiment
0,very niec,5.0,2025-06-04,CBE,en,niec,0.0000
1,best app of finance,5.0,2025-06-04,CBE,en,best app finance,0.6369
2,yetemeta,1.0,2025-06-03,CBE,en,yetemeta,0.0000
3,Engida Kebede Fetera,5.0,2025-06-03,CBE,en,engida kebede fetera,0.0000
4,good,5.0,2025-06-03,CBE,en,good,0.4404
...,...,...,...,...,...,...,...
1709,Faster and userfriendly,5.0,2025-01-14,Dashen,en,faster userfriendly,0.0000
1711,"Waw Great and innovated,user friendly, always ...",5.0,2025-01-13,Dashen,en,waw great innovateduser friendly always one st...,0.8074
1712,It's Best waww 🙏,5.0,2025-01-13,Dashen,en,best waww,0.6369
1713,Always one step ahead,5.0,2025-01-13,Dashen,en,always one step ahead,0.0000


In [72]:
# Turn the numeric scores into categorical labels; the returned frame is rebound to cleaned_df.
# NOTE(review): presumably this adds the 'label' column; the cut-off values live in
# preprocessor.label_sentiment — confirm there.
cleaned_df = label_sentiment(cleaned_df)

In [73]:
# Inspect the frame after sentiment labelling.
cleaned_df

Unnamed: 0,review,rating,date,bank,language,processed_review,sentiment,label
0,very niec,5.0,2025-06-04,CBE,en,niec,0.0000,neutral
1,best app of finance,5.0,2025-06-04,CBE,en,best app finance,0.6369,positive
2,yetemeta,1.0,2025-06-03,CBE,en,yetemeta,0.0000,neutral
3,Engida Kebede Fetera,5.0,2025-06-03,CBE,en,engida kebede fetera,0.0000,neutral
4,good,5.0,2025-06-03,CBE,en,good,0.4404,positive
...,...,...,...,...,...,...,...,...
1709,Faster and userfriendly,5.0,2025-01-14,Dashen,en,faster userfriendly,0.0000,neutral
1711,"Waw Great and innovated,user friendly, always ...",5.0,2025-01-13,Dashen,en,waw great innovateduser friendly always one st...,0.8074,positive
1712,It's Best waww 🙏,5.0,2025-01-13,Dashen,en,best waww,0.6369,positive
1713,Always one step ahead,5.0,2025-01-13,Dashen,en,always one step ahead,0.0000,neutral


In [74]:
# Overall spread of the sentiment scores across all reviews.
sentiment_scores = cleaned_df['sentiment']
min_sentiment, average_sentiment, max_sentiment = (
    sentiment_scores.min(),
    sentiment_scores.mean(),
    sentiment_scores.max(),
)

print(f'Max Sentiment Score: {max_sentiment:.2f}')
print(f'Min Sentiment Score: {min_sentiment:.2f}')
print(f'Average Sentiment Score: {average_sentiment:.2f}')

Max Sentiment Score: 0.98
Min Sentiment Score: -0.96
Average Sentiment Score: 0.25


In [52]:
# Mean sentiment for each (bank, rating) pair, highest star ratings first.
(
    cleaned_df
    .groupby(['bank', 'rating'])['sentiment']
    .mean()
    .reset_index()
    .sort_values(by='rating', ascending=False)
)

Unnamed: 0,bank,rating,sentiment
4,BOA,5.0,0.296856
14,Dashen,5.0,0.53301
9,CBE,5.0,0.359814
8,CBE,4.0,0.295584
13,Dashen,4.0,0.3382
3,BOA,4.0,0.184147
2,BOA,3.0,0.175856
12,Dashen,3.0,0.326727
7,CBE,3.0,0.081322
1,BOA,2.0,0.045408


#### Thematic Analysis Pipeline

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [83]:

# Fit one shared vocabulary on every review: unigrams and bigrams, limited to 100 features.
tfidf = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
X = tfidf.fit_transform(cleaned_df['processed_review'])
feature_names = tfidf.get_feature_names_out()

# Top keywords per bank
for bank in cleaned_df['bank'].unique():
    is_bank = cleaned_df['bank'] == bank
    bank_reviews = cleaned_df.loc[is_bank]
    X_bank = tfidf.transform(bank_reviews['processed_review'])
    # Column-wise mean TF-IDF weight over this bank's reviews, flattened to 1-D.
    avg_tfidf = X_bank.mean(axis=0).A1
    ranked = sorted(zip(feature_names, avg_tfidf), key=lambda pair: pair[1], reverse=True)
    top_keywords = ranked[:15]

    print(f"\nTop keywords for {bank}:")
    for kw, score in top_keywords:
        print(f"{kw}: {score:.4f}")



Top keywords for CBE:
app: 0.1088
good: 0.0932
best: 0.0444
cbe: 0.0417
nice: 0.0381
screenshot: 0.0325
bank: 0.0312
በጣም: 0.0275
like: 0.0272
update: 0.0263
good app: 0.0257
great: 0.0256
application: 0.0254
use: 0.0209
apps: 0.0209

Top keywords for BOA:
app: 0.1250
good: 0.0516
work: 0.0389
bank: 0.0383
working: 0.0321
boa: 0.0319
worst: 0.0284
doesnt: 0.0275
please: 0.0263
mobile: 0.0260
banking: 0.0255
dont: 0.0238
cant: 0.0228
developer: 0.0228
use: 0.0218

Top keywords for Dashen:
app: 0.1317
dashen: 0.0791
best: 0.0621
super: 0.0595
bank: 0.0532
banking: 0.0480
good: 0.0459
one: 0.0447
dashen bank: 0.0443
amazing: 0.0405
use: 0.0382
super app: 0.0378
fast: 0.0372
easy: 0.0354
features: 0.0331


In [84]:
# Show the fitted TF-IDF vocabulary (unigram and bigram features).
feature_names

array(['account', 'ahead', 'also', 'always', 'amazing', 'app', 'app ever',
       'application', 'apps', 'bad', 'bank', 'banking', 'banking app',
       'best', 'best app', 'better', 'boa', 'cant', 'cbe', 'convenient',
       'crashes', 'dashen', 'dashen bank', 'dashen super', 'developer',
       'developer options', 'digital', 'doesnt', 'doesnt work', 'dont',
       'easy', 'easy use', 'ethiopia', 'even', 'ever', 'every',
       'excellent', 'experience', 'fast', 'features', 'fix', 'get',
       'good', 'good app', 'great', 'im', 'ive', 'keep', 'know', 'like',
       'love', 'make', 'makes', 'mobile', 'mobile banking', 'money',
       'much', 'need', 'new', 'nice', 'one', 'one step', 'open', 'option',
       'options', 'phone', 'please', 'really', 'reliable', 'screenshot',
       'secure', 'security', 'service', 'services', 'simple', 'slow',
       'smooth', 'sometimes', 'step', 'super', 'super app', 'thank',
       'time', 'times', 'transaction', 'transactions', 'transfer', 'try',
  

In [86]:
# 3. Data Analysis
# Extract top keywords for each review
def top_keywords_per_doc(row_index, top_n=5, matrix=None, vocab=None):
    """Return the highest-weighted TF-IDF terms of a single document.

    Parameters
    ----------
    row_index : int
        Positional row of the document in the TF-IDF matrix.
    top_n : int, default 5
        Maximum number of terms to return.
    matrix : scipy sparse matrix, optional
        Document-term matrix to read from. Defaults to the notebook-level ``X``.
    vocab : sequence of str, optional
        Term for each matrix column. Defaults to the notebook-level
        ``feature_names``.

    Returns
    -------
    list
        Up to ``top_n`` terms whose weight is strictly positive, strongest
        first. Empty when the document contains no vocabulary term.
    """
    matrix = X if matrix is None else matrix
    vocab = feature_names if vocab is None else vocab

    # Densify the row once. Probing the sparse row element by element
    # (row[0, i]) for every vocabulary column is much slower and only works
    # when the row slice is 2-D.
    weights = matrix[row_index].toarray().ravel()
    ranked = weights.argsort()[::-1]
    return [vocab[i] for i in ranked if weights[i] > 0][:top_n]

# One keyword list per TF-IDF row. A plain list is assigned positionally, so the row
# order of X has to match the row order of cleaned_df.
n_docs = X.shape[0]
cleaned_df['keywords'] = list(map(top_keywords_per_doc, range(n_docs)))


In [87]:
# Inspect the per-review keyword lists; an empty list means no vocabulary term
# had a positive weight for that review.
cleaned_df['keywords']

0                                           []
1                        [best app, best, app]
2                                           []
3                                           []
4                                       [good]
                         ...                  
1709                                        []
1711    [ahead, one step, step, always, great]
1712                                    [best]
1713      [ahead, one step, step, always, one]
1714                            [better, like]
Name: keywords, Length: 1437, dtype: object

In [89]:
# Rule-based theme vocabulary: theme label -> trigger terms.
# Declaration order is significant: assign_theme_from_keywords returns the
# first theme that matches.
themes_keywords = dict([
    ("Account Access Issues", ["login", "password", "otp", "sign in", "register", "access"]),
    ("Transaction Problems", ["transfer", "failed", "money", "delay", "deducted", "charged"]),
    ("App Performance", ["crash", "freeze", "bug", "slow", "update", "loading"]),
    ("User Experience", ["interface", "design", "user friendly", "easy", "navigation", "ux"]),
    ("Customer Support", ["support", "help", "agent", "contact", "response", "feedback"]),
])

def assign_theme_from_keywords(keywords, themes=None):
    """Map a review's extracted keywords to the first matching theme.

    Matching is case-insensitive and tolerant of inflection:

    * a single-word theme term matches any keyword token that starts with it,
      so "crash" matches "crashes" and "easy" matches the bigram "easy use";
    * a multi-word theme term (e.g. "user friendly") matches when it occurs
      inside a keyword phrase.

    Exact list membership, as used previously, could never match plural or
    bigram TF-IDF features and left almost every review as "Other".

    Parameters
    ----------
    keywords : iterable of str, str, or None
        Keywords extracted for one review. A bare string is treated as one
        keyword phrase; non-iterable values (None, NaN) yield "Other".
    themes : dict, optional
        Mapping of theme label -> list of trigger terms. Defaults to the
        notebook-level ``themes_keywords``.

    Returns
    -------
    str
        Label of the first theme (in mapping order) with a matching term,
        or "Other" when nothing matches.
    """
    themes = themes_keywords if themes is None else themes

    if isinstance(keywords, str):
        keywords = [keywords]
    try:
        phrases = [str(kw).lower() for kw in keywords]
    except TypeError:
        # Missing cells (None / NaN) are not iterable: nothing to match.
        return "Other"
    tokens = {tok for phrase in phrases for tok in phrase.split()}

    for theme, terms in themes.items():
        for term in terms:
            needle = term.lower()
            if not needle:
                # An empty term would match every token; ignore it.
                continue
            if " " in needle:
                matched = any(needle in phrase for phrase in phrases)
            else:
                matched = any(tok.startswith(needle) for tok in tokens)
            if matched:
                return theme
    return "Other"

# Label each review with a theme derived from its keyword list.
keyword_lists = cleaned_df['keywords']
cleaned_df['theme'] = keyword_lists.map(assign_theme_from_keywords)


In [92]:
# Inspect the frame with the new 'keywords' and 'theme' columns.
cleaned_df

Unnamed: 0,review,rating,date,bank,language,processed_review,sentiment,label,keywords,theme
0,very niec,5.0,2025-06-04,CBE,en,niec,0.0000,neutral,[],Other
1,best app of finance,5.0,2025-06-04,CBE,en,best app finance,0.6369,positive,"[best app, best, app]",Other
2,yetemeta,1.0,2025-06-03,CBE,en,yetemeta,0.0000,neutral,[],Other
3,Engida Kebede Fetera,5.0,2025-06-03,CBE,en,engida kebede fetera,0.0000,neutral,[],Other
4,good,5.0,2025-06-03,CBE,en,good,0.4404,positive,[good],Other
...,...,...,...,...,...,...,...,...,...,...
1709,Faster and userfriendly,5.0,2025-01-14,Dashen,en,faster userfriendly,0.0000,neutral,[],Other
1711,"Waw Great and innovated,user friendly, always ...",5.0,2025-01-13,Dashen,en,waw great innovateduser friendly always one st...,0.8074,positive,"[ahead, one step, step, always, great]",Other
1712,It's Best waww 🙏,5.0,2025-01-13,Dashen,en,best waww,0.6369,positive,[best],Other
1713,Always one step ahead,5.0,2025-01-13,Dashen,en,always one step ahead,0.0000,neutral,"[ahead, one step, step, always, one]",Other
