In [1]:
# Make the project's local modules (the sibling ../src directory) importable
# from this notebook.
import sys
from pathlib import Path

# Resolve to an absolute path so the entry stays valid even if the working
# directory changes later in the session, and guard the append so re-running
# this cell does not pile up duplicate entries on sys.path.
SRC_DIR = str(Path('../src').resolve())
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)  # appended (not inserted) to keep the original lookup precedence

from scraper import scrape_reviews
from preprocessor import preprocess_reviews
import pandas as pd
from tqdm import tqdm

In [2]:
from preprocessor import get_sentiment_scores, label_sentiment

In [None]:
from analyzer import SentimentAnalyzer, ThemeAnalyzer
from visualizer import ReviewVisualizer
from database import ReviewDatabase

In [3]:
from initialize import setup_nltk

# Run this once to download the necessary resources.
# Per the recorded output below, this fetches the NLTK packages punkt,
# punkt_tab, stopwords, wordnet and omw-1.4, and reports each as already
# up-to-date when it is present locally.
setup_nltk()


[nltk_data] Downloading package punkt to /home/age/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/age/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/age/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/age/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/age/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
from preprocessor import preprocess_text

In [4]:
# 1. Data Collection
# Short bank label -> app package identifier; this mapping is what gets handed
# to the scraper. (Presumably Google Play package ids — confirm in scraper.py.)
bank_apps = dict(
    CBE='com.combanketh.mobilebanking',
    BOA='com.boa.boaMobileBanking',
    Dashen='com.dashen.dashensuperapp',
)

In [5]:
# Scrape reviews for every app listed in `bank_apps`. The result is kept as
# `raw_reviews` and used as a pandas DataFrame by the cells that follow
# (.head(), .shape, .to_csv()).
# NOTE(review): presumably this hits the network on every run — consider
# skipping it when the saved CSV already exists.
raw_reviews = scrape_reviews(bank_apps)


Scraping banks:   0%|          | 0/3 [00:00<?, ?it/s]

Scraping banks: 100%|██████████| 3/3 [00:04<00:00,  1.54s/it]


In [7]:
# Quick look at the first rows of the scraped data.
# NOTE(review): the displayed columns include reviewer names and avatar URLs —
# consider clearing this output (or dropping those columns) before sharing.
raw_reviews.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,bank,language
0,42eed344-f5b7-49be-a7eb-b1c14386e20b,Tilahun Nedi,https://play-lh.googleusercontent.com/a/ACg8oc...,very niec,5.0,0.0,5.1.0,2025-06-04 11:21:53,,,5.1.0,CBE,en
1,fce00cda-d71a-486e-a4c2-7479ab7793bc,Abdurahiman Abubaker Hussein,https://play-lh.googleusercontent.com/a/ACg8oc...,best app of finance,5.0,0.0,,2025-06-04 07:38:54,,,,CBE,en
2,f543a13e-bc0f-4348-b57f-3a83eb67206c,Milky Yebassa,https://play-lh.googleusercontent.com/a/ACg8oc...,yetemeta,1.0,0.0,5.1.0,2025-06-03 21:04:44,,,5.1.0,CBE,en
3,2470d199-834a-4134-a0f9-8c684ba75491,ENGIDA KEBEDE,https://play-lh.googleusercontent.com/a-/ALV-U...,Engida Kebede Fetera,5.0,0.0,5.1.0,2025-06-03 20:20:23,,,5.1.0,CBE,en
4,4beee0dd-05ad-474c-bdff-0146723f2179,Abel Beyena,https://play-lh.googleusercontent.com/a/ACg8oc...,good,5.0,0.0,5.1.0,2025-06-03 11:32:52,,,5.1.0,CBE,en


In [8]:
# (rows, columns) of the scraped frame.
raw_reviews.shape

(1715, 13)

In [9]:
# Persist the untouched scrape so it can be reloaded from disk later instead of
# scraping again; index=False keeps the row index out of the CSV.
# NOTE(review): assumes ../datasets/raw/ already exists relative to the
# notebook's working directory.
raw_reviews.to_csv('../datasets/raw/reviews.csv', index=False)

In [3]:
# Reload the saved scrape from disk, parsing the `at` column as datetimes.
df = pd.read_csv('../datasets/raw/reviews.csv', parse_dates=['at'])

In [4]:
# Sanity-check the reloaded frame: show its first three rows.
df.head(3)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,bank,language
0,42eed344-f5b7-49be-a7eb-b1c14386e20b,Tilahun Nedi,https://play-lh.googleusercontent.com/a/ACg8oc...,very niec,5.0,0.0,5.1.0,2025-06-04 11:21:53,,,5.1.0,CBE,en
1,fce00cda-d71a-486e-a4c2-7479ab7793bc,Abdurahiman Abubaker Hussein,https://play-lh.googleusercontent.com/a/ACg8oc...,best app of finance,5.0,0.0,,2025-06-04 07:38:54,,,,CBE,en
2,f543a13e-bc0f-4348-b57f-3a83eb67206c,Milky Yebassa,https://play-lh.googleusercontent.com/a/ACg8oc...,yetemeta,1.0,0.0,5.1.0,2025-06-03 21:04:44,,,5.1.0,CBE,en


In [5]:
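# NOTE(review): disabled scratch call left over from exploration — delete this
# cell or restore the call deliberately; none of the visible cells depend on it.
# preprocess_text(df['content'], language='en')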

In [6]:
# 2. Data Preprocessing
# Build the cleaned frame from the reloaded raw reviews and save it as CSV.
# In the recorded run the progress bar reports 1437 rows against 1715 scraped,
# and the displayed index has gaps, so preprocess_reviews appears to drop some
# rows — confirm the filtering rules in preprocessor.py.
cleaned_df = preprocess_reviews(df)
# NOTE(review): this writes derived data into datasets/raw/ next to the raw
# scrape, and it runs before the sentiment/label columns are added, so the
# saved file does not contain them. Confirm both are intended.
cleaned_df.to_csv('../datasets/raw/cleaned_reviews.csv', index=False)

Preprocessing text: 100%|██████████| 1437/1437 [00:01<00:00, 1400.34it/s]


In [7]:
# Inspect the preprocessed frame (the display abbreviates the middle rows).
cleaned_df

Unnamed: 0,review,rating,date,bank,language,processed_review
0,very niec,5.0,2025-06-04,CBE,en,niec
1,best app of finance,5.0,2025-06-04,CBE,en,best app finance
2,yetemeta,1.0,2025-06-03,CBE,en,yetemeta
3,Engida Kebede Fetera,5.0,2025-06-03,CBE,en,engida kebede fetera
4,good,5.0,2025-06-03,CBE,en,good
...,...,...,...,...,...,...
1709,Faster and userfriendly,5.0,2025-01-14,Dashen,en,faster userfriendly
1711,"Waw Great and innovated,user friendly, always ...",5.0,2025-01-13,Dashen,en,waw great innovateduser friendly always one st...
1712,It's Best waww 🙏,5.0,2025-01-13,Dashen,en,best waww
1713,Always one step ahead,5.0,2025-01-13,Dashen,en,always one step ahead


In [8]:
# Attach a per-review sentiment score; the recorded output shows a new
# `sentiment` column after this call.
# NOTE(review): `cleaned_df` is reassigned from itself, so re-running this cell
# feeds the already-scored frame back in — confirm get_sentiment_scores
# tolerates that.
cleaned_df = get_sentiment_scores(cleaned_df)

Calculating sentiment scores: 100%|██████████| 1437/1437 [00:00<00:00, 11039.10it/s]


In [9]:
# Display the frame again to check the added `sentiment` column.
cleaned_df

Unnamed: 0,review,rating,date,bank,language,processed_review,sentiment
0,very niec,5.0,2025-06-04,CBE,en,niec,0.0000
1,best app of finance,5.0,2025-06-04,CBE,en,best app finance,0.6369
2,yetemeta,1.0,2025-06-03,CBE,en,yetemeta,0.0000
3,Engida Kebede Fetera,5.0,2025-06-03,CBE,en,engida kebede fetera,0.0000
4,good,5.0,2025-06-03,CBE,en,good,0.4404
...,...,...,...,...,...,...,...
1709,Faster and userfriendly,5.0,2025-01-14,Dashen,en,faster userfriendly,0.0000
1711,"Waw Great and innovated,user friendly, always ...",5.0,2025-01-13,Dashen,en,waw great innovateduser friendly always one st...,0.8074
1712,It's Best waww 🙏,5.0,2025-01-13,Dashen,en,best waww,0.6369
1713,Always one step ahead,5.0,2025-01-13,Dashen,en,always one step ahead,0.0000


In [10]:
# Derive a categorical `label` column from the score. In the recorded output a
# score of 0.0 is labelled 'neutral' and the positive scores shown are labelled
# 'positive'; the exact thresholds live in preprocessor.label_sentiment and are
# not visible here.
cleaned_df = label_sentiment(cleaned_df)

In [11]:
# Display the labelled frame.
# NOTE(review): in the recorded output a misspelt 5-star review ("very niec")
# scores 0.0 and is labelled 'neutral', so labels can disagree with the star
# rating for typos or text the scorer does not recognise.
cleaned_df

Unnamed: 0,review,rating,date,bank,language,processed_review,sentiment,label
0,very niec,5.0,2025-06-04,CBE,en,niec,0.0000,neutral
1,best app of finance,5.0,2025-06-04,CBE,en,best app finance,0.6369,positive
2,yetemeta,1.0,2025-06-03,CBE,en,yetemeta,0.0000,neutral
3,Engida Kebede Fetera,5.0,2025-06-03,CBE,en,engida kebede fetera,0.0000,neutral
4,good,5.0,2025-06-03,CBE,en,good,0.4404,positive
...,...,...,...,...,...,...,...,...
1709,Faster and userfriendly,5.0,2025-01-14,Dashen,en,faster userfriendly,0.0000,neutral
1711,"Waw Great and innovated,user friendly, always ...",5.0,2025-01-13,Dashen,en,waw great innovateduser friendly always one st...,0.8074,positive
1712,It's Best waww 🙏,5.0,2025-01-13,Dashen,en,best waww,0.6369,positive
1713,Always one step ahead,5.0,2025-01-13,Dashen,en,always one step ahead,0.0000,neutral


In [13]:
# Summary statistics over the per-review sentiment scores: the two extremes
# and the mean, each reported to two decimal places.
sentiment_scores = cleaned_df['sentiment']
max_sentiment = sentiment_scores.max()
min_sentiment = sentiment_scores.min()
average_sentiment = sentiment_scores.mean()

# One print call, newline-separated, emits the same three lines as before.
print(
    f'Max Sentiment Score: {max_sentiment:.2f}',
    f'Min Sentiment Score: {min_sentiment:.2f}',
    f'Average Sentiment Score: {average_sentiment:.2f}',
    sep='\n',
)

Max Sentiment Score: 0.98
Min Sentiment Score: -0.96
Average Sentiment Score: 0.25
