In [1]:
# import dependensies
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# packages for sentiment analaysis
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# load the processed review
df=pd.read_csv("../data/processed/processed_banks_app_reviews.csv")
print(df.head())

                                              review  rating        date  \
0  why is it every time the app is updated histor...       1  2025-08-21   
1  You guys charge way to much when we transfer m...       3  2025-08-21   
2                                               good       5  2025-08-21   
3                                        not bad apk       1  2025-08-21   
4                                                 ok       5  2025-08-21   

                          bank       source  
0  Commercial Bank of Ethiopia  Google Play  
1  Commercial Bank of Ethiopia  Google Play  
2  Commercial Bank of Ethiopia  Google Play  
3  Commercial Bank of Ethiopia  Google Play  
4  Commercial Bank of Ethiopia  Google Play  


In [3]:

# download NLTK resources
nltk.download("punket")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt_tab')
nltk.download("vader_lexicon")

[nltk_data] Error loading punket: Package 'punket' not found in index
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
# preprocessing the data
"""
Steps:
-- Clean text: Remove punctuation, special characters, convert to lowercase.
-- Tokenize: Split text into words.
-- Remove stop words: Eliminate common words (e.g., "the," "and").
-- Lemmatize: Reduce words to base form (e.g., "running" → "run").

"""
# define function for processing the review before sentiment analaysis
def preprocess_text(text):
    text=text.lower()  # change to lowercase
    text=text.translate(str.maketrans("","",string.punctuation))  # remove punctuations
    tokens=word_tokenize(text) # tokenize the text to split apart the text
    lemmatizer=WordNetLemmatizer()
    stop_words=set(stopwords.words("english"))
    tokens=[lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(tokens)

# Apply preprocessing to the review column
df["processed_review"]=df["review"].apply(preprocess_text)
print(df.head())

                                              review  rating        date  \
0  why is it every time the app is updated histor...       1  2025-08-21   
1  You guys charge way to much when we transfer m...       3  2025-08-21   
2                                               good       5  2025-08-21   
3                                        not bad apk       1  2025-08-21   
4                                                 ok       5  2025-08-21   

                          bank       source  \
0  Commercial Bank of Ethiopia  Google Play   
1  Commercial Bank of Ethiopia  Google Play   
2  Commercial Bank of Ethiopia  Google Play   
3  Commercial Bank of Ethiopia  Google Play   
4  Commercial Bank of Ethiopia  Google Play   

                                    processed_review  
0  every time app updated history previous accoun...  
1  guy charge way much transfer money cbe telebir...  
2                                               good  
3                                       

## Sentiment Analysis

In [5]:
# sentiment analaysis with VADER
# define an object
sis=SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores=sis.polarity_scores(text)
    if scores["compound"]> 0.05:
        return "positive"
    elif scores["compound"]< -0.05:
        return "negative"
    else:
        return "neutral"

# apply the get_sentiment on the review column   
df["sentiment"]=df["processed_review"].apply(get_sentiment)
# count the frequency of each sentiment type
print(df["sentiment"].value_counts()) 

sentiment
positive    544
neutral     293
negative    132
Name: count, dtype: int64


## Thematic Analysis

In [6]:
# Keyword Extraction
# Vectorize the dataset
vectorizer=TfidfVectorizer(max_features=100)
x=vectorizer.fit_transform(df["processed_review"])

# get tok keywords
keywords=vectorizer.get_feature_names_out()
print("the most keywords are:   ") 
print(keywords)

the most keywords are:   
['able' 'access' 'account' 'ahead' 'also' 'always' 'amazing' 'app'
 'application' 'apps' 'bad' 'bank' 'banking' 'best' 'better' 'bill' 'boa'
 'cant' 'cbe' 'convenient' 'crash' 'customer' 'dashen' 'day' 'developer'
 'digital' 'doesnt' 'dont' 'easy' 'ethiopia' 'ethiopian' 'even' 'ever'
 'every' 'everything' 'excellent' 'experience' 'fast' 'feature' 'fix'
 'friendly' 'get' 'give' 'go' 'good' 'great' 'highly' 'im' 'issue' 'keep'
 'life' 'like' 'love' 'make' 'many' 'mobile' 'money' 'much' 'need' 'never'
 'new' 'nice' 'one' 'open' 'option' 'payment' 'phone' 'please' 'problem'
 'really' 'reliable' 'seamless' 'secure' 'security' 'service' 'simple'
 'slow' 'smooth' 'step' 'super' 'system' 'take' 'thank' 'time'
 'transaction' 'transfer' 'turn' 'update' 'use' 'used' 'user'
 'userfriendly' 'using' 'well' 'work' 'working' 'worst' 'would' 'በጣም' 'ነው']


In [7]:
# Manual/Rule-Based Clustering
# Group related keywords
themes = {
    "Account Access & Security": [
        "access", "account", "login", "secure", "security", "issue", "problem", "cant", "open"
    ],
    "Transactions & Payments": [
        "transaction", "transfer", "payment", "bill", "money", "system", "slow", "fast", "reliable"
    ],
    "User Experience & Interface": [
        "app", "application", "apps", "mobile", "user", "userfriendly", "easy", "simple", "friendly",
        "seamless", "smooth", "convenient", "experience", "design", "interface"
    ],
    "Customer Service & Support": [
        "customer", "service", "support", "help", "please", "developer", "fix"
    ],
    "Feature Requests & Updates": [
        "feature", "update", "option", "need", "new", "better", "improvement"
    ]
}


In [9]:
# define function to assign themes for each row of reviwe

# phrases into themes per bank
def assign_review(review):
    for theme,keywords in themes.items():
        if any(keyword in review for keyword in keywords ):
            return theme
        else:
            return "other"
        
# apply assign_review on the df
df["theme"]=df["processed_review"].apply(assign_review)
df

Unnamed: 0,review,rating,date,bank,source,processed_review,sentiment,theme
0,why is it every time the app is updated histor...,1,2025-08-21,Commercial Bank of Ethiopia,Google Play,every time app updated history previous accoun...,negative,Account Access & Security
1,You guys charge way to much when we transfer m...,3,2025-08-21,Commercial Bank of Ethiopia,Google Play,guy charge way much transfer money cbe telebir...,neutral,other
2,good,5,2025-08-21,Commercial Bank of Ethiopia,Google Play,good,positive,other
3,not bad apk,1,2025-08-21,Commercial Bank of Ethiopia,Google Play,bad apk,negative,other
4,ok,5,2025-08-21,Commercial Bank of Ethiopia,Google Play,ok,positive,other
...,...,...,...,...,...,...,...,...
964,I’ve been using the Dashen Bank Super App for ...,5,2025-03-28,Dashen Bank,Google Play,using dashen bank super app gamechanger app su...,positive,Account Access & Security
965,Very good app that like it too. B/c it is very...,5,2025-03-26,Dashen Bank,Google Play,good app like bc fast easy use,positive,other
966,best experiences,5,2025-03-26,Dashen Bank,Google Play,best experience,positive,other
967,too slow to use.,1,2025-03-26,Dashen Bank,Google Play,slow use,neutral,other


In [13]:
df.to_csv("../data/processed/final_bank_apps_review_data.csv",index=False)