In [1]:
import pandas as pd
import spacy
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from collections import Counter
import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Download NLTK stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\belet\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:

stop_words = set(stopwords.words('english'))

In [7]:
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Download and load spaCy model
nlp = spacy.load('en_core_web_sm')

In [4]:
# Load sentiment analysis model
sentiment_analyzer = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [5]:
# Load reviews
df_reviews = pd.read_csv('data/bank_reviews.csv')

In [6]:

# Sentiment analysis
def get_sentiment(text):
    try:
        result = sentiment_analyzer(text[:512])[0]  # Truncate to 512 tokens
        label = result['label']
        # score = result['score']
        return label, score
    except:
        return 'NEUTRAL', 0.0

In [7]:

df_reviews[['sentiment_label', 'sentiment_score']] = df_reviews['review'].apply(get_sentiment).apply(pd.Series)

# Thematic analysis
def extract_keywords(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if token.is_alpha and token.text not in stop_words]
    return tokens

In [8]:

df_reviews[['sentiment_label', 'sentiment_score']] = df_reviews['review'].apply(get_sentiment).apply(pd.Series)

# Thematic analysis
def extract_keywords(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if token.is_alpha and token.text not in stop_words]
    return tokens

In [10]:
print(df_reviews)

                                 review_id  \
0     08d41b8d-c5f5-4ca9-9287-9ae8294cf0e6   
1     747a0d8f-7f36-41ca-b377-f9f687ac2eec   
2     64b198ae-91c6-40d4-ba3c-229f97e01c98   
3     828c5fcd-f084-4e57-ad56-201735b8e413   
4     46f43687-fa3d-434f-98c1-383d691a4223   
...                                    ...   
1180  121f30ef-0f05-4695-a424-5b2ffbdf1483   
1181  dea35742-2646-4ac9-98c0-c4fa4c509da9   
1182  d5a42e90-cd0f-4276-b8f8-d17b6a5d9eb1   
1183  4818ccd5-f8fa-44d0-a9b7-adcd7ad55d28   
1184  58f9de7f-6855-4e78-aa65-e7a01ff565a9   

                                                 review  rating        date  \
0     "Why don’t your ATMs support account-to-accoun...       4  2025-06-06   
1                           what is this app problem???       1  2025-06-05   
2          the app is proactive and a good connections.       5  2025-06-05   
3       I cannot send to cbebirr app. through this app.       3  2025-06-05   
4                                                  g

In [11]:
from nltk.util import ngrams

def extract_ngrams(text, ngram_range=(2, 3)):
	tokens = [token.lemma_ for token in nlp(text.lower()) if token.is_alpha and token.text not in stop_words]
	ngram_list = []
	for n in range(ngram_range[0], ngram_range[1] + 1):
		ngram_list += [' '.join(gram) for gram in ngrams(tokens, n)]
	return ngram_list

# Extract keywords and n-grams
df_reviews['keywords'] = df_reviews['review'].apply(lambda x: extract_keywords(x) + extract_ngrams(x))


In [14]:
print(df_reviews.head())

                              review_id  \
0  08d41b8d-c5f5-4ca9-9287-9ae8294cf0e6   
1  747a0d8f-7f36-41ca-b377-f9f687ac2eec   
2  64b198ae-91c6-40d4-ba3c-229f97e01c98   
3  828c5fcd-f084-4e57-ad56-201735b8e413   
4  46f43687-fa3d-434f-98c1-383d691a4223   

                                              review  rating        date bank  \
0  "Why don’t your ATMs support account-to-accoun...       4  2025-06-06  CBE   
1                        what is this app problem???       1  2025-06-05  CBE   
2       the app is proactive and a good connections.       5  2025-06-05  CBE   
3    I cannot send to cbebirr app. through this app.       3  2025-06-05  CBE   
4                                               good       4  2025-06-05  CBE   

        source sentiment_label  sentiment_score  \
0  Google Play         NEUTRAL              0.0   
1  Google Play         NEUTRAL              0.0   
2  Google Play         NEUTRAL              0.0   
3  Google Play         NEUTRAL              0.0   

In [12]:

# Define themes based on common keywords
themes = {
    'CBE':{
        'Account Access Issues': ['login', 'pin', 'authentication', 'access'],
        'Transaction Performance': ['transfer', 'slow', 'fast', 'payment'],
        'Reliability': ['crash', 'freeze', 'bug', 'error'],
        'User Interface': ['ui', 'interface', 'design', 'navigation'],
        'Customer Support': ['support', 'help', 'response']
    },
    'BOA': {
        'Account Access Issues': ['login', 'pin', 'face id', 'authentication'],
        'Transaction Performance': ['transfer', 'slow', 'payment', 'et switch'],
        'Reliability': ['crash', 'freeze', 'bug', 'error'],
        'User Interface': ['ui', 'interface', 'logo', 'design'],
        'Feature Requests': ['fingerprint', 'biometric', 'qr code']
    },
    'Dashen': {
        'Account Access Issues': ['login', 'pin', 'authentication'],
        'Transaction Performance': ['transfer', 'payment', 'fast'],
        'Reliability': ['crash', 'error', 'bug'],
        'User Interface': ['ui', 'interface', 'design'],
        'Sharia Compliance': ['sharik', 'interest free']
    }
}


In [15]:

# Assign themes to reviews
def assign_themes(keywords, bank):
    assigned_themes = []
    for theme, theme_keywords in themes[bank].items():
        if any(kw in keywords for kw in theme_keywords):
            assigned_themes.append(theme)
    return assigned_themes if assigned_themes else ['Other']


In [16]:

df_reviews['themes'] = df_reviews.apply(lambda row: assign_themes(row['keywords'], row['bank']), axis=1)

# Save results
df_reviews.to_csv('data/analyzed_reviews.csv', index=False)

print(f"Analyzed {len(df_reviews)} reviews and saved to data/analyzed_reviews.csv")

Analyzed 1185 reviews and saved to data/analyzed_reviews.csv
