In [1]:
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from transformers import pipeline


# Step 2: Load the cleaned reviews dataset.
# Adjust the path below if your cleaned file is stored somewhere else.
df = pd.read_csv('../data/raw/banks_review_cleaned.csv')

# Step 3: Load the pre-trained sentiment analysis pipeline from HuggingFace Transformers.
# The checkpoint is a DistilBERT model fine-tuned on SST-2, so it only predicts
# two labels: POSITIVE or NEGATIVE.
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Step 4: Prepare review text.
# - fillna("") first: a bare astype(str) would turn a missing review into the
#   literal string "nan", which would then be classified like real text.
# - The 512-character cap only keeps inputs short. It is NOT the same thing as
#   the model's token limit; that limit is enforced with truncation=True below.
df['short_review'] = df['review'].fillna("").astype(str).str[:512]

# Step 5: Classify every review with a single pipeline call over the whole list.
# truncation=True makes the tokenizer cut any over-long input to the model's
# maximum length instead of raising; batch_size groups reviews per forward pass.
# Each prediction is a dict with 'label' (POSITIVE / NEGATIVE) and 'score' (confidence).
predictions = sentiment_pipeline(
    df['short_review'].tolist(),
    batch_size=32,
    truncation=True,
)
df['sentiment_result'] = predictions

# Step 6: Split the prediction dicts into separate columns.
# Labels are lower-cased for consistency ("positive" / "negative").
df['sentiment_label'] = [prediction['label'].lower() for prediction in predictions]
df['sentiment_score'] = [prediction['score'] for prediction in predictions]

# Step 7: Preview the result
print(df[['review', 'sentiment_label', 'sentiment_score']].head())

# Step 8: Save the result to a new CSV for later use (e.g. thematic analysis)
df.to_csv("../data/processed/bank_reviews_with_sentiment.csv", index=False)





Device set to use cpu


KeyboardInterrupt: 

In [2]:
import nltk
# Fetch the 'vader_lexicon' resource through NLTK's downloader. The call is
# harmless to repeat: when the package is already present NLTK just reports
# that it is up to date and returns True.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Bob\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

# Small VADER demo on one hand-written sentence.
analyzer = SentimentIntensityAnalyzer()
text = "This movie was very good, but the ending was AMAZING!!"

# polarity_scores returns a dict; only its 'compound' entry is used for the label.
score = analyzer.polarity_scores(text)
compound = score['compound']

# Cut-offs: >= 0.05 is positive, <= -0.05 is negative, anything between is neutral.
sentiment = (
    "Positive" if compound >= 0.05
    else "Negative" if compound <= -0.05
    else "Neutral"
)

print(
    f"Text: {text}",
    f"Sentiment: {sentiment}",
    f"Compound Score: {compound}",
    sep="\n",
)


Text: This movie was  good, but the ending was AMAZING!!
Sentiment: Positive
Compound Score: 0.87


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Bob\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:

df = pd.read_csv('../data/raw/banks_review_cleaned.csv')
# Preview the data to understand its structure. As the last expression of the
# cell this is what gets rendered; it requests up to the first 50 rows.
df.head(50)

Unnamed: 0,review,rating,date,bank,source
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,CBE,Google Play
1,what is this app problem???,1,2025-06-05,CBE,Google Play
2,the app is proactive and a good connections.,5,2025-06-05,CBE,Google Play
3,I cannot send to cbebirr app. through this app.,3,2025-06-05,CBE,Google Play
4,good,4,2025-06-05,CBE,Google Play
5,not functional,1,2025-06-05,CBE,Google Play
6,everytime you uninstall the app you have to re...,1,2025-06-04,CBE,Google Play
7,The name of our account is the name of our acc...,4,2025-06-04,CBE,Google Play
8,best,5,2025-06-04,CBE,Google Play
9,Bezabih,5,2025-06-04,CBE,Google Play


In [3]:

# Reload the cleaned reviews so this cell starts from the original columns.
df = pd.read_csv('../data/raw/banks_review_cleaned.csv')

# Prepare the text handed to the model.
# fillna("") keeps missing reviews from becoming the literal string "nan";
# the 512-character cap only bounds input size (the token limit is handled
# by truncation=True in the pipeline call).
df['short_review'] = df['review'].fillna("").astype(str).str[:512]

# Classify all reviews in one call. `sentiment_pipeline` must already have been
# created by an earlier cell of this notebook.
predictions = sentiment_pipeline(
    df['short_review'].tolist(),
    batch_size=32,
    truncation=True,
)
df['sentiment_result'] = predictions

# 'sentiment_label' is POSITIVE or NEGATIVE; 'sentiment_score' is the model's
# confidence in that predicted label (between 0 and 1).
df['sentiment_label'] = [prediction['label'] for prediction in predictions]
df['sentiment_score'] = [prediction['score'] for prediction in predictions]

# Map sentiment labels to numerical values for easier aggregation:
# POSITIVE -> 1, NEGATIVE -> -1.
df['sentiment_numeric'] = df['sentiment_label'].map({'POSITIVE': 1, 'NEGATIVE': -1})

# Signed confidence: +score for POSITIVE, -score for NEGATIVE. Unlike the raw
# confidence, its mean actually reflects polarity.
df['sentiment_signed_score'] = df['sentiment_score'] * df['sentiment_numeric']

# Group and aggregate sentiment by bank and star rating:
# - mean_sentiment_score: average model confidence. It ignores which label was
#   predicted, so it stays high for every group and says nothing about polarity.
# - mean_sentiment_label_numeric: average of the +1 / -1 labels.
# - percent_positive: share of POSITIVE predictions, in percent.
# - review_count: number of non-null reviews in the group (for context).
# - mean_signed_sentiment_score: average signed confidence (added last so the
#   existing columns keep their positions).
sentiment_summary = df.groupby(['bank', 'rating']).agg(
    mean_sentiment_score=('sentiment_score', 'mean'),
    mean_sentiment_label_numeric=('sentiment_numeric', 'mean'),
    percent_positive=('sentiment_label', lambda x: (x == 'POSITIVE').mean() * 100),
    review_count=('review', 'count'),
    mean_signed_sentiment_score=('sentiment_signed_score', 'mean'),
).reset_index()

# Display the summary
print(sentiment_summary)


      bank  rating  mean_sentiment_score  mean_sentiment_label_numeric  \
0      BOA       1              0.986962                     -0.773585   
1      BOA       2              0.933640                     -0.750000   
2      BOA       3              0.976630                     -0.085714   
3      BOA       4              0.964666                      0.000000   
4      BOA       5              0.971614                      0.584541   
5      CBE       1              0.976445                     -0.622642   
6      CBE       2              0.997675                     -0.294118   
7      CBE       3              0.986456                     -0.310345   
8      CBE       4              0.963908                     -0.043478   
9      CBE       5              0.986866                      0.794286   
10  Dashen       1              0.995235                     -0.882353   
11  Dashen       2              0.981517                     -0.647059   
12  Dashen       3              0.9976

In [4]:
# Step 1: Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

import numpy as np

# Input text: the truncated review column, with any missing value replaced by "".
texts = df['short_review'].fillna("")

# TF-IDF over 1- to 3-word phrases (e.g. "login error", "transfer failed",
# "slow app"), dropping English stop words and keeping at most 1000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    ngram_range=(1, 3),
)

# Learn the vocabulary and build the review-by-term TF-IDF matrix in one step.
tfidf_matrix = vectorizer.fit_transform(texts)

# Average each term's TF-IDF weight over all reviews and flatten to 1-D.
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

# Pair every term with its mean weight.
tfidf_scores = pd.DataFrame(
    {
        'term': vectorizer.get_feature_names_out(),
        'score': mean_tfidf,
    }
)

# The 30 terms with the highest mean weight serve as candidate themes.
top_keywords = tfidf_scores.sort_values(by='score', ascending=False).iloc[:30]
print("🔍 Top Thematic Keywords from Reviews:")
print(top_keywords)


🔍 Top Thematic Keywords from Reviews:
               term     score
385            good  0.112085
41              app  0.073632
165            best  0.050803
644            nice  0.038064
139            bank  0.024281
166        best app  0.020417
658              ok  0.019762
987             wow  0.018982
308       excellent  0.017222
515            like  0.016630
931             use  0.016626
386        good app  0.016345
157         banking  0.016117
403           great  0.015948
974            work  0.015242
325            fast  0.014396
118     application  0.014150
291            easy  0.013905
240          dashen  0.013863
22          amazing  0.013760
976         working  0.012112
862           super  0.011800
196             cbe  0.011305
598          mobile  0.011165
182             boa  0.010922
981           worst  0.009053
889           thank  0.008861
280           doesn  0.008262
601  mobile banking  0.008258
241     dashen bank  0.008142


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Step 1: Review text, same column as in the TF-IDF cell (missing values -> "")
texts = df['short_review'].fillna("")

# Step 2: Build the document-term count matrix.
# Unigrams and bigrams are counted so phrases like "login error" survive;
# English stop words are removed and the vocabulary is capped at 1000 entries.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=1000,
    stop_words='english',
)
dtm = vectorizer.fit_transform(texts)

# Step 3: Fit LDA on the counts to discover topics.
n_topics = 5  # Tunable: adjust after inspecting the topics
lda = LatentDirichletAllocation(random_state=42, n_components=n_topics).fit(dtm)

# Step 4: Vocabulary lookup used when printing the top keywords of each topic
feature_names = vectorizer.get_feature_names_out()

def get_top_keywords(model, feature_names, n_top_words):
    """Print and return the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable with one array of
            term weights per topic (each array must support ``argsort``).
        feature_names: Indexable sequence mapping a term index to its text.
        n_top_words: How many terms to keep per topic.

    Returns:
        A list with one entry per topic; each entry is the list of that
        topic's terms ordered from highest to lowest weight.
    """
    topics = []
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[term_index] for term_index in strongest_first]
        topics.append(words)
        print(f"Topic #{topic_number}: {', '.join(words)}")
    return topics

# Print and collect the 10 strongest terms of each fitted LDA topic
topics = get_top_keywords(model=lda, feature_names=feature_names, n_top_words=10)


Topic #1: good, app, work, ok, better, boa, make, working, update, don
Topic #2: app, super, dashen, fast, super app, user, easy, dashen bank, features, banking
Topic #3: app, best, best app, good, good app, like, use, banking, bank, worst
Topic #4: app, bank, nice, cbe, wow, great, dashen, step, developer, dashen bank
Topic #5: app, mobile, banking, mobile banking, work, doesn, application, amazing, doesn work, excellent


In [21]:
import pandas as pd
import spacy

# Load the spaCy English model "en_core_web_sm"; `nlp` is the callable used by
# the preprocessing function below to tokenize and lemmatize text.
nlp = spacy.load("en_core_web_sm")

# Text to preprocess: the whole `short_review` column (not a sample), with
# missing values replaced by an empty string.
texts = df['short_review'].fillna("")

# Text normalisation helper
def preprocess(text):
    """Return ``text`` reduced to space-separated lemmas.

    The text is lower-cased and run through the notebook-level spaCy pipeline
    ``nlp``. Tokens that are not purely alphabetic or that spaCy flags as stop
    words are discarded; the lemma of every remaining token is kept, in order.

    Args:
        text: The string to normalise.

    Returns:
        A single string with the kept lemmas joined by one space (empty when
        no token survives).
    """
    kept_lemmas = []
    for token in nlp(text.lower()):
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

# Normalise every review and store the result next to the original text
df['cleaned_review'] = texts.map(preprocess)


In [7]:
from transformers import pipeline

# DistilBERT (SST-2) sentiment classifier: predicts POSITIVE or NEGATIVE.
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Only string entries are classified; any other value keeps a missing
# sentiment (None) and a missing score (NaN).
is_text = df['short_review'].map(lambda value: isinstance(value, str))
df['sentiment'] = None
df['sentiment_score'] = float('nan')

if is_text.any():
    # Run the model ONCE per review and reuse each prediction for both the
    # label and the score (calling the pipeline separately for each column
    # doubles the inference cost). truncation=True lets the tokenizer cut
    # over-long inputs to the model's maximum length.
    predictions = sentiment_pipeline(
        df.loc[is_text, 'short_review'].tolist(),
        batch_size=32,
        truncation=True,
    )
    df.loc[is_text, 'sentiment'] = [prediction['label'] for prediction in predictions]
    df.loc[is_text, 'sentiment_score'] = [prediction['score'] for prediction in predictions]


Device set to use cpu


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Extract keywords: TF-IDF over unigrams and bigrams of the cleaned reviews,
# with the vocabulary limited to 20 features.
vectorizer = TfidfVectorizer(max_features=20, ngram_range=(1,2))
X = vectorizer.fit_transform(df['cleaned_review'])

# One column per keyword. The frame reuses df's index so the column-wise
# concat below lines rows up explicitly instead of relying on both frames
# happening to share a default index.
keywords = vectorizer.get_feature_names_out()
tfidf_matrix = pd.DataFrame(X.toarray(), columns=keywords, index=df.index)

# Add the keyword weights to the review data. The columns are prefixed because
# a keyword can have the same name as an existing column (for example "bank"),
# which would otherwise leave df_keywords with duplicate column labels.
df_keywords = pd.concat([df, tfidf_matrix.add_prefix('tfidf_')], axis=1)


In [18]:
# Example mapping logic based on top LDA keywords
def map_to_theme(text):
    """Assign a review to a coarse theme using keyword rules.

    Rules are checked in a fixed order and the first one that matches wins.
    Most keywords are matched as substrings, so inflected forms such as
    "transfers" or "features" still count. The keyword "ui" is matched only
    as a whole word: as a bare substring it also occurs inside unrelated
    words like "quick", "build" or "require" and would mislabel them.

    Args:
        text: Review text; matching is case-insensitive.

    Returns:
        One of "Account Access Issues", "Transaction Performance",
        "User Interface & Experience", "Customer Support", "Feature Request"
        or "Other". Non-string input (e.g. None or NaN) yields "Other".
    """
    import re  # local import keeps this cell runnable on its own

    if not isinstance(text, str):
        return "Other"
    text = text.lower()

    def mentions(*keywords):
        # True when any keyword occurs anywhere in the lower-cased text.
        return any(keyword in text for keyword in keywords)

    if mentions("login", "access", "otp"):
        return "Account Access Issues"
    elif mentions("slow", "transfer", "transaction"):
        return "Transaction Performance"
    elif re.search(r"\bui\b", text) or mentions("interface", "design"):
        return "User Interface & Experience"
    elif mentions("support", "help", "response"):
        return "Customer Support"
    elif mentions("feature", "request"):
        return "Feature Request"
    else:
        return "Other"

# Label each cleaned review with its rule-based theme
df['theme'] = df['cleaned_review'].map(map_to_theme)


In [20]:
# Keep only the columns needed downstream, in this order, and write them to
# the processed-data folder without the index column.
df_final = df.loc[
    :,
    [
        'short_review',
        'cleaned_review',
        'sentiment',
        'sentiment_score',
        'theme',
        'bank',
        'rating',
        'date',
        'source',
    ],
]
df_final.to_csv(path_or_buf="../data/processed/bank_review_sentiment_theme.csv", index=False)
