In [2]:
!pip install google_play_scraper

Defaulting to user installation because normal site-packages is not writeable
Collecting google_play_scraper
  Obtaining dependency information for google_play_scraper from https://files.pythonhosted.org/packages/33/f7/a23ef3cf8efc9ab3aee565971f59906811e6ce95475314ef7b18d02f30ba/google_play_scraper-1.2.7-py3-none-any.whl.metadata
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
     -------------------------------------- 50.2/50.2 kB 256.4 kB/s eta 0:00:00
Downloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google_play_scraper
Successfully installed google_play_scraper-1.2.7


In [3]:
from google_play_scraper import Sort, reviews

app_ids = {
    "CBE": "com.combanketh.mobilebanking",
    "BOA": "com.boa.boaMobileBanking",
    "Dashen": "com.dashen.dashensuperapp"
}

all_reviews = []  # to accumulate review data
for bank_name, app_id in app_ids.items():
    # Fetch 400 recent reviews for the app
    result, continuation_token = reviews(
        app_id, lang='en', country='us', sort=Sort.NEWEST, count=200
    )
    # If fewer than 400 returned, use continuation_token to get more
    while continuation_token and len(result) < 400:
        next_batch, continuation_token = reviews(app_id, continuation_token=continuation_token)
        result.extend(next_batch)
    # Trim to 400 if we got more
    bank_reviews = result[:400]
    # Append bank identifier and source to each review entry
    for r in bank_reviews:
        r['bank_name'] = bank_name
        r['source'] = "Google Play"
    all_reviews.extend(bank_reviews)


In [4]:
import pandas as pd

df = pd.DataFrame(all_reviews)
# Keep only relevant fields and rename them
df = df[['reviewId', 'content', 'score', 'at', 'bank_name', 'source']]
df.columns = ['review_id', 'review_text', 'rating', 'date', 'bank_name', 'source']

# 1. Remove duplicate reviews (if any duplicates exist)
df.drop_duplicates(subset='review_text', inplace=True)

# 2. Handle missing values
df = df.dropna(subset=['review_text', 'rating', 'date'])  # drop any review missing text or rating
# (We assume Google Play reviews always have a rating and timestamp; if any text is empty or missing, it's removed.)

# 3. Normalize date format to YYYY-MM-DD
df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')


In [5]:
print(df.info())
print(df.head(3))


<class 'pandas.core.frame.DataFrame'>
Index: 987 entries, 0 to 1199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_id    987 non-null    object
 1   review_text  987 non-null    object
 2   rating       987 non-null    int64 
 3   date         987 non-null    object
 4   bank_name    987 non-null    object
 5   source       987 non-null    object
dtypes: int64(1), object(5)
memory usage: 54.0+ KB
None
                              review_id  \
0  f0dd744a-9409-4619-9800-7ea501571b09   
1  3949d8e0-7ac4-4b43-b4f4-a45d6a888a85   
2  36f83ed0-3935-4ebd-98e5-34cf25095d32   

                                         review_text  rating        date  \
0  everytime you uninstall the app you have to re...       1  2025-06-04   
1  አካውንት የምናስገባበት ቦታ ስም ጽፈን ነው ከዚህ በፊት የላክንባቸውን አ...       4  2025-06-04   
2                                               best       5  2025-06-04   

  bank_name       source  
0       CBE

In [6]:
df.to_csv("bank_app_reviews_cleaned.csv", index=False)


In [8]:
!pip install vaderSentiment

Defaulting to user installation because normal site-packages is not writeable
Collecting vaderSentiment
  Obtaining dependency information for vaderSentiment from https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   -------------------------------------- 126.0/126.0 kB 618.8 kB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
# Apply VADER to each review text
df['vader_compound'] = df['review_text'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
# Classify sentiment based on compound score
def label_sentiment(score):
    if score > 0.05:
        return "Positive"
    elif score < -0.05:
        return "Negative"
    else:
        return "Neutral"
df['vader_sentiment'] = df['vader_compound'].apply(label_sentiment)


In [11]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/96/f2/25b27b396af03d5b64e61976b14f7209e2939e9e806c10749b6d277c273e/transformers-4.52.4-py3-none-any.whl.metadata
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.30.0 from https://files.pythonhosted.org/packages/67/8b/222140f3cfb6f17b0dd8c4b9a0b36bd4ebefe9fb0098ba35d6960abcda0f/huggingface_hub-0.32.4-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Obtaining dependency information for tokenizers<0.22,>=0.21 from https://files.pythonhosted.org/packages/e6/b6/072a8e053ae600dcc2ac0da81a23548e3b523301a442a6ca900e92ac35be/tokenizers-0.21.1-cp39-abi3-win_



In [13]:
!pip install tf-keras

Defaulting to user installation because normal site-packages is not writeable
Collecting tf-keras
  Obtaining dependency information for tf-keras from https://files.pythonhosted.org/packages/45/6b/d245122d108a94df5969ee7408ad343af1627730e91478e01ef098976bfa/tf_keras-2.19.0-py3-none-any.whl.metadata
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow<2.20,>=2.19 (from tf-keras)
  Obtaining dependency information for tensorflow<2.20,>=2.19 from https://files.pythonhosted.org/packages/5a/4d/bf95fd2ba2034b515c5ed8932ae990d626b90fb7f94d5fde36134a67ac09/tensorflow-2.19.0-cp39-cp39-win_amd64.whl.metadata
  Downloading tensorflow-2.19.0-cp39-cp39-win_amd64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Obtaining dependency information for tensorboard~=2.19.0 from https://files.pythonhosted.org/packages/5d/12/4f70e8e2ba0dbe72ea978429d8530b0333f0ed2140cc571a48802878ef99/tensorboard-2.19.0-py3-none-any.whl.metadata

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\btulu\\AppData\\Roaming\\Python\\Python39\\site-packages\\~l_dtypes\\_ml_dtypes_ext.cp39-win_amd64.pyd'
Check the permissions.



In [15]:
from transformers import pipeline

# Load the sentiment analysis pipeline with the specific model
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Apply the model to each review text (batching for efficiency)
# Note: For large data, consider batch_size or processing in chunks to avoid memory issues.
df['bert_sentiment'] = df['review_text'].apply(lambda text: sentiment_model(text)[0]['label'])


ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.