In [3]:
%pip install google-play-scraper

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting google-play-scraper
  Using cached google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
Using cached google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [24]:
import pandas as pd
from google_play_scraper import Sort, reviews
from datetime import datetime
import time
import os
import uuid


In [38]:
# Define app IDs for the three banks
apps = {
    'CBE': 'com.combanketh.mobilebanking',
    'BOA': 'com.boa.boaMobileBanking',
    'Dashen': 'com.dashen.dashensuperapp'
}

In [39]:
# Function to scrape reviews
def scrape_reviews(app_id, bank_name, count=400):
    result, _ = reviews(
        app_id,
        lang='en',
        country='et',
        sort=Sort.NEWEST,
        count=count
    )
    data = []
    for review in result:
        data.append({
            'review_id': str(uuid.uuid4()),
            'review': review['content'],
            'rating': review['score'],
            'date': review['at'].strftime('%Y-%m-%d'),
            'bank': bank_name,
            'source': 'Google Play'
        })
    return pd.DataFrame(data)  # <-- Unindented

In [40]:
# Scrape and combine reviews
all_reviews = []
for bank, app_id in apps.items():
    df = scrape_reviews(app_id, bank)
    all_reviews.append(df)

In [41]:
print(all_reviews)

[                                review_id  \
0    08d41b8d-c5f5-4ca9-9287-9ae8294cf0e6   
1    747a0d8f-7f36-41ca-b377-f9f687ac2eec   
2    64b198ae-91c6-40d4-ba3c-229f97e01c98   
3    828c5fcd-f084-4e57-ad56-201735b8e413   
4    46f43687-fa3d-434f-98c1-383d691a4223   
..                                    ...   
395  60646230-7648-41ac-a1fa-097df01c20f8   
396  495355a0-2937-4c92-bc67-239cb1270186   
397  05a49191-0c52-4d6f-953a-205ceb055dd1   
398  ea28263b-f488-4c99-bbde-ce820ee36cc3   
399  2ee6fbc2-cf0b-4aea-a82e-da3f1a5170d6   

                                                review  rating        date  \
0    "Why don’t your ATMs support account-to-accoun...       4  2025-06-06   
1                          what is this app problem???       1  2025-06-05   
2         the app is proactive and a good connections.       5  2025-06-05   
3      I cannot send to cbebirr app. through this app.       3  2025-06-05   
4                                                 good       4  2025

In [42]:
# Combine into a single DataFrame
df_reviews = pd.concat(all_reviews, ignore_index=True)

In [43]:
print(df_reviews)

                                 review_id  \
0     08d41b8d-c5f5-4ca9-9287-9ae8294cf0e6   
1     747a0d8f-7f36-41ca-b377-f9f687ac2eec   
2     64b198ae-91c6-40d4-ba3c-229f97e01c98   
3     828c5fcd-f084-4e57-ad56-201735b8e413   
4     46f43687-fa3d-434f-98c1-383d691a4223   
...                                    ...   
1195  121f30ef-0f05-4695-a424-5b2ffbdf1483   
1196  dea35742-2646-4ac9-98c0-c4fa4c509da9   
1197  d5a42e90-cd0f-4276-b8f8-d17b6a5d9eb1   
1198  4818ccd5-f8fa-44d0-a9b7-adcd7ad55d28   
1199  58f9de7f-6855-4e78-aa65-e7a01ff565a9   

                                                 review  rating        date  \
0     "Why don’t your ATMs support account-to-accoun...       4  2025-06-06   
1                           what is this app problem???       1  2025-06-05   
2          the app is proactive and a good connections.       5  2025-06-05   
3       I cannot send to cbebirr app. through this app.       3  2025-06-05   
4                                                  g

In [44]:
# Preprocessing
# Remove duplicates
df_reviews = df_reviews.drop_duplicates(subset=['review', 'date', 'bank'])

In [45]:
print(df_reviews)

                                 review_id  \
0     08d41b8d-c5f5-4ca9-9287-9ae8294cf0e6   
1     747a0d8f-7f36-41ca-b377-f9f687ac2eec   
2     64b198ae-91c6-40d4-ba3c-229f97e01c98   
3     828c5fcd-f084-4e57-ad56-201735b8e413   
4     46f43687-fa3d-434f-98c1-383d691a4223   
...                                    ...   
1195  121f30ef-0f05-4695-a424-5b2ffbdf1483   
1196  dea35742-2646-4ac9-98c0-c4fa4c509da9   
1197  d5a42e90-cd0f-4276-b8f8-d17b6a5d9eb1   
1198  4818ccd5-f8fa-44d0-a9b7-adcd7ad55d28   
1199  58f9de7f-6855-4e78-aa65-e7a01ff565a9   

                                                 review  rating        date  \
0     "Why don’t your ATMs support account-to-accoun...       4  2025-06-06   
1                           what is this app problem???       1  2025-06-05   
2          the app is proactive and a good connections.       5  2025-06-05   
3       I cannot send to cbebirr app. through this app.       3  2025-06-05   
4                                                  g

In [46]:
# Handle missing data
if 'review' in df_reviews.columns:
    df_reviews['review'] = df_reviews['review'].fillna('No review text')
if 'rating' in df_reviews.columns:
    df_reviews['rating'] = df_reviews['rating'].fillna(0).astype(int)

In [47]:
print(df_reviews)

                                 review_id  \
0     08d41b8d-c5f5-4ca9-9287-9ae8294cf0e6   
1     747a0d8f-7f36-41ca-b377-f9f687ac2eec   
2     64b198ae-91c6-40d4-ba3c-229f97e01c98   
3     828c5fcd-f084-4e57-ad56-201735b8e413   
4     46f43687-fa3d-434f-98c1-383d691a4223   
...                                    ...   
1195  121f30ef-0f05-4695-a424-5b2ffbdf1483   
1196  dea35742-2646-4ac9-98c0-c4fa4c509da9   
1197  d5a42e90-cd0f-4276-b8f8-d17b6a5d9eb1   
1198  4818ccd5-f8fa-44d0-a9b7-adcd7ad55d28   
1199  58f9de7f-6855-4e78-aa65-e7a01ff565a9   

                                                 review  rating        date  \
0     "Why don’t your ATMs support account-to-accoun...       4  2025-06-06   
1                           what is this app problem???       1  2025-06-05   
2          the app is proactive and a good connections.       5  2025-06-05   
3       I cannot send to cbebirr app. through this app.       3  2025-06-05   
4                                                  g

In [49]:
# Ensure date format
if 'date' in df_reviews.columns:
    df_reviews['date'] = pd.to_datetime(df_reviews['date'], errors='coerce').dt.strftime('%Y-%m-%d')

In [None]:
# Save to CSV
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)
df_reviews.to_csv(os.path.join(output_dir, 'bank_reviews.csv'), index=False)

print(f"Saved {len(df_reviews)} reviews to data/bank_reviews.csv")

In [57]:
# Preprocess each bank's reviews before saving
for bank, df_bank in zip(apps.keys(), all_reviews):
  # Remove duplicates
  df_bank = df_bank.drop_duplicates(subset=['review', 'date', 'bank'])
  # Handle missing data
  if 'review' in df_bank.columns:
    df_bank['review'] = df_bank['review'].fillna('No review text')
  if 'rating' in df_bank.columns:
    df_bank['rating'] = df_bank['rating'].fillna(0).astype(int)
  # Ensure date format
  if 'date' in df_bank.columns:
    df_bank['date'] = pd.to_datetime(df_bank['date'], errors='coerce').dt.strftime('%Y-%m-%d')
  filename = f"{bank.lower()}_reviews.csv"
  df_bank.to_csv(os.path.join(output_dir, filename), index=False)
  print(f"Saved {len(df_bank)} reviews to {os.path.join(output_dir, filename)}")

Saved 387 reviews to data\cbe_reviews.csv
Saved 399 reviews to data\boa_reviews.csv
Saved 399 reviews to data\dashen_reviews.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bank['review'] = df_bank['review'].fillna('No review text')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bank['rating'] = df_bank['rating'].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bank['date'] = pd.to_datetime(df_bank['date'], errors='coerce').dt.strftime('%Y