In [12]:
# ----------------------------------------
# 📦 Step 1: Import required libraries
# ----------------------------------------
from google_play_scraper import reviews, Sort       # To scrape Google Play app reviews
import pandas as pd                                 # For data manipulation and analysis
from datetime import datetime                       # To normalize and format dates
import os                                           # For creating folders and managing paths

# ----------------------------------------
# 🏦 Step 2: Define target apps and corresponding bank names
# ----------------------------------------
# Maps each app's Google Play package name (the identifier handed to the
# scraper) to the short bank label stored in the dataset's 'bank' column.
apps = {
    'com.combanketh.mobilebanking': 'CBE',       # Commercial Bank of Ethiopia
    'com.boa.boaMobileBanking': 'BOA',           # Bank of Abyssinia
    'com.dashen.dashensuperapp': 'Dashen'        # Dashen Bank
}

# ----------------------------------------
# 📁 Step 3: Ensure directory for raw data exists
# ----------------------------------------
# Build the path component by component so the OS-native separator is used
# throughout. A single-argument os.path.join() returns its argument unchanged,
# which left a '/'-separated prefix that picked up a '\' on Windows when the
# file name was joined onto it later.
# The path is relative to the notebook's current working directory.
output_dir = os.path.join('..', 'data', 'raw')
os.makedirs(output_dir, exist_ok=True)            # No error if the folder already exists

# ----------------------------------------
# 🧺 Step 4: Initialize a list to store all reviews
# ----------------------------------------
# Collects one DataFrame per app; the frames are concatenated in a later step.
all_reviews = []

# ----------------------------------------
# 🔁 Step 5: Loop through each app and scrape reviews
# ----------------------------------------
# Upper bound on the number of reviews requested per app.
MAX_REVIEWS_PER_APP = 500

for package_name, bank_name in apps.items():
    print(f"📥 Scraping reviews for {bank_name}...")

    # reviews() returns a pair: the list of review dicts and a second value
    # (a continuation token per the library docs) that is not needed here
    # because only the first batch is fetched.
    reviews_list, _ = reviews(
        package_name,                   # App identifier
        lang='en',                      # Language: English
        country='us',                   # Country: US (change to 'et' for Ethiopia if needed)
        sort=Sort.NEWEST,               # Most recent reviews first
        count=MAX_REVIEWS_PER_APP       # Number of reviews to fetch
    )

    # An empty result would become a DataFrame with no review columns at all;
    # skip it rather than feeding a column-less frame into the combine step.
    if not reviews_list:
        print(f"⚠️ No reviews returned for {bank_name}; skipping.")
        continue

    # Convert the list of review dicts into a DataFrame (one row per review)
    df = pd.DataFrame(reviews_list)

    # Tag every row with the bank label and the data source
    df['bank'] = bank_name
    df['source'] = 'Google Play'

    # Queue this bank's reviews for the combine step
    all_reviews.append(df)

# ----------------------------------------
# 📊 Step 6: Combine all banks' data into a single DataFrame
# ----------------------------------------
# Frames without rows carry no review columns, so they are left out of the
# concat; if nothing remains there is nothing to clean and we fail loudly.
scraped_frames = [frame for frame in all_reviews if not frame.empty]
if not scraped_frames:
    raise ValueError("No reviews were scraped for any app; nothing to clean.")

# ----------------------------------------
# 🧹 Step 7: Preprocess the data
# ----------------------------------------
# Built as one chain (no inplace mutation) so re-running the cell always
# starts from the freshly concatenated frames.
combined_df = (
    pd.concat(scraped_frames, ignore_index=True)
    # 7.1 Remove duplicate reviews. 'bank' is part of the key so that the same
    #     user posting the same text on two different bank apps keeps both
    #     reviews; only repeats within one bank are dropped.
    #     NOTE(review): a per-review id column, if the scraper provides one,
    #     would be a stricter identity key — confirm against the scraper output.
    .drop_duplicates(subset=['bank', 'userName', 'content'])
    # 7.2 Handle missing values — drop rows where 'content' or 'score' is missing
    .dropna(subset=['content', 'score'])
    # 7.3 Normalize the date — keep only the calendar date of the 'at' timestamp
    .assign(date=lambda frame: pd.to_datetime(frame['at']).dt.date)
)

# ----------------------------------------
# 📦 Step 8: Select and rename the required columns
# ----------------------------------------
# rename() maps columns by name, so the result does not depend on the order
# of the selection list the way a positional `.columns = [...]` would.
final_df = combined_df[['content', 'score', 'date', 'bank', 'source']].rename(
    columns={'content': 'review', 'score': 'rating'}
)

# ----------------------------------------
# 💾 Step 9: Save cleaned data to CSV
# ----------------------------------------
# The file is written inside output_dir; the row index is left out so the CSV
# contains only the selected columns.
csv_name = 'bank_reviews_cleaned.csv'
csv_path = os.path.join(output_dir, csv_name)
final_df.to_csv(path_or_buf=csv_path, index=False)

# ✅ Done — report where the file was written
print(f"✅ Scraping and cleaning complete! Data saved to: {csv_path}")


📥 Scraping reviews for CBE...
📥 Scraping reviews for BOA...
📥 Scraping reviews for Dashen...
✅ Scraping and cleaning complete! Data saved to: ../data/raw\bank_reviews_cleaned.csv


In [13]:
import pandas as pd

# Load the cleaned review dataset from disk
df = pd.read_csv('../data/raw/bank_reviews_cleaned.csv')

# Tally how many reviews each bank has (one entry per distinct 'bank' value)
bank_counts = df['bank'].value_counts()

# Show the heading followed by the per-bank tally, each on its own line
print("📊 Number of reviews per bank:", bank_counts, sep="\n")


📊 Number of reviews per bank:
bank
CBE       500
BOA       499
Dashen    446
Name: count, dtype: int64
