In [2]:
from google_play_scraper import reviews, Sort
import pandas as pd

In [5]:
# Import required libraries
from google_play_scraper import reviews, Sort
import pandas as pd
import os

# ----------------------------
# 1. Define the target apps with package names and bank names
# ----------------------------
# This dictionary maps each app's unique package identifier on Google Play
# to a human-friendly bank name to label our data
apps = {
    # Commercial Bank of Ethiopia
    'com.combanketh.mobilebanking': 'CBE',
    # Bank of Abyssinia
    'com.boa.boaMobileBanking': 'BOA',
    # Dashen Bank
    'com.dashen.dashensuperapp': 'Dashen',
}

# ----------------------------
# 2. Make sure the output folder exists
# ----------------------------
# The combined CSV is written under 'data/raw'; exist_ok=True makes this a
# no-op when the folder (and its parent) are already present.
raw_data_dir = 'data/raw'
os.makedirs(raw_data_dir, exist_ok=True)

# ----------------------------
# 3. Accumulator for the per-bank DataFrames
# ----------------------------
# One cleaned DataFrame per bank is appended here and concatenated at the end.
all_reviews = []

# ----------------------------
# 4. Loop through each app to scrape reviews
# ----------------------------
# Minimum number of usable reviews wanted per bank once cleaning is done.
min_valid_reviews = 400

for package, bank_name in apps.items():
    print(f"📥 Scraping reviews for {bank_name}...")

    # ----------------------------
    # 4a. Use google_play_scraper to fetch reviews
    # ----------------------------
    # Request 600 of the newest English reviews from the US store so that
    # enough rows survive de-duplication and null filtering (adjust 'lang' /
    # 'country' if needed). The second return value is not used here.
    bank_reviews, _ = reviews(
        package,
        lang='en',
        country='us',
        sort=Sort.NEWEST,
        count=600,
    )

    # ----------------------------
    # 4b. Convert scraped reviews to a pandas DataFrame
    # ----------------------------
    # Selecting the fields through `columns=` (rather than indexing afterwards)
    # keeps only review text, rating score and review date, and still produces
    # an empty, correctly shaped frame when the scraper returns no reviews
    # instead of raising KeyError.
    df = pd.DataFrame(bank_reviews, columns=['content', 'score', 'at'])

    # Rename by explicit mapping so the result does not depend on column order.
    df = df.rename(columns={'content': 'review', 'score': 'rating', 'at': 'date'})

    # Metadata: which bank the review belongs to and where it came from.
    df['bank'] = bank_name
    df['source'] = 'Google Play'

    # ----------------------------
    # 4c. Data Cleaning: Remove duplicates and missing data
    # ----------------------------
    # First drop repeated review texts, then drop rows missing any critical
    # field (review, rating, or date).
    df = df.drop_duplicates(subset='review')
    df = df.dropna(subset=['review', 'rating', 'date'])

    # ----------------------------
    # 4d. Validate review count after cleaning
    # ----------------------------
    # Warn (but keep going) when a bank ends up below the target sample size.
    if len(df) < min_valid_reviews:
        print(f"⚠️ Warning: Only {len(df)} valid reviews for {bank_name}, less than {min_valid_reviews}.")

    # Keep this bank's cleaned frame for the final concatenation.
    all_reviews.append(df)

# ----------------------------
# 5. Combine reviews from all banks into one DataFrame
# ----------------------------
# Stack the per-bank frames into one table with a fresh 0..n-1 index.
df_all = pd.concat(all_reviews, ignore_index=True)

# ----------------------------
# 6. Normalize the 'date' column to a consistent string format (YYYY-MM-DD)
# ----------------------------
# Parse to datetime first so every value is rendered in the same format.
df_all['date'] = pd.to_datetime(df_all['date']).dt.strftime('%Y-%m-%d')

# ----------------------------
# 7. Save the cleaned combined data to a CSV file
# ----------------------------
# Path to save the CSV file inside the 'data/raw' directory
csv_path = os.path.join('data', 'raw', 'bank_reviews_cleaned.csv')

# Save DataFrame to CSV without the DataFrame index column
df_all.to_csv(csv_path, index=False)

# ----------------------------
# 8. Final status message
# ----------------------------
# Count the banks actually present in the data rather than hard-coding the
# number, so the message stays correct when the list of apps changes.
bank_count = df_all['bank'].nunique()
print(f"\n✅ Done! Scraped and cleaned {len(df_all)} reviews across {bank_count} banks.")
print(f"💾 Data saved to: {csv_path}")


📥 Scraping reviews for CBE...
📥 Scraping reviews for BOA...
📥 Scraping reviews for Dashen...

✅ Done! Scraped and cleaned 1382 reviews across 3 banks.
💾 Data saved to: data\raw\bank_reviews_cleaned.csv
