In [4]:
import csv
from datetime import datetime
from google_play_scraper import reviews, Sort
import os
import pandas as pd

# Switch the working directory to the project checkout. This is a hard-coded
# absolute Windows path, so it must be adjusted when run on another machine.
os.chdir(r"C:\Wk 2 Mobile Banking App Review Analysis\Customer-Experience-Analytics")
# Output folder for raw and cleaned CSVs; the path is relative, so it is
# resolved against the working directory set just above.
DATA_DIR = 'notebooks/data'
# Create the folder up front; exist_ok=True makes reruns a no-op.
os.makedirs(DATA_DIR, exist_ok=True)  

def scrape_play_store_reviews(app_id, bank_name, count=400, lang='en', country='us'):
    """Fetch the newest Google Play reviews for one app and store them as CSV.

    Args:
        app_id: Identifier of the app, passed straight to the scraper.
        bank_name: Label written into every row's ``bank_name`` column and
            used as the prefix of the output file name.
        count: Number of reviews requested from the scraper. Defaults to
            400, the value that used to be hard-coded.
        lang: Language code passed to the scraper (default ``'en'``).
        country: Country code passed to the scraper (default ``'us'``).

    Returns:
        Path of the CSV file that was written. The file lives in
        ``DATA_DIR`` and is named ``<bank_name>_reviews_<YYYYmmdd_HHMMSS>.csv``,
        so every call produces a new file instead of overwriting an old one.
    """
    # The second element of the returned tuple is not needed here.
    results, _ = reviews(
        app_id,
        lang=lang,
        country=country,
        sort=Sort.NEWEST,
        count=count,
    )

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = os.path.join(DATA_DIR, f'{bank_name}_reviews_{timestamp}.csv')

    fieldnames = ['review_text', 'rating', 'date', 'bank_name', 'source']
    # newline='' lets the csv module control line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(
            {
                'review_text': entry['content'],
                'rating': entry['score'],
                # Keep only the calendar day of the review timestamp.
                'date': entry['at'].strftime('%Y-%m-%d'),
                'bank_name': bank_name,
                'source': 'Google Play',
            }
            for entry in results
        )

    # Report the number actually returned, which may differ from `count`.
    print(f"✅ stored {len(results)} reviews to {filename}")
    return filename

def preprocess_reviews(file_path):
    """Load a raw reviews CSV and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. Drop rows whose ``review_text`` repeats an earlier row (the first
         occurrence is kept).
      2. Parse ``date``; values that cannot be parsed become missing rather
         than raising, so a single malformed date no longer aborts the
         cleaning of the whole file.
      3. Drop every row that has a missing value in any column, which now
         includes rows whose date could not be parsed.
      4. Render ``date`` as a ``YYYY-MM-DD`` string.

    Args:
        file_path: Path of the CSV file to read; it must contain at least
            the ``review_text`` and ``date`` columns.

    Returns:
        The cleaned ``pandas.DataFrame``. Row labels are those of the
        surviving input rows (the index is not reset).
    """
    df = pd.read_csv(file_path)

    # Remove duplicates
    df = df.drop_duplicates(subset=['review_text'])

    # Parse dates before the missing-data pass so unparseable values are
    # removed together with the other incomplete rows. `assign` returns a
    # new frame, avoiding chained assignment on a derived DataFrame.
    df = df.assign(date=pd.to_datetime(df['date'], errors='coerce'))

    # Handle missing data
    df = df.dropna()

    # Normalize dates
    df = df.assign(date=df['date'].dt.strftime('%Y-%m-%d'))

    return df

# Main execution
if __name__ == "__main__":
    # Bank label -> app id handed to the Google Play scraper.
    banks = dict(
        CBE="com.combanketh.mobilebanking",
        BOA="com.boa.boaMobileBanking",
        Dashen="com.dashen.dashensuperapp",
    )

    for bank_name in banks:
        app_id = banks[bank_name]

        # Scrape into a timestamped raw CSV, then clean that same file.
        filename = scrape_play_store_reviews(app_id, bank_name)
        cleaned_data = preprocess_reviews(filename)

        # The cleaned file has a fixed name inside DATA_DIR, so each run
        # replaces the previous cleaned output for that bank.
        cleaned_filename = os.path.join(
            DATA_DIR,
            f'cleaned_{bank_name}_reviews.csv',
        )
        cleaned_data.to_csv(cleaned_filename, index=False)
        print(f"✅ Cleaned data saved  {bank_name}: {len(cleaned_data)} records")


✅ stored 400 reviews to notebooks/data\CBE_reviews_20250608_142541.csv
✅ Cleaned data saved  CBE: 321 records
✅ stored 400 reviews to notebooks/data\BOA_reviews_20250608_142542.csv
✅ Cleaned data saved  BOA: 344 records
✅ stored 400 reviews to notebooks/data\Dashen_reviews_20250608_142544.csv
✅ Cleaned data saved  Dashen: 366 records
