In [None]:
# Import necessary libraries
from datetime import datetime
from pathlib import Path

import pandas as pd
from google_play_scraper import reviews, Sort

# Destination CSV for the scraped reviews (relative path)
output_csv = "data/raw_reviews.csv"


In [None]:
# Map each bank's short name to the Google Play package id of its app.
# NOTE: the ids below are placeholders -- replace with the correct package names.
apps = dict(
    CBE="com.commercialbank.cbe",
    BOA="com.bankofafrica.eba",
    Dashen="com.dashenbank.mobile",
)

# Flat list of review records accumulated across every bank.
all_reviews = []

print("Starting to scrape reviews...")

for bank, package_name in apps.items():
    # NOTE(review): the message says "400+" while the request below asks for
    # count=400 -- confirm which number is intended.
    print(f"Scraping 400+ reviews for {bank}...")

    # reviews() hands back a pair; only the first element (the review
    # entries) is used here, the second one is ignored.
    result, _ = reviews(
        package_name,
        lang='en',        # Use 'am' for Amharic if needed and supported
        country='et',
        sort=Sort.NEWEST,
        count=400,
    )

    # Keep only the fields used downstream and tag each record with its bank.
    all_reviews.extend(
        {
            "review": r['content'],
            "rating": r['score'],
            "date": r['at'],
            "bank": bank,
            "source": "Google Play",
        }
        for r in result
    )

print(f"Scraped {len(all_reviews)} reviews in total.")


In [None]:
# Build the cleaned review table from the scraped records.
df = (
    # The column list is passed explicitly so the expected schema exists even
    # when nothing was scraped; otherwise the steps below would raise a
    # KeyError on an empty, column-less frame.
    pd.DataFrame(
        all_reviews,
        columns=["review", "rating", "date", "bank", "source"],
    )
    # Drop rows with missing critical data first, so an incomplete row can
    # never be the copy that survives de-duplication.
    .dropna(subset=["review", "rating", "date"])
    # Drop repeated review texts within a bank. The bank is part of the key
    # because the same short text (e.g. "Good app") posted for two different
    # banks is two distinct reviews, not a duplicate.
    .drop_duplicates(subset=["bank", "review"])
    # Normalize dates to YYYY-MM-DD format. pd.to_datetime guarantees a
    # datetime-typed column (an empty or object-typed column has no .dt).
    .assign(
        date=lambda frame: pd.to_datetime(frame["date"]).dt.strftime("%Y-%m-%d")
    )
)

print(f"Data shape after cleaning: {df.shape}")
print("Sample data preview:")
df.head()


In [None]:
# Save the cleaned DataFrame to CSV.
# to_csv does not create missing directories, so make sure the target folder
# exists first; exist_ok=True keeps re-runs of this cell harmless.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)
print(f"Cleaned data saved to '{output_csv}'")


In [None]:
# Summarize the in-memory cleaned frame: row count, per-column share of
# missing values (in percent), and the number of reviews per bank.
print("Summary of Task 1 Data Collection & Preprocessing:")
print(f"Total reviews collected (after cleaning): {len(df)}")

missing_data_percentage = df.isna().mean().mul(100)
print("\nMissing Data Percentages:", missing_data_percentage, sep="\n")

print("\nReviews per bank:", df['bank'].value_counts(), sep="\n")


In [None]:

# Reload the cleaned reviews from disk and repeat the summary on that file.
# NOTE(review): this path differs from `output_csv`, the file written earlier
# in this notebook -- confirm which step produces it.
df = pd.read_csv("data/cleaned/combined_cleaned_reviews.csv")

# Row count and per-column share of missing values, in percent.
total_reviews = df.shape[0]
missing = df.isna().mean().mul(100)

print(f"Total Reviews: {total_reviews}")
print("Missing Data Percentage per Column:", missing, sep="\n")

# Number of reviews contributed by each bank.
print("\nReviews per Bank:", df['bank'].value_counts(), sep="\n")


## Next Steps

- Use this cleaned dataset for sentiment and thematic analysis.
- Implement further preprocessing if necessary.
- Document the methodology in README.md.
- Commit scripts and notebook with meaningful messages.
