# Setup and Imports

In [1]:
# Cell 1: Install necessary libraries (run only once if not installed)
# !pip install google-play-scraper pandas scikit-learn transformers torch spacy matplotlib seaborn sqlalchemy cx_Oracle psycopg2-binary ipywidgets widgetsnbextension
# !python -m spacy download en_core_web_sm
# !jupyter nbextension enable --py widgetsnbextension # For interactive elements if you use them

import pandas as pd
import os
import logging
from datetime import datetime

# Configure logging for the notebook
# This ensures logs go to a file and to the console output in Jupyter
log_file_path = 'pipeline_jupyter.log'
if os.path.exists(log_file_path):
    os.remove(log_file_path) # Clear previous log for a fresh run

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[logging.FileHandler(log_file_path), logging.StreamHandler()])

logging.info("Jupyter Notebook setup complete.")


2025-06-07 17:02:27,202 - INFO - Jupyter Notebook setup complete.


In [2]:
# Import modules from your 'src' and 'config' directories
# Add your project's root directory to the Python path
# This allows importing modules from 'src' and 'config'
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.data_collection import collect_all_bank_reviews
from src.preprocessing import preprocess_reviews, tokenize_and_lemmatize # Added tokenize_and_lemmatize here for consistency
from src.sentiment_analysis import add_sentiment_scores
from src.thematic_analysis import extract_keywords_tfidf, assign_themes
from src.database_manager import DatabaseManager
from src.insights_generator import (
    generate_sentiment_summary,
    plot_sentiment_distribution,
    plot_sentiment_by_bank,
    plot_themes_by_bank,
    generate_recommendations
)
from config.app_config import BANK_APPS, TARGET_REVIEWS_PER_BANK, DB_CONFIG

logging.info("All custom modules and configurations imported.")

2025-06-07 17:02:56,581 - ERROR - Error loading SpaCy model in preprocessing: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.. Please run 'python -m spacy download en_core_web_sm'
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

2025-06-07 17:04:30,027 - ERROR - Error loading sentiment analysis model: No module named 'torch'
2025-06-07 17:04:30,410 - ERROR - Error loading SpaCy model in thematic analysis: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.. Please ensure 'python -m spacy download en_core_web_sm' has been run.
2025-06-07 17:04:39,255 - INFO - All custom modules and configurations imported.


# Configuration Review

In [3]:
#Display configuration
logging.info("--- Configuration Details ---")
logging.info(f"Banks to scrape: {list(BANK_APPS.keys())}")
logging.info(f"Target reviews per bank: {TARGET_REVIEWS_PER_BANK}")
logging.info(f"Database type: {DB_CONFIG.get('DB_TYPE', 'N/A')}")
logging.info(f"Database host: {DB_CONFIG.get('DB_HOST', 'N/A')}")
logging.info("-" * 30)

2025-06-07 17:06:25,968 - INFO - --- Configuration Details ---
2025-06-07 17:06:25,969 - INFO - Banks to scrape: ['CBE', 'BOA', 'DashenBank']
2025-06-07 17:06:25,971 - INFO - Target reviews per bank: 400
2025-06-07 17:06:25,974 - INFO - Database type: postgresql
2025-06-07 17:06:25,976 - INFO - Database host: localhost
2025-06-07 17:06:25,978 - INFO - ------------------------------


# Data Collection and Preprocessing

In [5]:
#Data Collection
logging.info("\n--- Task 1: Data Collection ---")
raw_reviews_df = collect_all_bank_reviews()

if not raw_reviews_df.empty:
    os.makedirs('data/raw_reviews', exist_ok=True)
    raw_reviews_df.to_csv('data/raw_reviews/raw_bank_reviews.csv', index=False)
    logging.info(f"Raw reviews collected and saved: {len(raw_reviews_df)} rows.")
    print("Raw Reviews DataFrame Head:")
    display(raw_reviews_df.head())
else:
    logging.error("Failed to collect raw reviews.")


2025-06-07 17:08:09,302 - INFO - 
--- Task 1: Data Collection ---
2025-06-07 17:08:09,307 - INFO - Attempting to scrape reviews for CBE (App ID: com.combanketh.mobilebanking)...
2025-06-07 17:08:15,635 - INFO - Successfully scraped 400 reviews for CBE.
2025-06-07 17:08:20,640 - INFO - Attempting to scrape reviews for BOA (App ID: com.boa.boaMobileBanking)...
2025-06-07 17:08:21,935 - INFO - Successfully scraped 400 reviews for BOA.
2025-06-07 17:08:26,943 - INFO - Attempting to scrape reviews for DashenBank (App ID: com.dashen.dashensuperapp)...
2025-06-07 17:08:28,001 - INFO - Successfully scraped 400 reviews for DashenBank.
2025-06-07 17:08:33,005 - INFO - Total raw reviews collected across all banks: 1200
2025-06-07 17:08:33,021 - INFO - Raw reviews collected and saved: 1200 rows.


Raw Reviews DataFrame Head:


Unnamed: 0,content,score,at,userName,appVersion,reviewId,bank,source
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06 09:54:11,A Google user,,be2cb2ac-bbe0-4175-81c4-9f6c86afdaaa,CBE,Google Play Store
1,what is this app problem???,1,2025-06-05 22:16:56,A Google user,5.1.0,8efd71e9-59cd-41ce-8c5c-12052dee9ad0,CBE,Google Play Store
2,the app is proactive and a good connections.,5,2025-06-05 15:55:10,A Google user,5.1.0,b12d0383-9b27-4e49-a94d-277a43b15800,CBE,Google Play Store
3,I cannot send to cbebirr app. through this app.,3,2025-06-05 11:12:49,A Google user,,dd9f9e37-177a-46df-b877-d0edaa9aed29,CBE,Google Play Store
4,good,4,2025-06-05 10:21:59,A Google user,,8e34703c-203c-4180-8b32-bfd0b3f0c871,CBE,Google Play Store


In [6]:
# Data Preprocessing
logging.info("\n--- Task 1: Data Preprocessing ---")
if not raw_reviews_df.empty:
    processed_reviews_df = preprocess_reviews(raw_reviews_df.copy()) # Use a copy to avoid SettingWithCopyWarning
    
    os.makedirs('data/processed_reviews', exist_ok=True)
    processed_reviews_df.to_csv('data/processed_reviews/processed_bank_reviews.csv', index=False)
    logging.info(f"Processed reviews saved: {len(processed_reviews_df)} rows.")
    print("\nProcessed Reviews DataFrame Head:")
    display(processed_reviews_df.head())
    print("\nProcessed Reviews DataFrame Info:")
    processed_reviews_df.info()
else:
    logging.error("No raw data to preprocess.")
    processed_reviews_df = pd.DataFrame() # Ensure processed_reviews_df is defined

2025-06-07 17:08:41,821 - INFO - 
--- Task 1: Data Preprocessing ---
2025-06-07 17:08:41,824 - INFO - Starting preprocessing for 1200 reviews...
2025-06-07 17:08:41,828 - INFO - Removed 0 rows with missing review content. 1200 rows remaining.
2025-06-07 17:08:41,831 - INFO - Removed duplicates. 1200 rows remaining.
2025-06-07 17:08:41,844 - INFO - Dates normalized to YYYY-MM-DD format.
2025-06-07 17:08:41,877 - INFO - Missing/empty review_text after preprocessing: 4.17%
2025-06-07 17:08:41,877 - INFO - KPI met: Less than 5% missing or empty review text after preprocessing.
2025-06-07 17:08:41,877 - INFO - Preprocessing complete. Final DataFrame shape: (1200, 9)
2025-06-07 17:08:41,911 - INFO - Processed reviews saved: 1200 rows.



Processed Reviews DataFrame Head:


Unnamed: 0,reviewId,review_text,processed_text,rating,date,bank,source,userName,appVersion
0,be2cb2ac-bbe0-4175-81c4-9f6c86afdaaa,"""Why dont your ATMs support account-to-account...",,4,2025-06-06,CBE,Google Play Store,A Google user,
1,8efd71e9-59cd-41ce-8c5c-12052dee9ad0,what is this app problem???,,1,2025-06-05,CBE,Google Play Store,A Google user,5.1.0
2,b12d0383-9b27-4e49-a94d-277a43b15800,the app is proactive and a good connections.,,5,2025-06-05,CBE,Google Play Store,A Google user,5.1.0
3,dd9f9e37-177a-46df-b877-d0edaa9aed29,I cannot send to cbebirr app. through this app.,,3,2025-06-05,CBE,Google Play Store,A Google user,
4,8e34703c-203c-4180-8b32-bfd0b3f0c871,good,,4,2025-06-05,CBE,Google Play Store,A Google user,



Processed Reviews DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewId        1200 non-null   object
 1   review_text     1200 non-null   object
 2   processed_text  1200 non-null   object
 3   rating          1200 non-null   int64 
 4   date            1200 non-null   object
 5   bank            1200 non-null   object
 6   source          1200 non-null   object
 7   userName        1200 non-null   object
 8   appVersion      918 non-null    object
dtypes: int64(1), object(8)
memory usage: 84.5+ KB


# Sentiment and Thematic Analysis

In [7]:
#Sentiment Analysis
logging.info("\n--- Task 2: Sentiment Analysis ---")
if not processed_reviews_df.empty:
    sentiment_analyzed_df = add_sentiment_scores(processed_reviews_df.copy())
    logging.info("Sentiment analysis complete.")
    print("\nReviews with Sentiment Scores:")
    display(sentiment_analyzed_df[['review_text', 'sentiment_label', 'sentiment_score']].head())
else:
    logging.error("No processed data for sentiment analysis.")
    sentiment_analyzed_df = pd.DataFrame() # Ensure sentiment_analyzed_df is defined

2025-06-07 17:09:45,269 - INFO - 
--- Task 2: Sentiment Analysis ---
2025-06-07 17:09:45,272 - INFO - Starting sentiment analysis for 1200 reviews...
2025-06-07 17:09:45,290 - INFO - Sentiment analysis complete. 0.00% of reviews processed (non-default sentiment).
2025-06-07 17:09:45,294 - INFO - Sentiment analysis complete.



Reviews with Sentiment Scores:


Unnamed: 0,review_text,sentiment_label,sentiment_score
0,"""Why dont your ATMs support account-to-account...",NEUTRAL,0.5
1,what is this app problem???,NEUTRAL,0.5
2,the app is proactive and a good connections.,NEUTRAL,0.5
3,I cannot send to cbebirr app. through this app.,NEUTRAL,0.5
4,good,NEUTRAL,0.5


In [8]:
# Prepare text for Thematic Analysis (Tokenization & Lemmatization)
logging.info("Preparing text for Thematic Analysis (Tokenization & Lemmatization)...")
if not sentiment_analyzed_df.empty:
    sentiment_analyzed_df['processed_text'] = sentiment_analyzed_df['review_text'].apply(tokenize_and_lemmatize)
    print("\nReviews with Processed Text for Thematic Analysis:")
    display(sentiment_analyzed_df[['review_text', 'processed_text']].head())
else:
    logging.error("No data for thematic analysis text processing.")

2025-06-07 17:10:05,743 - INFO - Preparing text for Thematic Analysis (Tokenization & Lemmatization)...



Reviews with Processed Text for Thematic Analysis:


Unnamed: 0,review_text,processed_text
0,"""Why dont your ATMs support account-to-account...",
1,what is this app problem???,
2,the app is proactive and a good connections.,
3,I cannot send to cbebirr app. through this app.,
4,good,


In [9]:
#Thematic Analysis - Keyword Extraction and Theme Assignment
logging.info("\n--- Task 2: Thematic Analysis ---")
if not sentiment_analyzed_df.empty:
    # Extract keywords
    top_keywords = extract_keywords_tfidf(sentiment_analyzed_df, text_column='processed_text', top_n=50)
    logging.info(f"Top 10 Keywords extracted: {top_keywords[:10]}")

    # Assign themes
    final_analysis_df = assign_themes(sentiment_analyzed_df.copy(), top_keywords, text_column='review_text') # Use original text for theme matching

    logging.info("Thematic analysis complete.")
    print("\nReviews with Identified Themes:")
    display(final_analysis_df[['review_text', 'identified_themes']].head())
else:
    logging.error("No data for thematic analysis.")
    final_analysis_df = pd.DataFrame() # Ensure final_analysis_df is defined

2025-06-07 17:10:22,791 - INFO - 
--- Task 2: Thematic Analysis ---
2025-06-07 17:10:22,792 - INFO - Extracting keywords using TF-IDF...
2025-06-07 17:10:22,886 - INFO - Top 10 Keywords extracted: []
2025-06-07 17:10:22,891 - INFO - Assigning themes to reviews...
2025-06-07 17:10:23,036 - INFO - Theme assignment complete.
2025-06-07 17:10:23,036 - INFO - Thematic analysis complete.



Reviews with Identified Themes:


Unnamed: 0,review_text,identified_themes
0,"""Why dont your ATMs support account-to-account...","Transaction Performance, Customer Support"
1,what is this app problem???,Other
2,the app is proactive and a good connections.,Other
3,I cannot send to cbebirr app. through this app.,Other
4,good,Other


# Store Cleaned Data in Oracle

In [10]:
#Store Cleaned Data in Oracle (or PostgreSQL)
logging.info("\n--- Task 3: Storing Data in Database ---")
if not final_analysis_df.empty:
    db_manager = DatabaseManager(DB_CONFIG)
    if db_manager.engine:
        db_manager.create_tables()
        db_manager.insert_banks_data(final_analysis_df) # Insert unique bank names
        db_manager.insert_reviews_data(final_analysis_df) # Insert review data

        logging.info("Data stored in database successfully.")
        
        # Optional: Verify by reading back from DB
        data_from_db = db_manager.read_reviews_from_db()
        print("\nData read from Database Head:")
        display(data_from_db.head())
        print(f"Total rows read from DB: {len(data_from_db)}")
        # Use data_from_db for subsequent insights if successful, otherwise use final_analysis_df
        data_for_insights = data_from_db if not data_from_db.empty else final_analysis_df
    else:
        logging.error("Database connection failed. Skipping database operations.")
        data_for_insights = final_analysis_df # Fallback to DataFrame if DB fails
else:
    logging.error("No data available to store in the database.")
    data_for_insights = pd.DataFrame() # Ensure data_for_insights is defined

2025-06-07 17:11:24,004 - INFO - 
--- Task 3: Storing Data in Database ---
2025-06-07 17:11:24,006 - ERROR - An unexpected error occurred during postgresql connection: 'DB_NAME'
2025-06-07 17:11:24,009 - ERROR - Database connection failed. Skipping database operations.


# Insights and Recommendations

In [11]:
# Generate Insights and Recommendations
logging.info("\n--- Task 4: Generating Insights and Recommendations ---")

if not data_for_insights.empty:
    os.makedirs('reports', exist_ok=True)

    # Sentiment Summary
    sentiment_summary_df = generate_sentiment_summary(data_for_insights)
    if not sentiment_summary_df.empty:
        logging.info("\nSentiment Summary (aggregated by bank and sentiment):\n" + sentiment_summary_df.to_string())
        print("\nSentiment Summary (aggregated by bank and sentiment):")
        display(sentiment_summary_df)

    # Plot Sentiment Distribution
    plot_sentiment_distribution(data_for_insights, save_path='reports/sentiment_distribution.png')
    print("\nSentiment Distribution Plot saved to reports/sentiment_distribution.png")

    # Plot Sentiment by Bank
    plot_sentiment_by_bank(data_for_insights, save_path='reports/sentiment_by_bank.png')
    print("Average Sentiment by Bank Plot saved to reports/sentiment_by_bank.png")

    # Plot Themes by Bank
    plot_themes_by_bank(data_for_insights, save_path='reports/themes_by_bank.png')
    print("Prevalence of Themes by Bank Plot saved to reports/themes_by_bank.png")

    # Generate Recommendations
    recommendations_dict = generate_recommendations(data_for_insights)
    logging.info("\nActionable Recommendations:")
    print("\nActionable Recommendations:")
    with open('reports/recommendations.txt', 'w') as f:
        for bank, rec_text in recommendations_dict.items():
            logging.info(rec_text)
            print(rec_text)
            f.write(rec_text + "\n\n")
    logging.info("Recommendations saved to reports/recommendations.txt")

else:
    logging.error("No data available for generating insights and recommendations.")

logging.info("\n--- Jupyter Notebook Pipeline Execution Complete ---")

2025-06-07 17:12:17,378 - INFO - 
--- Task 4: Generating Insights and Recommendations ---
2025-06-07 17:12:17,684 - INFO - Generated sentiment summary per bank.
2025-06-07 17:12:17,698 - INFO - 
Sentiment Summary (aggregated by bank and sentiment):
            mean_POSITIVE  mean_NEUTRAL  mean_NEGATIVE  count_POSITIVE  count_NEUTRAL  count_NEGATIVE
bank                                                                                                 
BOA                   NaN           0.5            NaN               0            400               0
CBE                   NaN           0.5            NaN               0            400               0
DashenBank            NaN           0.5            NaN               0            400               0



Sentiment Summary (aggregated by bank and sentiment):


Unnamed: 0_level_0,mean_POSITIVE,mean_NEUTRAL,mean_NEGATIVE,count_POSITIVE,count_NEUTRAL,count_NEGATIVE
bank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BOA,,0.5,,0,400,0
CBE,,0.5,,0,400,0
DashenBank,,0.5,,0,400,0


2025-06-07 17:12:18,271 - INFO - Sentiment distribution plot saved to reports/sentiment_distribution.png



Sentiment Distribution Plot saved to reports/sentiment_distribution.png


2025-06-07 17:12:18,682 - INFO - Sentiment by bank plot saved to reports/sentiment_by_bank.png


Average Sentiment by Bank Plot saved to reports/sentiment_by_bank.png


2025-06-07 17:12:19,689 - INFO - Themes by bank plot saved to reports/themes_by_bank.png
2025-06-07 17:12:19,689 - INFO - Generating actionable recommendations...
2025-06-07 17:12:19,709 - INFO - Recommendations generated.
2025-06-07 17:12:19,709 - INFO - 
Actionable Recommendations:
2025-06-07 17:12:19,712 - INFO - --- Recommendations for CBE ---
Overall Sentiment: 0.0% Positive, 0.0% Negative Reviews.

Few to no negative reviews, focus on maintaining quality.

General Recommendation:
- Implement a continuous feedback loop: regularly scrape and analyze reviews to identify emerging trends.
- Prioritize development efforts based on the severity and frequency of reported pain points.
- Engage with users who leave critical reviews to understand their issues better and demonstrate responsiveness.

2025-06-07 17:12:19,713 - INFO - --- Recommendations for BOA ---
Overall Sentiment: 0.0% Positive, 0.0% Negative Reviews.

Few to no negative reviews, focus on maintaining quality.

General Recom

Prevalence of Themes by Bank Plot saved to reports/themes_by_bank.png

Actionable Recommendations:
--- Recommendations for CBE ---
Overall Sentiment: 0.0% Positive, 0.0% Negative Reviews.

Few to no negative reviews, focus on maintaining quality.

General Recommendation:
- Implement a continuous feedback loop: regularly scrape and analyze reviews to identify emerging trends.
- Prioritize development efforts based on the severity and frequency of reported pain points.
- Engage with users who leave critical reviews to understand their issues better and demonstrate responsiveness.

--- Recommendations for BOA ---
Overall Sentiment: 0.0% Positive, 0.0% Negative Reviews.

Few to no negative reviews, focus on maintaining quality.

General Recommendation:
- Implement a continuous feedback loop: regularly scrape and analyze reviews to identify emerging trends.
- Prioritize development efforts based on the severity and frequency of reported pain points.
- Engage with users who leave critical re