In [1]:
import os
import sys

# Resolve the directory one level above the current working directory and
# treat it as the project root (assumes the notebook is launched from a
# direct sub-folder of the project - TODO confirm).
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)

# Register the project root on the import path exactly once, so re-running
# this cell does not pile up duplicate entries.
root_already_registered = project_root in sys.path
if not root_already_registered:
    sys.path.append(project_root)

# Report which directory is being used as the project root
print("Project root added to sys.path:", project_root)
# Import pandas and the project's analysis / preprocessing helpers

import pandas as pd
from src.analyzer import run_analysis_pipeline
from src.preprocess import save_clean_data
from src.analyzer import aggregate_data

Project root added to sys.path: d:\week-2\fintech-app-sentiment-2024


  from .autonotebook import tqdm as notebook_tqdm


In [None]:


# Load the cleaned reviews, run the analysis pipeline on them, preview the
# result and persist it under data/processed.

# Define the path to the clean data from Task 1
clean_data_path = os.path.join(project_root, 'data', 'raw', 'bank_reviews_clean.csv')

# Reset the result up front: if loading or analysis fails on a re-run, the
# following cells must not silently pick up a stale `analyzed_df` left in the
# kernel by an earlier execution.
analyzed_df = pd.DataFrame()

# Load the clean data
try:
    clean_df = pd.read_csv(clean_data_path)
    print(f"Successfully loaded {len(clean_df)} reviews from {clean_data_path}")
except FileNotFoundError:
    print(f"Error: The file '{clean_data_path}' was not found. Please run Task 1 first.")
    clean_df = pd.DataFrame()
except pd.errors.EmptyDataError:
    # read_csv raises this for a file with no data/header (e.g. zero bytes);
    # treat it like a missing file instead of crashing the cell.
    print(f"Error: The file '{clean_data_path}' is empty. Please run Task 1 first.")
    clean_df = pd.DataFrame()

# Run the analysis pipeline (skipped entirely when nothing was loaded)
if not clean_df.empty:
    print("\n--- Starting Sentiment and Thematic Analysis ---")
    # run_analysis_pipeline comes from src.analyzer; presumably it returns the
    # input reviews with sentiment / theme columns added - verify against the
    # module.
    analyzed_df = run_analysis_pipeline(clean_df)

    # Display a preview of the analyzed data
    print("\n--- Analyzed Data Preview ---")
    display(analyzed_df.head())
    print(f"Total reviews analyzed: {len(analyzed_df)}")

    # Save the final analyzed data to the processed folder
    processed_dir = os.path.join(project_root, 'data', 'processed')
    os.makedirs(processed_dir, exist_ok=True)

    # The Task 1 saver from src.preprocess is reused here to write the
    # analyzed frame.
    save_path = os.path.join(processed_dir, 'analyzed_reviews.csv')
    save_clean_data(analyzed_df, save_path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading spaCy model 'en_core_web_sm'...
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Successfully loaded 1200 reviews from d:\week-2\fintech-app-sentiment-2024\data\raw\bank_reviews_clean.csv

--- Starting Sentiment and Thematic Analysis ---


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu



--- Analyzed Data Preview ---


Unnamed: 0,review,rating,date,bank,source,sentiment_label,sentiment_score,identified_themes
0,good,5,2025-08-17,CBE,Google Play,positive,0.999816,Other
1,this app is the good apps,5,2025-08-17,CBE,Google Play,positive,0.999853,Other
2,bayeegar,5,2025-08-15,CBE,Google Play,positive,0.749315,Other
3,አሪፋ ነዉ ነገር ግን Recent Transaction በጣም የቆየዉን ነዉ ...,2,2025-08-15,CBE,Google Play,negative,0.952693,Transaction Performance
4,congra ethio,5,2025-08-15,CBE,Google Play,positive,0.980097,Other


Total reviews analyzed: 1200
Cleaned data saved to d:\week-2\fintech-app-sentiment-2024\data\processed\analyzed_reviews.csv


In [None]:
# Cell 3: Data Aggregation and Visualization

import matplotlib.pyplot as plt
import seaborn as sns

# Aggregate the analyzed reviews and chart the results. Nothing is plotted
# unless the previous cell left a non-empty `analyzed_df` in the namespace.
if 'analyzed_df' not in locals() or analyzed_df.empty:
    print("No analyzed data found. Please run the previous cell first.")
else:
    print("\n--- Aggregating and Visualizing Data ---")

    # aggregate_data (src.analyzer) yields two tables: one for sentiment and
    # one for themes. The plots below rely on the columns rating /
    # mean_sentiment / bank and count / theme / bank respectively.
    sentiment_agg, theme_agg = aggregate_data(analyzed_df)

    # Show both aggregated tables before charting them
    print("\nMean Sentiment Score by Bank and Rating:")
    display(sentiment_agg)

    print("\nTop Themes by Bank:")
    display(theme_agg)

    # Figure 1: mean sentiment per star rating, one bar colour per bank
    fig_sentiment, ax_sentiment = plt.subplots(figsize=(10, 6))
    sns.barplot(
        data=sentiment_agg,
        x='rating',
        y='mean_sentiment',
        hue='bank',
        ax=ax_sentiment,
    )
    ax_sentiment.set_title('Mean Sentiment Score by Bank and Rating')
    ax_sentiment.set_xlabel('Rating (Stars)')
    ax_sentiment.set_ylabel('Mean Sentiment Score (-1 to 1)')
    plt.show()

    # Figure 2: horizontal bars of theme frequency, one bar colour per bank
    fig_theme, ax_theme = plt.subplots(figsize=(14, 8))
    sns.barplot(
        data=theme_agg,
        x='count',
        y='theme',
        hue='bank',
        orient='h',
        ax=ax_theme,
    )
    ax_theme.set_title('Frequency of Identified Themes by Bank')
    ax_theme.set_xlabel('Count of Reviews')
    ax_theme.set_ylabel('Identified Theme')
    plt.show()