In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os

# Global plotting defaults for every figure in this notebook:
# apply seaborn's "whitegrid" theme first, then set the default figure size to 10x6 inches.
sns.set(style="whitegrid")
plt.rc("figure", figsize=(10, 6))

In [None]:
# Load all annotated reviews: read every CSV under ../data/annotated and
# stack them into a single DataFrame named `df`, which the later cells use.
# glob.glob returns matches in arbitrary order, so sort the paths to keep the
# concatenated row order reproducible between runs and machines.
files = sorted(glob.glob('../data/annotated/*.csv'))

# Stop here with a clear error when nothing matched. Only printing a message
# would leave `df` undefined (or stale from an earlier kernel run), so the
# later cells would fail with a NameError or silently plot old data.
if not files:
    raise FileNotFoundError(
        "No data found. Please run the scraping and sentiment scripts first."
    )

dfs = [pd.read_csv(f) for f in files]
df = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(df)} reviews.")

# Trailing expression so the notebook renders the preview as a rich table.
df.head()

In [None]:
# Rating Distribution by Bank
# Grouped count plot: rows of `df` counted per `rating` value, one colour per `bank`.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='rating', hue='bank', data=df, palette='viridis', ax=ax)
ax.set(
    title='Rating Distribution by Bank',
    xlabel='Rating (Stars)',
    ylabel='Count',
)
ax.legend(title='Bank')
plt.show()

In [None]:
# Sentiment Distribution by Bank
# Grouped count plot of `sentiment_label`, one colour per `bank`.
# The x categories are drawn in the fixed order listed below.
sentiment_order = ['POSITIVE', 'NEUTRAL', 'NEGATIVE']
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=df,
    x='sentiment_label',
    hue='bank',
    order=sentiment_order,
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Sentiment Distribution by Bank')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.legend(title='Bank')
plt.show()

In [None]:
# Average Sentiment Score by Bank
# One bar per bank showing the mean of `sentiment_score` over that bank's rows.
avg_sentiment = df.groupby('bank')['sentiment_score'].mean().reset_index()

fig, ax = plt.subplots(figsize=(8, 5))
# seaborn 0.13 deprecates passing `palette` without `hue`, so map `hue` to the
# x variable to keep the per-bar colours. dodge=False keeps the bars full-width
# and centred on releases that would otherwise dodge them by hue level.
sns.barplot(
    data=avg_sentiment,
    x='bank',
    y='sentiment_score',
    hue='bank',
    palette='magma',
    dodge=False,
    ax=ax,
)
# A hue legend here would only repeat the x tick labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title('Average Sentiment Score by Bank')
ax.set_xlabel('Bank')
ax.set_ylabel('Average Compound Score')
plt.show()

In [None]:
from collections import Counter

# Aggregate keywords per bank
def get_top_keywords(bank_name, top_n=10, data=None):
    """Return the most frequent keywords for one bank.

    Each non-null string in the ``keywords`` column is split on commas;
    surrounding whitespace is stripped and empty pieces are discarded.
    Non-string cells are ignored.

    Args:
        bank_name: Value compared for equality against the ``bank`` column.
        top_n: Maximum number of (keyword, count) pairs to return.
        data: DataFrame with ``bank`` and ``keywords`` columns. When omitted,
            the notebook-level ``df`` is used, as before.

    Returns:
        A list of ``(keyword, count)`` tuples ordered from most to least
        frequent; empty when the bank has no usable keywords.
    """
    reviews = df if data is None else data
    keyword_cells = reviews.loc[reviews['bank'] == bank_name, 'keywords'].dropna()

    keyword_counts = Counter()
    for cell in keyword_cells:
        if not isinstance(cell, str):
            continue
        keyword_counts.update(
            token.strip() for token in cell.split(',') if token.strip()
        )

    return keyword_counts.most_common(top_n)

# Draw one horizontal bar chart of the top keywords for every distinct bank in `df`.
banks = df['bank'].unique()
for bank in banks:
    top = get_top_keywords(bank)
    if not top:
        # Nothing to plot for this bank; skip instead of drawing an empty chart.
        continue
    # Split the (keyword, count) pairs into two parallel lists.
    words, counts = (list(column) for column in zip(*top))

    fig, ax = plt.subplots(figsize=(10, 5))
    # seaborn 0.13 deprecates passing `palette` without `hue`, so map `hue` to
    # the keyword axis to keep the per-bar colours. dodge=False keeps the bars
    # full-width on releases that would otherwise dodge them by hue level.
    sns.barplot(
        x=counts,
        y=words,
        hue=words,
        palette='Blues_d',
        dodge=False,
        ax=ax,
    )
    # A hue legend here would only repeat the y tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(f'Top Keywords for {bank}')
    ax.set_xlabel('Frequency')
    ax.set_ylabel('Keyword')
    plt.show()
    # Release the figure so one open figure per bank does not accumulate.
    plt.close(fig)