# Thematic Analysis for Ethiopian Mobile Banking Reviews

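This notebook applies thematic analysis to mobile banking app reviews: it extracts TF-IDF keywords per bank, maps those keywords to themes, runs NMF and LDA topic modeling, and relates the resulting themes to sentiment, bank, and rating.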

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from pathlib import Path
from wordcloud import WordCloud

# Put the parent of the current working directory on the import path so that
# the local `src` package can be imported from this notebook.
project_root = Path().resolve().parent
sys.path.append(str(project_root))

# Project-local helpers used throughout the thematic analysis
from src.thematic import (
    preprocess_text,
    extract_keywords_tfidf,
    extract_keywords_spacy,
    identify_themes,
    topic_modeling,
    assign_themes_to_reviews,
)

# Plot appearance: ggplot base style first, then seaborn's font scaling and
# whitegrid style applied on top (the order of these calls matters).
plt.style.use('ggplot')
sns.set(font_scale=1.2)
sns.set_style("whitegrid")

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## Load Data

In [None]:
# Start from the processed reviews
df = pd.read_csv('../data/processed/processed_reviews.csv')
print(f"Loaded {len(df)} reviews")

# Prefer the sentiment-annotated file when it exists; otherwise keep the
# processed reviews loaded above.
try:
    sentiment_df = pd.read_csv('../data/task2/sentiment_results.csv')
except FileNotFoundError:
    print("Sentiment results not found, using processed reviews only")
else:
    print(f"Loaded sentiment results for {len(sentiment_df)} reviews")
    # No merge happens here: the sentiment frame replaces the processed frame.
    df = sentiment_df

df.head()

## Text Preprocessing

In [None]:
# Clean every review with the project's preprocess_text helper and store the
# result in a new 'processed_text' column.
df['processed_text'] = df['review_text'].apply(preprocess_text)

# Show up to five before/after examples. Iterating over head(5) instead of
# indexing positions 0-4 avoids an IndexError when fewer than five reviews
# were loaded.
for _, row in df.head(5).iterrows():
    print(f"Original: {row['review_text']}")
    print(f"Processed: {row['processed_text']}")
    print("---")

## Keyword Extraction with TF-IDF

In [None]:
# Per-bank TF-IDF keywords: up to 20 terms each, using 1- to 3-grams
bank_keywords = extract_keywords_tfidf(df, n_keywords=20, ngram_range=(1, 3))

# Print the ten highest-ranked entries for every bank
for bank_name, ranked_terms in bank_keywords.items():
    print(f"\n{bank_name} Top Keywords:")
    top_ten = ranked_terms[:10]
    for term, weight in top_ten:
        print(f"  {term}: {weight:.4f}")

In [None]:
# Draw one word cloud per bank, sizing each term by its TF-IDF score.
for bank, keywords in bank_keywords.items():
    # Keep only positively weighted terms: zero-weight terms cannot be drawn.
    word_freq = {word: score for word, score in keywords if score > 0}

    # WordCloud.generate_from_frequencies rejects an empty mapping, so skip
    # banks without usable keywords instead of aborting the whole cell.
    if not word_freq:
        print(f"{bank}: no keywords available, skipping word cloud")
        continue

    # Generate word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white',
                          max_words=100, colormap='viridis').generate_from_frequencies(word_freq)

    # Plot word cloud
    fig = plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'{bank} - Key Terms Word Cloud')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
    # Release the figure so that looping over many banks does not pile up
    # open figures in memory.
    plt.close(fig)

## Theme Identification

In [None]:
# Derive up to five themes per bank from the extracted keywords
bank_themes = identify_themes(bank_keywords, num_themes=5)

# List each bank's themes together with their scores
for bank_name, scored_themes in bank_themes.items():
    print(f"\n{bank_name} Themes:")
    for theme_name, theme_score in scored_themes:
        print(f"  {theme_name}: {theme_score:.4f}")

In [None]:
# Flatten the {bank: [(theme, score), ...]} mapping into one record per
# (bank, theme) pair so it can be pivoted for plotting.
theme_rows = [
    {'bank': bank, 'theme': theme, 'theme_score': score}
    for bank, themes in bank_themes.items()
    for theme, score in themes
]
theme_df = pd.DataFrame(theme_rows)

# Banks as rows, themes as columns, theme score as the cell value
theme_pivot = theme_df.pivot(index='bank', columns='theme', values='theme_score')

# Heatmap of theme relevance by bank
plt.figure(figsize=(14, 10))
sns.heatmap(theme_pivot, annot=True, cmap='viridis', fmt='.2f')
plt.title('Theme Relevance by Bank')
plt.xlabel('Theme')
plt.ylabel('Bank')
plt.tight_layout()
plt.show()

## Topic Modeling

In [None]:
# Fit a five-topic model with the NMF method
topics_nmf = topic_modeling(df, n_topics=5, method='nmf')

# Print each topic followed by its comma-separated keywords
print("NMF Topics:")
for topic_name, topic_terms in topics_nmf.items():
    joined_terms = ', '.join(topic_terms)
    print(f"\n{topic_name}: {joined_terms}")

In [None]:
# Fit a five-topic model with the LDA method
topics_lda = topic_modeling(df, n_topics=5, method='lda')

# Print each topic followed by its comma-separated keywords
print("LDA Topics:")
for topic_name, topic_terms in topics_lda.items():
    joined_terms = ', '.join(topic_terms)
    print(f"\n{topic_name}: {joined_terms}")

In [None]:
# One word cloud per NMF topic
for topic, keywords in topics_nmf.items():
    # No per-word weights are available here, so rank stands in for weight:
    # the first keyword gets len(keywords), the last one gets 1.
    descending_ranks = range(len(keywords), 0, -1)
    word_freq = dict(zip(keywords, descending_ranks))

    # Build the cloud from the rank-based frequencies
    cloud_builder = WordCloud(width=800, height=400, background_color='white',
                              max_words=50, colormap='plasma')
    wordcloud = cloud_builder.generate_from_frequencies(word_freq)

    # Render it without axes
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'{topic} Word Cloud')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

## Assigning Themes to Reviews

In [None]:
# Assign themes to individual reviews using the per-bank themes identified above.
# Later cells read an 'identified_themes' column from the result; presumably
# assign_themes_to_reviews adds it (confirm in src.thematic).
themed_df = assign_themes_to_reviews(df, bank_themes)
# Preview the first rows of the themed frame
themed_df.head()

In [None]:
# Count how many reviews carry each theme label, largest first
theme_counts = themed_df['identified_themes'].value_counts().sort_values(ascending=False)

# Bar chart of review counts per theme
plt.figure(figsize=(14, 8))
sns.barplot(x=theme_counts.index, y=theme_counts.values)
plt.title('Distribution of Themes in Reviews')
plt.xlabel('Theme')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Theme Analysis by Sentiment

In [None]:
def _plot_sentiment_bars(table, title, ylabel):
    """Draw `table` (themes x sentiment labels) as a stacked bar chart.

    The axes are created explicitly and handed to pandas through `ax=`.
    Calling `plt.figure()` first and then `DataFrame.plot()` without `ax`
    makes pandas open a second figure, which leaves an empty stray figure in
    the output and silently ignores the requested figure size.
    """
    fig, ax = plt.subplots(figsize=(14, 10))
    table.plot(kind='bar', stacked=True, colormap='RdYlGn', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Theme')
    ax.set_ylabel(ylabel)
    ax.legend(title='Sentiment')
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()


# The sentiment breakdown is only possible when sentiment labels were loaded
if 'sentiment_label' in themed_df.columns:
    # Review counts per (theme, sentiment) pair; combinations that never occur become 0
    theme_sentiment = themed_df.groupby(['identified_themes', 'sentiment_label']).size().unstack().fillna(0)

    # Convert each theme's row to percentages so themes of different sizes are comparable
    theme_sentiment_pct = theme_sentiment.div(theme_sentiment.sum(axis=1), axis=0) * 100

    # Order themes by share of positive reviews when that label is present
    if 'positive' in theme_sentiment_pct.columns:
        theme_sentiment_pct = theme_sentiment_pct.sort_values('positive', ascending=False)

    _plot_sentiment_bars(theme_sentiment_pct, 'Sentiment Distribution by Theme', 'Percentage')

    # Show the raw counts as well, in the same theme order as the percentage chart
    theme_sentiment = theme_sentiment.loc[theme_sentiment_pct.index]
    _plot_sentiment_bars(theme_sentiment, 'Sentiment Counts by Theme', 'Count')
else:
    print("Sentiment analysis results not available")

In [None]:
# Review counts per (bank, theme) pair; combinations that never occur become 0
bank_theme_sizes = themed_df.groupby(['bank', 'identified_themes']).size()
theme_bank = bank_theme_sizes.unstack().fillna(0)

# Normalise each bank's row so its themes sum to 100%
reviews_per_bank = theme_bank.sum(axis=1)
theme_bank_pct = theme_bank.div(reviews_per_bank, axis=0) * 100

# Heatmap of theme share by bank
plt.figure(figsize=(16, 12))
sns.heatmap(theme_bank_pct, annot=True, cmap='YlGnBu', fmt='.1f')
plt.title('Theme Distribution by Bank (Percentage)')
plt.ylabel('Bank')
plt.xlabel('Theme')
plt.tight_layout()
plt.show()

## Theme Analysis by Rating

In [None]:
# Review counts per (theme, rating) pair; combinations that never occur become 0
theme_rating = themed_df.groupby(['identified_themes', 'rating']).size().unstack().fillna(0)

# Convert each theme's row to percentages so themes of different sizes are comparable
theme_rating_pct = theme_rating.div(theme_rating.sum(axis=1), axis=0) * 100

# Create the axes explicitly and pass them to pandas. Calling plt.figure()
# followed by DataFrame.plot() without `ax` makes pandas open a second figure,
# leaving an empty stray figure and ignoring the requested figure size.
fig, ax = plt.subplots(figsize=(14, 10))
theme_rating_pct.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
ax.set_title('Rating Distribution by Theme')
ax.set_xlabel('Theme')
ax.set_ylabel('Percentage')
ax.legend(title='Rating')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## Example Reviews by Theme

In [None]:
# Fixed seed so the sampled example reviews are identical on every run
EXAMPLE_SEED = 42


def _format_score(value):
    """Render a sentiment score with four decimals, or 'N/A' if it is missing.

    Applying the `.4f` format directly to a 'N/A' fallback string raises
    ValueError, so the numeric conversion is attempted first.
    """
    try:
        return f"{float(value):.4f}"
    except (TypeError, ValueError):
        return "N/A"


# Whether a per-sentiment breakdown is possible; this is the same for every theme
has_sentiment = 'sentiment_label' in themed_df.columns

# Display example reviews for each theme (missing theme labels are skipped)
for theme in themed_df['identified_themes'].dropna().unique():
    if theme == 'Other':
        continue

    # Get reviews for this theme
    theme_reviews = themed_df[themed_df['identified_themes'] == theme]

    print(f"\n{theme} - Example Reviews:")
    print("=" * 80)

    if has_sentiment:
        # Up to three examples for each sentiment label that occurs in this theme
        for sentiment in ['positive', 'negative', 'neutral']:
            sentiment_reviews = theme_reviews[theme_reviews['sentiment_label'] == sentiment]
            if sentiment_reviews.empty:
                continue

            sample = sentiment_reviews.sample(min(3, len(sentiment_reviews)),
                                              random_state=EXAMPLE_SEED)

            print(f"\n{sentiment.capitalize()} reviews:")
            for _, row in sample.iterrows():
                score_text = _format_score(row.get('sentiment_score'))
                print(f"Bank: {row['bank']} | Rating: {row['rating']} | Score: {score_text}")
                print(f"Review: {row['review_text']}")
                print("---")
    else:
        # Just show up to five examples without a sentiment breakdown
        sample = theme_reviews.sample(min(5, len(theme_reviews)),
                                      random_state=EXAMPLE_SEED)
        for _, row in sample.iterrows():
            print(f"Bank: {row['bank']} | Rating: {row['rating']}")
            print(f"Review: {row['review_text']}")
            print("---")

## Save Thematic Analysis Results

In [None]:
# Make sure the output folder exists before writing anything
output_dir = '../data/task2'
os.makedirs(output_dir, exist_ok=True)

# Per-review results, including the assigned themes
final_results_path = f'{output_dir}/final_results.csv'
themed_df.to_csv(final_results_path, index=False)
print(f"Saved final results to {final_results_path}")

# Per-bank theme scores, flattened to one row per (bank, theme) pair
theme_rows = [
    {'bank': bank, 'theme': theme, 'theme_score': score}
    for bank, themes in bank_themes.items()
    for theme, score in themes
]
theme_df = pd.DataFrame(theme_rows)
bank_themes_path = f'{output_dir}/bank_themes.csv'
theme_df.to_csv(bank_themes_path, index=False)
print(f"Saved bank themes to {bank_themes_path}")

## Thematic Analysis Conclusion

Our thematic analysis revealed several key insights:

1. The most common themes across Ethiopian mobile banking reviews are:
   - Account Access Issues
   - Transaction Performance
   - App Performance & Reliability
   - User Interface & Experience
   - Customer Support

2. Different banks have different strength areas and pain points:
   - Some banks excel in transaction performance but struggle with account access
   - Others have strong user interfaces but face reliability issues
   - Customer support appears as a significant theme for several banks

3. Theme and sentiment analysis together reveal:
   - Account Access Issues generally receive negative sentiment
   - User Interface & Experience themes have mixed sentiment
   - Transaction Performance varies significantly by bank

4. Low ratings (1-2 stars) are strongly associated with:
   - App Performance & Reliability issues
   - Account Access Issues
   - Failed transactions

5. High ratings (4-5 stars) are associated with:
   - Positive User Interface & Experience
   - Good transaction performance
   - Useful features and functionality

These insights can help banks prioritize improvements to their mobile applications to better meet customer needs and expectations.