# Define Imports

In [1]:
import pandas as pd
from transformers import pipeline
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


# Class Declarations

## Sentiment Analyzer Class

In [2]:
class SentimentAnalyzer:
    """
    A class to perform sentiment analysis using a Hugging Face model.
    """
    def __init__(self, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
        """
        Initializes the sentiment analysis pipeline.
        The challenge recommends this model.
        """
        print(f"Loading sentiment model: {model_name}...")
        # Using device=0 will use GPU if available, -1 for CPU
        self.sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, device=-1)
        print("Model loaded successfully.")

    def analyze(self, df, text_column='review'):
        """
        Applies sentiment analysis to a DataFrame column.
        Args:
            df (pd.DataFrame): The input DataFrame.
            text_column (str): The column containing text to analyze.
        Returns:
            pd.DataFrame: The DataFrame with added 'sentiment_label' and 'sentiment_score' columns.
        """
        print("Applying sentiment analysis...")
        # The pipeline returns a list of dictionaries like {'label': 'POSITIVE', 'score': 0.999}
        # We need to handle potential long texts by truncating them for the model
        sentiments = self.sentiment_pipeline(df[text_column].fillna('').tolist(), truncation=True)
        
        # Extract labels and scores
        df['sentiment_label'] = [s['label'] for s in sentiments]
        df['sentiment_score'] = [s['score'] for s in sentiments]
        print("Sentiment analysis complete.")
        return df

## ThematicAnalyzer

In [3]:
class ThematicAnalyzer:
    """
    A class to perform thematic analysis using NLP techniques.
    """
    def __init__(self, stop_words='english'):
        self.vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 3), stop_words=stop_words)

    def preprocess_text(self, text):
        """
        Lemmatizes text and removes stopwords and punctuation.
        """
        doc = nlp(text.lower())
        lemmas = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.is_alpha]
        return " ".join(lemmas)
    
    def extract_keywords(self, df, text_column='review'):
        """
        Extracts top keywords for each bank using TF-IDF.
        """
        print("Extracting keywords using TF-IDF...")
        bank_keywords = {}
        for bank in df['bank'].unique():
            print(f"  - Processing for {bank}")
            bank_df = df[df['bank'] == bank]
            processed_reviews = bank_df[text_column].apply(self.preprocess_text)
            
            tfidf_matrix = self.vectorizer.fit_transform(processed_reviews)
            feature_names = self.vectorizer.get_feature_names_out()
            bank_keywords[bank] = feature_names
        print("Keyword extraction complete.")
        return bank_keywords

    def assign_themes(self, df, text_column='review'):
        """
        Assigns predefined themes based on keyword matching.
        This is a rule-based approach as suggested by the challenge.
        """
        print("Assigning themes...")
        # Define keywords for each theme
        theme_map = {
            'Account & Login': ['login', 'account', 'password', 'register', 'signin', 'otp'],
            'Transactions & Transfers': ['transfer', 'transaction', 'payment', 'send', 'money', 'slow', 'fast', 'fee'],
            'UI & Experience': ['ui', 'interface', 'design', 'easy', 'simple', 'update', 'dark mode'],
            'Bugs & Performance': ['bug', 'crash', 'error', 'slow', 'performance', 'stuck', 'fix', 'issue'],
            'Features & Services': ['feature', 'service', 'loan', 'statement', 'fingerprint', 'biometric']
        }

        def find_theme(review_text):
            review_text = review_text.lower()
            found_themes = []
            for theme, keywords in theme_map.items():
                if any(keyword in review_text for keyword in keywords):
                    found_themes.append(theme)
            return ", ".join(found_themes) if found_themes else 'General Feedback'

        df['themes'] = df[text_column].apply(find_theme)
        print("Theme assignment complete.")
        return df

## Load cleaned data from task-1

In [4]:
df_cleaned = pd.read_csv('../data/cleaned_play_store_reviews.csv')

## Perform Sentiment Analysis

In [5]:
sentiment_analyzer = SentimentAnalyzer()
df_sentiment = sentiment_analyzer.analyze(df_cleaned.copy())

Loading sentiment model: distilbert-base-uncased-finetuned-sst-2-english...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


Model loaded successfully.
Applying sentiment analysis...
Sentiment analysis complete.


## Perform Thematic Analysis

In [6]:
thematic_analyzer = ThematicAnalyzer()
df_final = thematic_analyzer.assign_themes(df_sentiment.copy())

Assigning themes...
Theme assignment complete.


## Display Top Keywords Per Bank

In [7]:
keywords_per_bank = thematic_analyzer.extract_keywords(df_final.copy())
for bank, keywords in keywords_per_bank.items():
    print(f"\nTop keywords for {bank}: {', '.join(keywords[:10])}")

Extracting keywords using TF-IDF...
  - Processing for CBE
  - Processing for BOA
  - Processing for DASHEN
Keyword extraction complete.

Top keywords for CBE: access, account, add, allow, amazing, amazing app, app, app like, application, bad

Top keywords for BOA: access, account, amazing, android, app, app crash, app work, application, ask, automatically

Top keywords for DASHEN: able, account, ahead, amazing, amazing app, amole, app, application, bank, bank super


## Finalize and Save

In [8]:
df_final.reset_index(inplace=True)
df_final.rename(columns={
    'index': 'review_id',
    'review': 'review_text',
    'themes': 'identified_theme(s)'
}, inplace=True)

# Select and reorder columns
output_columns = ['review_id', 'review_text', 'sentiment_label', 'sentiment_score', 'identified_theme(s)', 'rating', 'date', 'bank']
df_output = df_final[output_columns]

print("\nFinal DataFrame Head:")
print(df_output.head())

# Save the results to a new CSV file 
output_path = '../data/analyzed_reviews.csv'
df_output.to_csv(output_path, index=False, encoding='utf-8')
print(f"\nAnalyzed data with sentiment and themes saved to {output_path}")


Final DataFrame Head:
   review_id                                        review_text  \
0          0                         So bad now and hard to use   
1          1  it is so amazing app. but, it is better to upd...   
2          2                                         v.good app   
3          3                                      very good app   
4          4           Very amazing app indeed. I'm enjoying it   

  sentiment_label  sentiment_score  \
0        NEGATIVE         0.999806   
1        POSITIVE         0.949643   
2        POSITIVE         0.995270   
3        POSITIVE         0.999868   
4        POSITIVE         0.999882   

                                 identified_theme(s)  rating        date bank  
0                                   General Feedback       5  2025-06-09  CBE  
1  Transactions & Transfers, UI & Experience, Fea...       5  2025-06-09  CBE  
2                                   General Feedback       4  2025-06-09  CBE  
3                        