In [3]:
# === Data Handling & Preprocessing ===
import pandas as pd                         # For loading and handling datasets
import nltk                                 # Natural Language Toolkit for text processing
from nltk.corpus import stopwords           # Commonly used words to be filtered out
from nltk.tokenize import word_tokenize     # For breaking text into individual words
from nltk.stem import WordNetLemmatizer     # For reducing words to their base form



# === Sentiment Analysis Libraries ===
from textblob import TextBlob               # Provides simple API for common NLP tasks like sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # Pre-trained model for sentiment scoring

# === Machine Learning Utilities ===
from sklearn.feature_extraction.text import TfidfVectorizer  # Convert text into numeric form based on word importance
from sklearn.model_selection import train_test_split         # Split data into training and testing sets
from sklearn.naive_bayes import MultinomialNB                # Naive Bayes classifier for text classification
from sklearn.metrics import accuracy_score                   # To evaluate model performance

# === Keyword Extraction (also using TF-IDF) ===
from sklearn.feature_extraction.text import TfidfVectorizer  # (re-imported, but harmless) Used for extracting key terms

# === Data Visualization ===
import matplotlib.pyplot as plt              # For plotting graphs and charts
from wordcloud import WordCloud              # To generate word cloud visualizations from text

In [4]:
# Fetch the NLTK data packages this notebook relies on
# (tokenizer models, the stop-word list, and the WordNet corpus).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Kept as the final expression so the cell still displays the download status.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Bob\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Bob\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [5]:
import pandas as pd

# Read the bank-review CSV into a DataFrame
df = pd.read_csv('../data/raw/banks_review_cleaned.csv')

# Preview the first five rows (the default for head) to check the columns
df.head()

Unnamed: 0,review,rating,date,bank,source
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,CBE,Google Play
1,what is this app problem???,1,2025-06-05,CBE,Google Play
2,the app is proactive and a good connections.,5,2025-06-05,CBE,Google Play
3,I cannot send to cbebirr app. through this app.,3,2025-06-05,CBE,Google Play
4,good,4,2025-06-05,CBE,Google Play


In [None]:
import pandas as pd

# === Required NLP imports ===
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources if not already downloaded.
# 'punkt_tab' is listed next to 'punkt' so that this cell fetches the same
# tokenizer data as the notebook's dedicated download cell; recent NLTK
# releases load it when word_tokenize is called.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# === Load dataset ===
df = pd.read_csv('../data/raw/banks_review_cleaned.csv')

# === Text Preprocessing Function ===
# Lazily filled cache so the lemmatizer and stop-word set are built once
# instead of once per review.
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both objects before storing either, so a failed lookup
        # (e.g. missing NLTK data) never leaves a half-filled cache behind.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _PREPROCESS_CACHE['lemmatizer'], _PREPROCESS_CACHE['stop_words']


def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric (``str.isalnum``) or that are English stop
    words are dropped, and the remaining tokens are lemmatized with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str or any
        The raw review. ``None`` and float ``NaN`` (how pandas represents a
        missing cell) are treated as empty; any other non-string value is
        converted with ``str()`` before processing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``''`` when the input
        is missing, blank, or contains no tokens that survive filtering.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Blank input yields no tokens; skip the tokenizer entirely.
    if not text.strip():
        return ''

    lemmatizer, stop_words = _get_preprocess_resources()

    # Tokenize the lower-cased text
    tokens = word_tokenize(text.lower())

    # Lemmatize, keeping only alphanumeric tokens that are not stop words
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalnum() and word not in stop_words
    )

# === Add the cleaned text as a new column ===
# 'review' has to be the name of the raw-text column in the CSV
review_texts = df['review']
df['processed_review'] = review_texts.apply(preprocess_text)


NameError: name 'word_tokenize' is not defined