### DEPENDENCIES

In [38]:
# DEPENDENCIES

import re
import math
import os
import numpy as np
import pandas as pd
import random
import string
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
from textblob import TextBlob
from textstat import flesch_reading_ease

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import torch

from rank_bm25 import BM25Okapi

from scipy.stats import chi2_contingency

import nltk
nltk.data.path.append(nltk.__path__[0])
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

import warnings
warnings.filterwarnings(action = 'ignore')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PRIYAM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PRIYAM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PRIYAM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PRIYAM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PRIYAM\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\PRIYAM\AppData\Roaming\nltk_data...
[nltk_data]   Package avera

### CONFIGURATION

In [6]:
DATA_PATH     = "../data/IMDB_Dataset.csv"
PLOT_PATH     = "../plots/EDA_plots/"
MAX_FEATURES  = 10000
COMMON_WORDS  = 5000
RANDOM_STATE  = 1234

### LOADING THE DATASET

In [7]:
# LOADING THE IMDB DATASET

def load_imdb_dataset():
    """
    Load the IMDb dataset and return the reviews and their corresponding sentiments.

    Returns:
        tuple: A DataFrame with reviews and their sentiments.
    """
    try:
        
        file_path = DATA_PATH
        df        = pd.read_csv(file_path)
        df        = df[['review', 'sentiment']]
        
        return df
    
    except Exception as e:
        print("Error loading dataset:", e)
        return None


### TEXT PREPROCESSING

In [8]:
# PROCESSING THE REVIEWS

def remove_emojis(text):
    
    """
    Remove emojis from the given text.
    
    Args:
        text (str) : Input text.
        
    Returns:
        str        : Text without emojis.
    """
    
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # Transport & map symbols
        u"\U0001F700-\U0001F77F"  # Alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric shapes extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and pictographs extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # Enclosed characters
        "]+",
        flags     = re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

def preprocess_reviews(reviews):
    
    """
    Preprocess reviews by removing unnecessary characters, punctuation, digits, stop words,
    and by performing lemmatization.

    Args:
        reviews (pd.Series) : Series of text reviews.

    Returns:
        pd.Series           : Preprocessed reviews.
    """

    reviews = reviews.str.replace('<.*?>', '', regex=True)
    
    reviews = reviews.str.strip()
    
    reviews = reviews.str.lower()

    reviews = reviews.apply(remove_emojis)
    
    reviews = reviews.apply(lambda text: text.translate(str.maketrans('', '', string.punctuation)))
    reviews = reviews.apply(lambda text: ''.join([char for char in text if not char.isdigit()]))
    
    reviews = reviews.apply(word_tokenize)
    
    reviews = reviews.apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])
    
    reviews = reviews.apply(lambda tokens: ' '.join(tokens))
    
    return reviews

In [9]:
df                        = load_imdb_dataset()

le                        = LabelEncoder()
df['sentiment_encoded']   = le.fit_transform(df['sentiment'])

df['sentiment_encoded'].unique()
sentiment_tensor          = torch.tensor(df['sentiment_encoded'])

df['cleaned_review']      = preprocess_reviews(df['review'])

words                     = [word for review in df['cleaned_review'] for word in review.split()]
vocab                     = set(words)

print(f"Total unique words: {len(vocab)}")
print(f"Sample vocabulary: {list(vocab)[:20]}")

Total unique words: 204253
Sample vocabulary: ['humansviolating', 'americathe', 'charater', 'jumpingoff', 'porker', 'mewhy', 'detonating', 'burnsmeanwhile', 'wallpaperscommendable', 'averagemiike', 'briskit', 'budgethalloween', 'chiaki', 'winged', 'ebonics', 'ariadna', 'imponderable', 'leavechoosing', 'bemuses', 'scopophilia']


## EXPLORATORY DATA ANALYSIS

In [10]:
class SentimentAnalysis_EDA:
    
    def __init__(self, data):
        
        self.data       = data
        self.output_dir = PLOT_PATH
        
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
    
    def text_length_analysis(self):

        self.data['word_count']       = self.data['cleaned_review'].apply(lambda x: len(x.split()))
        self.data['sentence_count']   = self.data['cleaned_review'].apply(lambda x: len(TextBlob(x).sentences))
        self.data['char_count']       = self.data['cleaned_review'].apply(len)
        
        plt.figure(figsize = (10, 6))
        sns.boxplot(data   = self.data, 
                    x      = 'sentiment', 
                    y      = 'word_count')
        plt.title('Word Count Analysis')
        plt.savefig(os.path.join(self.output_dir, 'text_length_word_count.png'))
        plt.clf()

        plt.figure(figsize = (10, 6))
        sns.boxplot(data   = self.data, 
                    x      = 'sentiment', 
                    y      = 'sentence_count')
        plt.title('Sentence Count Analysis')
        plt.savefig(os.path.join(self.output_dir, 'text_length_sentence_count.png'))
        plt.clf()
        
        plt.figure(figsize = (10, 6))
        sns.boxplot(data   = self.data, 
                    x      = 'sentiment', 
                    y      = 'char_count')
        plt.title('Character Count Analysis')
        plt.savefig(os.path.join(self.output_dir, 'text_length_char_count.png'))
        plt.clf()

    def word_frequency_analysis(self):
    
        positive_reviews     = self.data[self.data['sentiment'] == 'positive']['cleaned_review']
        all_words_positive   = ' '.join(positive_reviews)
        word_freq_positive   = Counter(all_words_positive.split())
        top_words_positive   = word_freq_positive.most_common(15)
    
        negative_reviews     = self.data[self.data['sentiment'] == 'negative']['cleaned_review']
        all_words_negative   = ' '.join(negative_reviews)
        word_freq_negative   = Counter(all_words_negative.split())
        top_words_negative   = word_freq_negative.most_common(15)

        plt.figure(figsize=(16, 8))
    
        plt.subplot(1, 2, 1) 
        df_pos = pd.DataFrame(top_words_positive, columns = ['Word', 'Frequency'])
        sns.barplot(data     = df_pos, 
                    x        = 'Frequency', 
                    y        = 'Word', 
                    orient   = 'h', 
                    palette  = 'Blues')
        
        plt.title('Top 15 Words Used in Positive Sentiment')
    
        plt.subplot(1, 2, 2)  
        df_neg = pd.DataFrame(top_words_negative, columns = ['Word', 'Frequency'])
        sns.barplot(data     = df_neg, 
                    x        = 'Frequency', 
                    y        = 'Word', 
                    orient   = 'h', 
                    palette  = 'Reds')
        
        plt.title('Top 15 Words Used in Negative Sentiment')
    
        plt.tight_layout()
        combined_output_path = os.path.join(self.output_dir, 'word_frequency.png')
        plt.savefig(combined_output_path)
        plt.clf()


    def wordcloud_analysis(self):
    
        positive_reviews    = ' '.join(self.data[self.data['sentiment'] == 'positive']['cleaned_review'])
        wordcloud_positive  = WordCloud(width            = 400, 
                                        height           = 400, 
                                        background_color = 'white').generate(positive_reviews)
    
        negative_reviews    = ' '.join(self.data[self.data['sentiment'] == 'negative']['cleaned_review'])
        wordcloud_negative  = WordCloud(width            = 400, 
                                        height           = 400, 
                                        background_color = 'white').generate(negative_reviews)
    
        fig, axes           = plt.subplots(2, 1, figsize = (12, 16))
    
        axes[0].imshow(wordcloud_positive, interpolation = 'bilinear')
        axes[0].axis('off')
        axes[0].set_title('Word Cloud for Positive Sentiment', fontsize = 16)
    
        axes[1].imshow(wordcloud_negative, interpolation = 'bilinear')
        axes[1].axis('off')
        axes[1].set_title('Word Cloud for Negative Sentiment', fontsize = 16)
    
        plt.tight_layout()
        combined_output_path = os.path.join(self.output_dir, 'wordcloud.png')
        plt.savefig(combined_output_path)
        plt.clf()

    
    def sentiment_intensity_analysis(self):
        
        self.data['sentiment_intensity'] = self.data['cleaned_review'].apply(lambda x: TextBlob(x).sentiment.polarity)
        
        plt.figure(figsize = (10, 6))
        sns.kdeplot(data   = self.data, 
                    x      = 'sentiment_intensity', 
                    hue    = 'sentiment', 
                    fill   = True)
        plt.title('Sentiment Intensity Analysis')
        plt.savefig(os.path.join(self.output_dir, 'sentiment_intensity_analysis.png'))
        plt.clf()

    def pos_distribution(self):
        
        target_pos            = ['NN', 'VB', 'JJ', 'RB', 'PRP', 'IN', 'CC', 'UH']
    
        self.data['pos_tags'] = self.data['cleaned_review'].apply(
        lambda x: [(word, pos) for word, pos in TextBlob(x).tags if pos in target_pos])
    
        positive_data         = self.data[self.data['sentiment'] == 'positive']
        negative_data         = self.data[self.data['sentiment'] == 'negative']
    
        positive_tags         = [pos for sublist in positive_data['pos_tags'] for _, pos in sublist]
        negative_tags         = [pos for sublist in negative_data['pos_tags'] for _, pos in sublist]
    
        positive_counts       = pd.DataFrame(Counter(positive_tags).items(), columns = ['POS', 'Count'])
        negative_counts       = pd.DataFrame(Counter(negative_tags).items(), columns = ['POS', 'Count'])

        pos_counts_merged     = positive_counts.merge(negative_counts, 
                                                      on       = 'POS', 
                                                      how      = 'outer', 
                                                      suffixes = ('_Positive', '_Negative')).fillna(0)
    
        pos_counts_melted     = pos_counts_merged.melt(id_vars    = 'POS', 
                                                       var_name   = 'Sentiment', 
                                                       value_name = 'Count')
    
        plt.figure(figsize = (12, 6))
        sns.barplot(data = pos_counts_melted, 
                    x    = 'POS', 
                    y    = 'Count', 
                    hue  = 'Sentiment')
        plt.title('POS Distribution by Sentiment')
        plt.xlabel('POS Tags')
        plt.ylabel('Count')
        plt.xticks(rotation = 45) 
        plt.tight_layout()
    
        plt.savefig(os.path.join(self.output_dir, 'pos_distribution.png'))
        plt.clf()


    def readability_plot(self):
        
        self.data['readability_score'] = self.data['cleaned_review'].apply(lambda x: flesch_reading_ease(x) if len(x.split()) > 1 else 0)
    
        positive_data                  = self.data[self.data['sentiment'] == 'positive']
        negative_data                  = self.data[self.data['sentiment'] == 'negative']
    
        plt.figure(figsize = (12, 6))
    
        plt.subplot(1, 2, 1)
        sns.boxplot(data  = positive_data,
                    x     = 'sentiment',
                    y     = 'readability_score',
                    color = 'blue')
        plt.title('Readability Score for Positive Sentiment')
        plt.xlabel('Sentiment')
        plt.ylabel('Readability Score')
        plt.ylim(-10, 120)

        plt.subplot(1, 2, 2)
        sns.boxplot(data  = negative_data,
                    x     = 'sentiment',
                    y     = 'readability_score',
                    color = 'red')
        plt.title('Readability Score for Negative Sentiment')
        plt.xlabel('Sentiment')
        plt.ylabel('Readability Score')
        plt.ylim(-10, 120)
    
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'readability_plot.png'))
        plt.clf()

    
    def perform_eda(self):
        
        self.text_length_analysis()
        self.word_frequency_analysis()
        self.wordcloud_analysis()
        self.sentiment_intensity_analysis()
        self.pos_distribution()
        self.readability_plot()
        
        print("EDA functions have been executed and plots have been saved in 'plots/EDA_plots'.")

In [11]:
eda = SentimentAnalysis_EDA(df)
eda.perform_eda()

EDA functions have been executed and plots have been saved in 'plots/EDA_plots'.


<Figure size 1200x600 with 0 Axes>

# FEATURE ENGINEERING

#### COUNT BASED FEATURE

In [12]:
def vectorize(reviews, top_n = COMMON_WORDS):
    
    """
    Preprocess reviews and convert them into vectors using top N most common words.

    Args:
        reviews (pd.Series)      : Series of preprocessed reviews.
        top_n (int)              : Number of most common words to include in the vocabulary.

    Returns:
        tuple                    : (vectors, word_index, top_words)
            vectors (np.ndarray) : Array of review vectors.
            word_index (dict)    : Mapping of words to their indices in the vectors.
            top_words (list)     : List of top N words in the vocabulary.
    """
    
    all_words    = [word for review in reviews for word in review.split()]

    word_counts  = Counter(all_words)

    top_words    = [word for word, _ in word_counts.most_common(top_n)]

    word_index   = {word: idx for idx, word in enumerate(top_words)}

    def vectorize(text):
        vec      = np.zeros(len(word_index))
        
        for word in text.split():
        
            if word in word_index:
                vec[word_index[word]] += 1
        
        return vec

    vectors      = np.array([vectorize(text) for text in reviews])

    return vectors, word_index, top_words

In [13]:
vectors, word_index, top_words = vectorize(df['cleaned_review'], top_n = COMMON_WORDS)

print("Top 200 words:", top_words)
print("Word index mapping:", word_index)
bow_tensor                     = torch.tensor(vectors, dtype=torch.float32)
print(bow_tensor[:5])

tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [3., 2., 0.,  ..., 0., 0., 0.],
        [1., 2., 6.,  ..., 0., 0., 0.]])


#### BINARY FEATURE - PRESENT OR ABSENT

In [14]:
def binarize(reviews, top_n = COMMON_WORDS):
    """
    Preprocess reviews and convert them into binary vectors using the top N most common words.

    Args:
        reviews (pd.Series)             : Series of preprocessed reviews.
        top_n (int)                     : Number of most common words to include in the vocabulary.

    Returns:
        tuple                           : (binary_vectors, word_index, top_words)
            binary_vectors (np.ndarray) : Array of binary vectors.
            word_index (dict)           : Mapping of words to their indices in the vectors.
            top_words (list)            : List of top N words in the vocabulary.
    """

    all_words       = " ".join(reviews).split()

    word_freq       = Counter(all_words)

    top_words       = [word for word, _ in word_freq.most_common(top_n)]

    word_index      = {word: idx for idx, word in enumerate(top_words)}

    binary_vectors  = np.zeros((len(reviews), len(word_index)))

    for i, text in enumerate(reviews):

        for word in text.split():
        
            if word in word_index:
                binary_vectors[i][word_index[word]] = 1

    return binary_vectors, word_index, top_words

In [15]:
binary_vectors, word_index, top_words = binarize(df['cleaned_review'], top_n = COMMON_WORDS)

print("Top 200 words:", top_words)
print("Word index mapping:", word_index)
binary_tensor                         = torch.tensor(binary_vectors, dtype = torch.float32)
print(binary_tensor[:5])

tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]])


#### FREQUENCY BASED FEATURE

In [16]:
def frequencies(reviews, top_n = COMMON_WORDS):
    
    """
    Preprocess reviews and convert them into frequency vectors using the top N most common words.

    Args:
        reviews (pd.Series)                : Series of preprocessed reviews.
        top_n (int)                        : Number of most common words to include in the vocabulary.

    Returns:
        tuple                              : (frequency_vectors, word_index, top_words)
            frequency_vectors (np.ndarray) : Array of frequency vectors.
            word_index (dict)              : Mapping of words to their indices in the vectors.
            top_words (list)               : List of top N words in the vocabulary.
    """
    
    all_words         = " ".join(reviews).split()

    word_freq         = Counter(all_words)

    top_words         = [word for word, _ in word_freq.most_common(top_n)]

    word_index        = {word: idx for idx, word in enumerate(top_words)}

    frequency_vectors = np.zeros((len(reviews), len(word_index)))

    for i, text in enumerate(reviews):
    
        for word in text.split():
        
            if word in word_index:
                frequency_vectors[i][word_index[word]] += 1

    return frequency_vectors, word_index, top_words

In [17]:
frequency_vectors, word_index, top_words = frequencies(df['cleaned_review'], top_n = COMMON_WORDS)

print("Top 200 words:", top_words)
print("Word index mapping:", word_index)
frequency_tensor                         = torch.tensor(frequency_vectors, dtype = torch.float32)
print(frequency_tensor[:5])

tensor([[0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [3., 2., 0.,  ..., 0., 0., 0.],
        [1., 2., 6.,  ..., 0., 0., 0.]])


#### NORMALIZED FREQUENCY FEATURE

In [18]:
def normalized_frequency(reviews, top_n = COMMON_WORDS):
    
    """
    Preprocess reviews and convert them into normalized term frequency (TF) vectors
    using the top N most common words.

    Args:
        reviews (pd.Series)         : Series of preprocessed reviews.
        top_n (int)                 : Number of most common words to include in the vocabulary.

    Returns:
        tuple                       : (tf_vectors, word_index, top_words)
            tf_vectors (np.ndarray) : Array of normalized term frequency vectors.
            word_index (dict)       : Mapping of words to their indices in the vectors.
            top_words (list)        : List of top N words in the vocabulary.
    """
    
    all_words             = " ".join(reviews).split()

    word_freq             = Counter(all_words)

    top_words             = [word for word, _ in word_freq.most_common(top_n)]

    word_index            = {word: idx for idx, word in enumerate(top_words)}

    tf_vectors            = np.zeros((len(reviews), len(word_index)))

    for i, text in enumerate(reviews):
        words             = text.split()
        total_words       = len(words)
        
        for word in words:
        
            if word in word_index:
                tf_vectors[i][word_index[word]] += 1
        
        if total_words > 0:
            tf_vectors[i] = tf_vectors[i] / total_words

    return tf_vectors, word_index, top_words

In [19]:
tf_vectors, word_index, top_words = normalized_frequency(df['cleaned_review'], top_n = COMMON_WORDS)

print("Top 200 words:", top_words)
print("Word index mapping:", word_index)
norm_frequency_tensor             = torch.tensor(tf_vectors, dtype=torch.float32)

print(norm_frequency_tensor[:5])

tensor([[0.0000, 0.0000, 0.0060,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0119,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0118,  ..., 0.0000, 0.0000, 0.0000],
        [0.0455, 0.0303, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0080, 0.0160, 0.0480,  ..., 0.0000, 0.0000, 0.0000]])


#### TERM FREQUENCY-INVERSED DOCUMENT FREQUENCY

In [20]:
def tf_idf(reviews, top_n = COMMON_WORDS):
    
    """
    Preprocess reviews and compute TF-IDF vectors for the top N most common words.

    Args:
        reviews (pd.Series)             : Series of preprocessed reviews.
        top_n (int)                     : Number of most common words to include in the vocabulary.

    Returns:
        tuple                           : (tf_idf_vectors, idf_values, word_index, top_words)
            tf_idf_vectors (np.ndarray) : Array of TF-IDF vectors.
            idf_values (np.ndarray)     : IDF values for the vocabulary.
            word_index (dict)           : Mapping of words to their indices in the vectors.
            top_words (list)            : List of top N words in the vocabulary.
    """

    all_words             = " ".join(reviews).split()

    word_freq             = Counter(all_words)

    top_words             = [word for word, _ in word_freq.most_common(top_n)]

    word_index            = {word: idx for idx, word in enumerate(top_words)}

    n_docs                = len(reviews)
    idf_values            = np.zeros(len(word_index))

    for word, idx in word_index.items():
        doc_count         = sum(1 for text in reviews if word in text.split())
        idf_values[idx]   = math.log(n_docs / (1 + doc_count))

    tf_idf_vectors        = np.zeros((len(reviews), len(word_index)))

    for i, text in enumerate(reviews):
        tf_vector         = np.zeros(len(word_index))
        words             = text.split()
        total_words       = len(words)

        for word in words:

            if word in word_index:
                tf_vector[word_index[word]] += 1

        tf_vector         = tf_vector / total_words if total_words > 0 else tf_vector
        tf_idf_vectors[i] = tf_vector * idf_values

    return tf_idf_vectors, idf_values, word_index, top_words

In [21]:
tf_idf_vectors, idf_values, word_index, top_words  = tf_idf(df['cleaned_review'], top_n = COMMON_WORDS)

print("Top 200 words:", top_words)
print("Word index mapping:", word_index)
tf_idf_tensor = torch.tensor(tf_idf_vectors, dtype = torch.float32)
print(tf_idf_tensor[:5])

tensor([[0.0000, 0.0000, 0.0034,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0069,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0068,  ..., 0.0000, 0.0000, 0.0000],
        [0.0204, 0.0168, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0036, 0.0089, 0.0276,  ..., 0.0000, 0.0000, 0.0000]])


#### WORD LEVEL N-GRAMS

In [22]:
def generate_ngrams(corpus, ngram_ranges = [(1, 1)], max_features = MAX_FEATURES):
    
    """
    Generate n-grams for the given corpus for specified n-gram ranges.

    Args:
        corpus (pd.Series)              : Input text corpus (list or pandas Series).
        ngram_ranges (list of tuples)   : List of n-gram ranges to generate.
        max_features (int)              : Maximum number of features to include.

    Returns:
        dict                            : A dictionary where keys are n-gram ranges and values are arrays of n-grams.
    """
    
    ngram_results                  = {}
    
    for ngram_range in ngram_ranges:
        vectorizer                 = CountVectorizer(ngram_range = ngram_range, max_features = max_features)
        vectorizer.fit(corpus)
        ngram_results[ngram_range] = vectorizer.get_feature_names_out()
    
    return ngram_results

In [23]:
ngram_ranges = [(1, 1), (2, 2), (3, 3)]  
ngrams       = generate_ngrams(df['cleaned_review'], ngram_ranges = ngram_ranges, max_features = MAX_FEATURES)

for ngram_range, ngram_list in ngrams.items():
    
    print(f"{ngram_range} (first 10): {ngram_list[:10]}")

(1, 1) (first 10): ['aaron' 'abandon' 'abandoned' 'abbey' 'abbott' 'abc' 'abducted' 'ability'
 'able' 'ably']
(2, 2) (first 10): ['abbott costello' 'ability make' 'able find' 'able get' 'able keep'
 'able make' 'able see' 'able watch' 'abraham lincoln' 'absolute worst']
(3, 3) (first 10): ['ab tak chappan' 'abbott costello meet' 'able get away' 'abre los ojos'
 'absolute must see' 'absolute waste time' 'absolute worst movie'
 'absolutely love movie' 'absolutely loved film' 'absolutely loved movie']


#### SKIP GRAMS

In [24]:
def generate_skipgrams(corpus, window_size = 2, max_features = MAX_FEATURES):
    
    """
    Generate skip-grams for the given corpus with a specified window size.

    Args:
        corpus (pd.Series or list) : Input text corpus as a list or pandas Series.
        window_size (int)          : Maximum skip distance between words in the skip-grams.
        max_features (int)         : Maximum number of skip-grams to include.

    Returns:
        list                       : A list of the most common skip-grams (up to max_features).
    
    """

    skipgrams_list        = []

    for review in corpus:
        tokens            = review.split()
        skipgrams_list.extend(_extract_skipgrams(tokens, window_size))
    
    skipgram_counts       = Counter(skipgrams_list)
    most_common_skipgrams = skipgram_counts.most_common(max_features)
    
    return [item[0] for item in most_common_skipgrams]


def _extract_skipgrams(tokens, window_size):
    
    """
    Extract skip-grams from a tokenized list of words within a specified window size.

    Args:
        tokens (list)     : Tokenized words from a review.
        window_size (int) : Maximum skip distance.

    Returns:
        list              : A list of skip-grams (tuples of two words).
    """
    
    skipgrams             = []
    for i in range(len(tokens)):
        
        for j in range(i + 1, min(i + window_size + 1, len(tokens))):
            skipgrams.append((tokens[i], tokens[j]))
    
    return skipgrams


In [25]:
skipgrams = generate_skipgrams(df['cleaned_review'], window_size = 2, max_features = MAX_FEATURES)
print("Skipgrams (first 10):", skipgrams[:10])

Skipgrams (first 10): [('ive', 'seen'), ('look', 'like'), ('movie', 'like'), ('ever', 'seen'), ('movie', 'one'), ('good', 'movie'), ('one', 'movie'), ('dont', 'know'), ('special', 'effect'), ('movie', 'ever')]


#### BM25 SCORE

In [26]:
def prepare_corpus(corpus, max_features = MAX_FEATURES):
    
    """
    Prepares the corpus by vectorizing the text and converting it into an array.

    Args:
        corpus (list or pd.Series) : The input text corpus.
        max_features (int)         : Maximum number of features to consider in vectorization.

    Returns:
        tuple                      : Vectorized data array and the fitted CountVectorizer instance.
    """
    
    vectorizer = CountVectorizer(max_features = max_features)
    X          = vectorizer.fit_transform(corpus).toarray()
    
    return X, vectorizer


def prepare_query(query, vectorizer):
    
    """
    Prepares the query by tokenizing and converting it to indices based on the vectorizer vocabulary.

    Args:
        query (str)                   : The input query string.
        vectorizer (CountVectorizer)  : The fitted CountVectorizer instance.

    Returns:
        list                          : Indices of the query tokens found in the vectorizer vocabulary.
    """
    
    query_tokens = query.split()
    query_idx    = [vectorizer.vocabulary_.get(word) for word in query_tokens if word in vectorizer.vocabulary_]
    
    return query_idx


def compute_bm25_scores(corpus, query, max_features=50):
    
    """
    Computes BM25 scores for a given query and corpus.

    Args:
        corpus (list or pd.Series)  : The input text corpus.
        query (str)                 : The input query string.
        max_features (int)          : Maximum number of features to consider in vectorization.

    Returns:
        np.ndarray                  : BM25 scores for the documents in the corpus.
    """
    
    X, vectorizer  = prepare_corpus(corpus, max_features)
    bm25           = BM25Okapi(X)
    query_idx      = prepare_query(query, vectorizer)
    bm25_scores    = bm25.get_scores(query_idx)
    
    return bm25_scores


In [27]:
query = "br"
bm25_scores = compute_bm25_scores(df['cleaned_review'], 
                                  query         = query, 
                                  max_features  = MAX_FEATURES)

print("BM25 Scores (first 5 documents):")
print(bm25_scores[:5])
bm25_tensor = torch.tensor(bm25_scores, dtype=torch.float32)
print(bm25_tensor[:5])

BM25 Scores (first 5 documents):
[0. 0. 0. 0. 0.]
tensor([0., 0., 0., 0., 0.])


#### FIELD-WEIGHTED BM25 (BM25F)

In [28]:
def prepare_corpus(corpus, max_features = MAX_FEATURES):
    
    """
    Prepares the corpus by vectorizing the text and converting it into an array.

    Args:
        corpus (list or pd.Series) : The input text corpus.
        max_features (int)         : Maximum number of features to consider in vectorization.

    Returns:
        tuple                      : Vectorized data array and the fitted CountVectorizer instance.
    """
    
    vectorizer  = CountVectorizer(max_features = max_features)
    X           = vectorizer.fit_transform(corpus).toarray()
    
    return X, vectorizer


def prepare_query(query, vectorizer):
    
    """
    Prepares the query by tokenizing and converting it to indices based on the vectorizer vocabulary.

    Args:
        query (str)                  : The input query string.
        vectorizer (CountVectorizer) : The fitted CountVectorizer instance.

    Returns:
        list                         : Indices of the query tokens found in the vectorizer vocabulary.
    """
    
    query_tokens  = query.split()
    query_idx     = [vectorizer.vocabulary_.get(word) for word in query_tokens if word in vectorizer.vocabulary_]
    
    return query_idx


def compute_weighted_bm25_scores(corpus, query, field_weights, max_features=50):
    
    """
    Computes weighted BM25 scores for a given query and corpus.

    Args:
        corpus (list or pd.Series)  : The input text corpus.
        query (str)                 : The input query string.
        field_weights (dict)        : Weights for different fields, defaulting to a weight of 1 if not found.
        max_features (int)          : Maximum number of features to consider in vectorization.

    Returns:
        list                        : Weighted BM25 scores for the documents in the corpus.
    """
    
    X, vectorizer        = prepare_corpus(corpus, max_features)
    bm25                 = BM25Okapi(X)
    
    query_idx             = prepare_query(query, vectorizer)
    
    bm25_scores           = bm25.get_scores(query_idx)
    
    weighted_bm25_scores  = [score * field_weights.get('title', 1) for score in bm25_scores]
    
    return weighted_bm25_scores

In [29]:
query                 = "br"
field_weights         = {'title': 2, 'body': 1}
weighted_bm25_scores  = compute_weighted_bm25_scores(df['cleaned_review'], 
                                                     query          = query, 
                                                     field_weights  = field_weights, 
                                                     max_features   = MAX_FEATURES)
print("Weighted BM25 Scores (first 10):")
print(weighted_bm25_scores[:10])
w_bm25_tensor = torch.tensor(weighted_bm25_scores, dtype=torch.float32)
print(w_bm25_tensor[:5])

Weighted BM25 Scores (first 10):
[np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0)]
tensor([0., 0., 0., 0., 0.])


#### POSITIONAL N-GRAMS

In [30]:
def generate_positional_ngrams(corpus, n = 2, max_features = MAX_FEATURES):
    
    """
    Generate positional n-grams for the given corpus.

    Args:
        corpus (pd.Series or list)  : Input text corpus as a list or pandas Series.
        n (int)                     : The size of the n-grams.
        max_features (int)          : Maximum number of positional n-grams to return.

    Returns:
        list                        : A list of the most common positional n-grams (up to max_features).
    """
    
    positional_ngrams   = []
    
    for review in corpus:
        tokens          = review.split()
        positional_ngrams.extend(_extract_positional_ngrams(tokens, n))
    
    ngram_counts        = Counter(positional_ngrams)
    most_common_ngrams  = ngram_counts.most_common(max_features)

    return [item[0] for item in most_common_ngrams]


def _extract_positional_ngrams(tokens, n):
    
    """
    Extract positional n-grams from a tokenized list of words.

    Args:
        tokens (list) : Tokenized words from a review.
        n (int)       : The size of the n-grams.

    Returns:
        list          : A list of positional n-grams (tuples containing words and their positions).
    """
    
    positional_ngrams     = []
    
    for i in range(len(tokens) - n + 1):
        positional_ngram  = tuple((tokens[i + j], i + j) for j in range(n))
        positional_ngrams.append(positional_ngram)
    
    return positional_ngrams

In [31]:
positional_ngrams = generate_positional_ngrams(df['cleaned_review'], n = 2, max_features = MAX_FEATURES)
print("Positional N-grams (first 10):", positional_ngrams[:10])

Positional N-grams (first 10): [(('saw', 0), ('movie', 1)), (('saw', 0), ('film', 1)), (('first', 0), ('saw', 1)), (('ever', 4), ('seen', 5)), (('watched', 0), ('movie', 1)), (('ever', 3), ('seen', 4)), (('movie', 2), ('ever', 3)), (('ever', 5), ('seen', 6)), (('worst', 1), ('movie', 2)), (('movie', 3), ('ever', 4))]


### FEATURE SELECTION

#### STATISTICAL METHOD - CHI-SQUARE TEST

In [32]:
def perform_chi2_test(tf_idf_tensor, sentiment_tensor, feature_names, top_n = COMMON_WORDS):
    
    """
    Performs Chi-Square tests for the association between features and sentiment.

    Args:
        tf_idf_tensor (Tensor or np.ndarray)     : TF-IDF feature matrix as a tensor or numpy array.
        sentiment_tensor (Tensor or np.ndarray)  : Sentiment labels as a tensor or numpy array.
        feature_names (list)                     : List of feature names.
        top_n (int)                              : Number of top features to consider for Chi-Square testing.

    Returns:
        list                                     : Sorted results containing feature names, Chi2 values, and p-values.
    """
    
    sentiment_np           = sentiment_tensor.numpy() if hasattr(sentiment_tensor, 'numpy') else sentiment_tensor
    bow_np                 = tf_idf_tensor.numpy() if hasattr(tf_idf_tensor, 'numpy') else tf_idf_tensor
    
    chi2_results           = []
    p_values               = []

    for col, feature_name in enumerate(feature_names[:top_n]):
        feature_values     = bow_np[:, col]
        contingency_table  = pd.crosstab(sentiment_np, feature_values)
        
        chi2, p, _, _      = chi2_contingency(contingency_table)

        chi2_results.append(chi2)
        p_values.append(p)

    sorted_results = sorted(zip(feature_names[:top_n], chi2_results, p_values),
                            key = lambda x: x[1] 
    )

    return sorted_results


In [33]:
max_features    = MAX_FEATURES
corpus          = df['cleaned_review']
vectorizer      = CountVectorizer(max_features = max_features)
X               = vectorizer.fit_transform(corpus).toarray()
feature_names   = list(vectorizer.get_feature_names_out())
sorted_results  = perform_chi2_test(tf_idf_tensor, sentiment_tensor, feature_names, top_n = COMMON_WORDS)

print("Chi-Square Test Results (Sorted by Chi2):")
for feature_name, chi2, p in sorted_results:
    print(f"{feature_name}: Chi2 = {chi2:.4f}, p-value = {p:.4f}")

Chi-Square Test Results (Sorted by Chi2):
instantly: Chi2 = 24.0007, p-value = 0.4616
europa: Chi2 = 26.0003, p-value = 0.4631
joe: Chi2 = 30.0029, p-value = 0.4655
goodfellas: Chi2 = 38.0180, p-value = 0.5145
heartwarming: Chi2 = 41.0010, p-value = 0.4706
grammar: Chi2 = 49.0072, p-value = 0.5132
immense: Chi2 = 51.0024, p-value = 0.5131
julianne: Chi2 = 54.0003, p-value = 0.4359
goofy: Chi2 = 55.3337, p-value = 0.5000
holocaust: Chi2 = 56.3367, p-value = 0.3876
inheritance: Chi2 = 57.0016, p-value = 0.3641
kathy: Chi2 = 57.0168, p-value = 0.5119
inevitable: Chi2 = 57.0274, p-value = 0.3996
itv: Chi2 = 59.0146, p-value = 0.4018
gathered: Chi2 = 60.0157, p-value = 0.4751
interplay: Chi2 = 61.0010, p-value = 0.3342
kirk: Chi2 = 64.0001, p-value = 0.4765
incidentally: Chi2 = 64.0039, p-value = 0.3715
intensity: Chi2 = 64.0065, p-value = 0.5116
jodie: Chi2 = 64.0097, p-value = 0.4761
inconsistent: Chi2 = 64.3406, p-value = 0.5695
inspector: Chi2 = 64.3406, p-value = 0.6034
inventor: Chi2 

#### STATISTICAL METHOD - MUTUAL INFORMATION (MI)

In [34]:
def calculate_top_mi(tf_idf_tensor, sentiment_tensor, feature_names, top_n = COMMON_WORDS):
   
    """
    Calculate mutual information (MI) for top N features.

    Parameters:
    - tf_idf_tensor     : numpy array, TF-IDF transformed document-term matrix.
    - sentiment_tensor  : numpy array, sentiment labels (binary classification).
    - feature_names     : list, list of feature names from TF-IDF.
    - top_n             : int, number of top features to consider.

    Returns:
    - top_mi            : list of tuples (feature_name, mutual_information)
    """
    
    if len(tf_idf_tensor) != len(sentiment_tensor):
        raise ValueError("The number of samples in tf_idf_tensor and sentiment_tensor must be the same.")

    mi                = mutual_info_classif(tf_idf_tensor, sentiment_tensor)

    mi_sorted_indices = np.argsort(mi)[::-1]

    top_mi            = [(feature_names[idx], mi[idx]) for idx in mi_sorted_indices[:top_n]]

    return top_mi

top_50_mi = calculate_top_mi(tf_idf_tensor, sentiment_tensor, feature_names, top_n = COMMON_WORDS)

print("Top 50 features by mutual information:")
for feature_name, mutual_info in top_50_mi:
    print(f"{feature_name}: Mutual Information = {mutual_info:.4f}")


Top 50 features by mutual information:
absence: Mutual Information = 0.0487
aerial: Mutual Information = 0.0380
abruptly: Mutual Information = 0.0263
ancestor: Mutual Information = 0.0233
amazing: Mutual Information = 0.0225
amateur: Mutual Information = 0.0170
alcohol: Mutual Information = 0.0153
allusion: Mutual Information = 0.0147
acquaintance: Mutual Information = 0.0137
also: Mutual Information = 0.0134
angeles: Mutual Information = 0.0128
acceptable: Mutual Information = 0.0127
anderson: Mutual Information = 0.0125
antonio: Mutual Information = 0.0124
appearance: Mutual Information = 0.0123
among: Mutual Information = 0.0114
ada: Mutual Information = 0.0111
abyss: Mutual Information = 0.0105
aimed: Mutual Information = 0.0102
associate: Mutual Information = 0.0097
accident: Mutual Information = 0.0096
ashley: Mutual Information = 0.0095
flavor: Mutual Information = 0.0095
caddyshack: Mutual Information = 0.0094
absent: Mutual Information = 0.0094
ambiguity: Mutual Information = 

In [35]:
top_n                    = COMMON_WORDS
mi                       = mutual_info_classif(tf_idf_tensor, sentiment_tensor)
selected_features        = [feature for feature, mi_score in zip(feature_names, mi) if mi_score > 0.001]
print(selected_features)

selected_feature_indices = [feature_names.index(feature) for feature in selected_features]

X_selected = tf_idf_tensor[:, selected_feature_indices].numpy()

['aaron', 'abandon', 'abandoned', 'abbey', 'abbott', 'abc', 'abducted', 'ability', 'able', 'aboard', 'abominable', 'abomination', 'abortion', 'abroad', 'abruptly', 'absence', 'absent', 'absolute', 'absolutely', 'absorbed', 'abstract', 'absurdity', 'abundance', 'abuse', 'abused', 'abusive', 'abyss', 'accept', 'acceptable', 'accepting', 'accepts', 'accident', 'acclaim', 'acclaimed', 'accompany', 'accomplished', 'accusation', 'accused', 'accustomed', 'achieve', 'achieved', 'achievement', 'achieving', 'acknowledge', 'acknowledged', 'acquaintance', 'acquired', 'across', 'actionpacked', 'active', 'activist', 'actor', 'actress', 'actual', 'ad', 'ada', 'adam', 'adapt', 'adaptation', 'adapted', 'adapting', 'adaption', 'add', 'added', 'addict', 'adding', 'additionally', 'address', 'adequate', 'adjective', 'administration', 'admirable', 'admirably', 'admire', 'admired', 'admirer', 'admit', 'admitted', 'admittedly', 'adopt', 'adult', 'adulthood', 'advantage', 'adventure', 'adventurous', 'adversary

## MODEL FITTING

#### LOGISTIC REGRESSION WITH L2 REGULARIZATION

In [39]:
def logistic_regression(X_selected, sentiment_tensor, output_dir):
    
    """
        Train a Logistic Regression model for sentiment analysis, evaluate its performance, 
        and save the confusion matrix plot to the specified directory.

        Args:
            X_selected (numpy.ndarray or pandas.DataFrame)  : The feature matrix containing selected input features for model training.
            sentiment_tensor (torch.Tensor)                 : A PyTorch tensor containing sentiment labels (0 or 1).
            output_dir (str)                                : The directory where the confusion matrix plot will be saved.

        Returns:
            tuple: 
                - log_reg_model (LogisticRegression)        : Trained Logistic Regression model.
                - accuracy (float)                          : Accuracy of the model on the test set.
                - precision (float)                         : Precision score of the model on the test set.
                - recall (float)                            : Recall score of the model on the test set.
                - f1 (float)                                : F1 score of the model on the test set.
    """
    
    y                                = sentiment_tensor.numpy()

    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    log_reg_model                    = LogisticRegression(penalty  = 'l2', 
                                                          solver   = 'liblinear', 
                                                          max_iter = 1000)
    log_reg_model.fit(X_train, y_train)

    y_pred                           = log_reg_model.predict(X_test)

    accuracy                         = accuracy_score(y_test, y_pred)
    conf_matrix                      = confusion_matrix(y_test, y_pred)
    precision                        = precision_score(y_test, y_pred, average = 'binary')
    recall                           = recall_score(y_test, y_pred, average = 'binary')
    f1                               = f1_score(y_test, y_pred, average = 'binary')

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    plt.figure(figsize = (8, 6))
    sns.heatmap(conf_matrix,
                annot       = True,
                fmt         = "d",
                cmap        = "Greens",
                cbar        = False,
                xticklabels = ['Negative', 'Positive'],
                yticklabels = ['Negative', 'Positive'])
    plt.title("Confusion Matrix for Logistic Regression")
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")

    metrics_text = (
        f"Accuracy: {accuracy * 100:.2f}%\n"
        f"Precision: {precision:.2f} | Recall: {recall:.2f} | F1 Score: {f1:.2f}"
    )
    plt.text(0.5, 1.1, metrics_text, fontsize = 12, ha = 'center', va = 'center', transform = plt.gca().transAxes)

    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, 'logistic_regression_conf_matrix.png')
    plt.tight_layout()
    plt.savefig(file_path)
    plt.clf()

    print(f"Confusion matrix plot saved at: {file_path}")

    return log_reg_model, accuracy, precision, recall, f1


In [40]:
output_dir = PLOT_PATH
log_reg_model, accuracy, precision, recall, f1 = logistic_regression(X_selected, sentiment_tensor, output_dir)

Accuracy: 86.09%
Precision: 0.85
Recall: 0.88
F1 Score: 0.86
Confusion matrix plot saved at: ../plots/EDA_plots/logistic_regression_conf_matrix.png


<Figure size 800x600 with 0 Axes>

#### SUPPORT VECTOR MACHINE (SVM) WITH RBF KERNEL

In [41]:
def rbf_svm_model(X_selected, sentiment_tensor, output_dir):

    """
        Train a Support Vector Machine with RBF Kernel Function for sentiment analysis, evaluate its performance, 
        and save the confusion matrix plot to the specified directory.

        Args:
            X_selected (numpy.ndarray or pandas.DataFrame)  : The feature matrix containing selected input features for model training.
            sentiment_tensor (torch.Tensor)                 : A PyTorch tensor containing sentiment labels (0 or 1).
            output_dir (str)                                : The directory where the confusion matrix plot will be saved.

        Returns:
            tuple: 
                - svm_rbf_model (RBF SVM)                   : Trained Logistic Regression model.
                - accuracy (float)                          : Accuracy of the model on the test set.
                - precision (float)                         : Precision score of the model on the test set.
                - recall (float)                            : Recall score of the model on the test set.
                - f1 (float)                                : F1 score of the model on the test set.
    """

    y                                = sentiment_tensor.numpy()

    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size = 0.2, random_state = 42)

    svm_model                        = SVC(kernel = 'rbf')
    svm_model.fit(X_train, y_train)

    y_pred                           = svm_model.predict(X_test)

    accuracy                         = accuracy_score(y_test, y_pred)
    conf_matrix                      = confusion_matrix(y_test, y_pred)
    precision                        = precision_score(y_test, y_pred, average = 'binary')
    recall                           = recall_score(y_test, y_pred, average = 'binary')
    f1                               = f1_score(y_test, y_pred, average = 'binary')

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    plt.figure(figsize = (8, 6))
    sns.heatmap(conf_matrix,
                annot       = True,
                fmt         = "d",
                cmap        = "Blues",
                cbar        = False,
                xticklabels = ['Negative', 'Positive'],
                yticklabels = ['Negative', 'Positive'])
    plt.title("Confusion Matrix for SVM with RBF")
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")

    metrics_text = (
        f"Accuracy: {accuracy * 100:.2f}%\n"
        f"Precision: {precision:.2f} | Recall: {recall:.2f} | F1 Score: {f1:.2f}"
    )
    plt.text(0.5, 1.1, metrics_text, fontsize = 12, ha = 'center', va = 'center', transform = plt.gca().transAxes)

    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, 'svm_confusion_matrix.png')
    plt.tight_layout()
    plt.savefig(file_path)
    plt.clf()

    print(f"Confusion matrix plot saved at: {file_path}")

    return svm_model, accuracy, precision, recall, f1


In [42]:
output_dir = PLOT_PATH
svm_model, accuracy, precision, recall, f1 = rbf_svm_model(X_selected, sentiment_tensor, output_dir)

Accuracy: 87.16%
Precision: 0.85
Recall: 0.90
F1 Score: 0.88
Confusion matrix plot saved at: ../plots/EDA_plots/svm_confusion_matrix.png


<Figure size 800x600 with 0 Axes>

#### RANDOM FOREST CLASSIFIER

In [45]:
def random_forest(X_selected, sentiment_tensor, output_dir):

    """
        Train a Random Forest (Ensemble Learning Technique) for sentiment analysis, evaluate its performance, 
        and save the confusion matrix plot to the specified directory.

        Args:
            X_selected (numpy.ndarray or pandas.DataFrame)  : The feature matrix containing selected input features for model training.
            sentiment_tensor (torch.Tensor)                 : A PyTorch tensor containing sentiment labels (0 or 1).
            output_dir (str)                                : The directory where the confusion matrix plot will be saved.

        Returns:
            tuple: 
                - random_forest_model (RandomForest)        : Trained Logistic Regression model.
                - accuracy (float)                          : Accuracy of the model on the test set.
                - precision (float)                         : Precision score of the model on the test set.
                - recall (float)                            : Recall score of the model on the test set.
                - f1 (float)                                : F1 score of the model on the test set.
    """

    y                                = sentiment_tensor.numpy()

    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size = 0.2, random_state = 42)

    rf_model                         = RandomForestClassifier(n_estimators = 100, 
                                                              max_depth    = 10, 
                                                              random_state = 42)
    rf_model.fit(X_train, y_train)

    y_pred                           = rf_model.predict(X_test)

    accuracy                         = accuracy_score(y_test, y_pred)
    conf_matrix                      = confusion_matrix(y_test, y_pred)
    precision                        = precision_score(y_test, y_pred, average = 'binary')
    recall                           = recall_score(y_test, y_pred, average = 'binary')
    f1                               = f1_score(y_test, y_pred, average = 'binary')

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    plt.figure(figsize = (8, 6))
    sns.heatmap(conf_matrix,
                annot       = True,
                fmt         = "d",
                cmap        = "Blues",
                cbar        = False,
                xticklabels = ['Negative', 'Positive'],
                yticklabels = ['Negative', 'Positive'])
    plt.title("Confusion Matrix for Random Forest Classifier")
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")

    metrics_text = (
        f"Accuracy: {accuracy * 100:.2f}%\n"
        f"Precision: {precision:.2f} | Recall: {recall:.2f} | F1 Score: {f1:.2f}"
    )
    plt.text(0.5, 1.1, metrics_text, fontsize = 12, ha = 'center', va = 'center', transform = plt.gca().transAxes)

    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, 'random_forest_conf_matrix.png')
    plt.tight_layout()
    plt.savefig(file_path)
    plt.clf()

    print(f"Confusion matrix plot saved at: {file_path}")

    return rf_model, accuracy, precision, recall, f1


In [46]:
output_dir = PLOT_PATH
rf_model, accuracy, precision, recall, f1 = random_forest(X_selected, sentiment_tensor, output_dir)

Accuracy: 81.31%
Precision: 0.78
Recall: 0.87
F1 Score: 0.82
Confusion matrix plot saved at: ../plots/EDA_plots/random_forest_conf_matrix.png


<Figure size 800x600 with 0 Axes>

#### PREDICTING THE SENTIMENT

In [49]:
def predict_sentiment(review, vectorizer, svm_model):
    
    """
    Predict the sentiment of a given review using the trained SVM model.

    Args:
        review (str)                                   : The input review as a string.
        vectorizer (sklearn.feature_extraction.text.*) : The vectorizer used to transform input text data (e.g., TfidfVectorizer).
        svm_model (SVC)                                : The trained SVM model with an RBF kernel.

    Returns:
        str                                            : Predicted sentiment ('Positive' or 'Negative').
    """
    
    review_vectorized  = vectorizer.transform([review])

    prediction         = svm_model.predict(review_vectorized)

    sentiment          = "Positive" if prediction[0] == 1 else "Negative"
    
    return sentiment
    