### DEPENDENCIES

In [2]:
# DEPENDENCIES

import re
import tqdm
import torch
import numpy as np
import pandas as pd

import tensorflow_hub as hub

from scipy.sparse import hstack
from scipy.sparse import spmatrix
from scipy.sparse import issparse
from scipy.sparse import csr_matrix

from lightgbm import LGBMClassifier

from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFECV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mutual_info_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import HistGradientBoostingClassifier

from gensim.models import Word2Vec
from gensim.models import FastText

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn

from transformers import BertModel
from transformers import BertTokenizer
from transformers import DistilBertModel
from transformers import PretrainedConfig
from transformers import DistilBertTokenizer

import warnings
warnings.filterwarnings(action = 'ignore')

#### CONFIGURATION

In [4]:
# DATA PATHS
DATA_PATH                                                       = '../data/IMDB_Dataset.csv'
TEST_DATA_PATH                                                  = '../data/test_data.csv'
Emotion_path                                                    = '../data/emotion_lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'


# CONFIGURATION VARIABLES
BATCH_SIZE                                                      = 250
MAX_FEATURES                                                    = 500
MODEL_NAME                                                      = "logistic_regression"
KERNEL_NAME                                                     = None # IF MODEL DOESN'T HAVE KERNEL, MAKE IT NONE
GLOVE_MODEL_PATH                                                = "models/glove.6B.100d.txt"
ELMO_MODEL_URL                                                  = "https://tfhub.dev/google/elmo/3"
WORD2VEC_MODEL                                                  = "word2vec-google-news-300"
BERT_CONFIG                                                     = "../models/BERT/config.json"
BERT_MODEL_SAFETENSORS                                          = "../models/BERT/model.safetensors"
BERT_TOKENIZER_CONFIG                                           = "../models/BERT/tokenizer_config.json"
BERT_TOKENIZER                                                  = "../models/BERT/tokenizer.json"
BERT_VOCABULARY                                                 = "../models/BERT/vocab.txt"
DISTILBERT_CONFIG                                               = "models/DISTILBERT/config.json"
DISTILBERT_MODEL_SAFETENSORS                                    = "models/DISTILBERT/model.safetensors"
DISTILBERT_TOKENIZER_CONFIG                                     = "models/DISTILBERT/tokenizer_config.json"
DISTILBERT_TOKENIZER                                            = "models/DISTILBERT/tokenizer.json"
DISTILBERT_VOCABULARY                                           = "models/DISTILBERT/vocab.txt"
DISTILBERT_PYTORCH_BIN                                          = "models/DISTILBERT/pytorch_model.bin"


# PARAMETER DICTIONARY
MODEL_PARAMS_DICT                                               = {'C'                 : 1.0,
                                                                   'tol'               : 0.001,
                                                                   'loss'              : 'log_loss',
                                                                   'solver'            : 'lbfgs',
                                                                   'penalty'           : 'l2',
                                                                   'max_iter'          : 1000,
                                                                   'max_depth'         : 50,
                                                                   'n_neighbors'       : 2,
                                                                   'max_features'      : 1,
                                                                   'learning_rate'     : 0.01,
                                                                   'min_samples_leaf'  : 5,
                                                                   'hidden_layer_size' : 1000,
                                                                   'l2_regularization' : 0.01,
                                                                   'min_samples_split' : 10             
                                                                  }


# RESULT PATHS
SENTIMENT_ANALYSIS_SVM_RBF_RESULT                               = 'results/sentiment_analysis_result_svm_rbf.csv'
SENTIMENT_ANALYSIS_LOGISTIC_RESULT                              = 'results/sentiment_analysis_result_logistic.csv'
SENTIMENT_ANALYSIS_LIGHTGBM_RESULT                              = 'results/sentiment_analysis_result_lightgbm.csv'
SENTIMENT_ANALYSIS_ADABOOST_RESULT                              = 'results/sentiment_analysis_result_adaboost.csv'
SENTIMENT_ANALYSIS_SVM_SIGMOID_RESULT                           = 'results/sentiment_analysis_result_svm_sigmoid.csv'
SENTIMENT_ANALYSIS_RANDOM_FOREST_RESULT                         = 'results/sentiment_analysis_result_random_forest.csv'
SENTIMENT_ANALYSIS_SVM_POLYNOMIAL_RESULT                        = 'results/sentiment_analysis_result_svm_polynomial.csv'
SENTIMENT_ANALYSIS_GRADIENT_BOOST_RESULT                        = 'results/sentiment_analysis_result_gradient_boost.csv'
SENTIMENT_ANALYSIS_LABEL_PROPAGATION_RESULT                     = 'results/sentiment_analysis_result_label_propagation.csv'
SENTIMENT_ANALYSIS_LOGISTIC_WITH_CUSTOM_FEAT                    = 'results/sentiment_analysis_result_logistic_with_coustom_features.csv'
SENTIMENT_ANALYSIS_GAUSSIAN_NAIVE_BAYES_RESULT                  = 'results/sentiment_analysis_result_gaussian_naive_bayes.csv'
SENTIMENT_ANALYSIS_MULTILAYER_PERCEPTRON_RESULT                 = 'results/sentiment_analysis_result_multilayer_perceptron.csv'
SENTIMENT_ANALYSIS_LOGISTIC_RESULT_BY_STAT_FEAT                 = 'results/sentiment_analysis_result_logistic_by_statistical_features.csv'
SENTIMENT_ANALYSIS_LIGHTGBM_RESULT_BY_STAT_FEAT                 = 'results/sentiment_analysis_result_lightgbm_by_statistical_features.csv'
SENTIMENT_ANALYSIS_LOGISTIC_DECISION_TREE_RESULT                = 'results/sentiment_analysis_result_logistic_model_tree.csv'
SENTIMENT_ANALYSIS_MULTINOMIAL_NAIVE_BAYES_RESULT               = 'results/sentiment_analysis_result_naive_bayes.csv'
SENTIMENT_ANALYSIS_SVM_RBF_RESULT_WITH_CONTEXTUALS              = 'results/sentiment_analysis_result_svm_rbf_with_contextuals.csv'
SENTIMENT_ANALYSIS_SVM_RBF_RESULT_WITH_CONTEXTUALS              = 'results/sentiment_analysis_result_svm_rbf_with_contextuals.csv'
SENTIMENT_ANALYSIS_SVM_RBF_BY_SEMANTIC_FEAT_RESULT              = 'results/sentiment_analysis_result_svm_rbf_by_semantic_features.csv'
SENTIMENT_ANALYSIS_ADABOOST_RESULT_WITH_CONTEXTUALS             = 'results/sentiment_analysis_result_adaboost_with_contextuals.csv'
SENTIMENT_ANALYSIS_LOGISTIC_GAUSSIAN_NAIVE_BAYES_RESULT         = 'results/sentiment_analysis_result_logistic_gaussian_naive_bayes.csv'
SENTIMENT_ANALYSIS_HIST_GRADIENT_BOOSTING_CLASSIFIER_RESULT     = 'results/sentiment_analysis_result_hist_gradient_boosting_classifier.csv'
SENTIMENT_ANALYSIS_GAUSSIAN_NAIVE_BAYES_RESULT_WITH_CONTEXTUALS = 'results/sentiment_analysis_gaussian_naive_bayes_with_contextuals.csv'
SENTIMENT_ANALYSIS_LOGISTIC_REG_BY_SEMANTIC_FEAT_RESULT         = 'results/sentiment_analysis_result_logistic_reg_by_semantic_features.csv'
SENTIMENT_ANALYSIS_SVM_RBF_BY_SEMANTIC_FEAT_RESULT              = 'results/sentiment_analysis_result_svm_rbf_by_semantic_features.csv'
SENTIMENT_ANALYSIS_SVM_SIGMOID_BY_SEMANTIC_FEAT_RESULT          = 'results/sentiment_analysis_result_svm_sigmoid_by_semantic_features.csv'
SENTIMENT_ANALYSIS_GAUSSIAN_NB_BY_SEMANTIC_FEAT_RESULT          = 'results/sentiment_analysis_result_gaussian_nb_by_semantic_features.csv'
SENTIMENT_ANALYSIS_LIGHT_GBM_BY_SEMANTIC_FEAT_RESULT            = 'results/sentiment_analysis_result_light_gbm_by_semantic_features.csv'
SENTIMENT_ANALYSIS_RANDOM_FOREST_BY_SEMANTIC_FEAT_RESULT        = 'results/sentiment_analysis_result_random_forest_by_semantic_features.csv'
SENTIMENT_ANALYSIS_LABEL_PROP_BY_SEMANTIC_FEAT_RESULT           = 'results/sentiment_analysis_result_label_prop_by_semantic_features.csv'
SENTIMENT_ANALYSIS_LOGISTIC_REG_BY_ALL_FEAT_RESULT              = 'results/sentiment_analysis_result_logistic_reg_by_all_features.csv'
SENTIMENT_ANALYSIS_GAUSSIAN_NB_BY_ALL_FEAT_RESULT               = 'results/sentiment_analysis_result_gaussian_nb_by_all_features.csv'
SENTIMENT_ANALYSIS_LABEL_PROP_BY_ALL_FEAT_RESULT                = 'results/sentiment_analysis_result_label_prop_by_all_features.csv'
SENTIMENT_ANALYSIS_MULTILAYER_PERCEPTRON_BY_ALL_FEAT_RESULT     = 'results/sentiment_analysis_result_multilayer_perceptron_by_all_features.csv'
SENTIMENT_ANALYSIS_LOGISTIC_REG_BY_BERT                         = 'results/sentiment_analysis_result_logistic_regression_by_bert.csv'

SAVE_PATH_VARIABLE = SENTIMENT_ANALYSIS_LOGISTIC_REG_BY_BERT

### DATA LOADER

In [6]:
def load_csv_data(filepath: str) -> pd.DataFrame:
    """
    Load the CSV dataset

    Arguments:
    ----------
        filepath { str } : 

    Errors:
    -------
        DataLoadingError : 

    Returns:
    --------
        { DataFrame }    : 
    """
    try:
        dataframe = pd.read_csv(filepath_or_buffer = filepath,
                                index_col          = None)

        return dataframe

    except Exception as DataLoadingError:
        raise RuntimeError(f"Error loading data: {repr(DataLoadingError)}")

### TEXT PREPROCESSING

In [8]:
class TextPreprocessor:
    """
    A class for preprocessing text data through cleaning, tokenization, and normalization
    
    Attributes:
    -----------
        lemmatizer : WordNetLemmatizer instance for word lemmatization
        
        stop_words : Set of stopwords to be removed from text
    """ 
    def __init__(self):
        """
        Initialize the TextPreprocessor with required NLTK resources
        
        Raises:
        -------
            LookupError : If required NLTK resources cannot be downloaded
        """
        try:
            # Download required NLTK data
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
            nltk.download('wordnet', quiet=True)
            nltk.download('punkt_tab', quiet=True)
            
            self.lemmatizer = WordNetLemmatizer()
            self.stop_words = set(stopwords.words('english'))
            
        except LookupError as e:
            raise
    
    def clean_text(self, text:str) -> str:
        """
        Clean and normalize input text by removing HTML tags, special characters,
        and applying text normalization techniques
        
        Arguments:
        ----------
            text { str }      : Input text to be cleaned
            
        Raises:
        -------
            ValueError        : If input text is None or empty
            
            TextCleaningError : If any error occurs at any step of text cleaning process
            
        Returns:
        --------
                { str }       : Cleaned and normalized text
        """
        if ((not text) or (not isinstance(text, str))):
            raise ValueError("Input text must be a non-empty string")
            
        try:
            # Remove HTML tags
            text   = re.sub('<[^>]*>', '', text)
            
            # Remove special characters and digits
            text   = re.sub('[^a-zA-Z\s]', '', text)
            
            # Convert to lowercase
            text   = text.lower()
            
            # Tokenization
            tokens = word_tokenize(text)
            
            # Remove stopwords and lemmatize
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]
            
            return ' '.join(tokens)
        
        except Exception as TextCleaningError:
            raise

### SEMANTIC FEATURE ENGINEERING

In [10]:
# ----- SEMANTIC FEATURE ENGINEERING -----

class Semantic_Feature_Engineering:
    
    """
    A class for implementing various semantic feature engineering techniques.
    
    Attributes:
    -----------
        texts        { list }  : List of preprocessed text documents.
        
        max_features  { int }  : Maximum number of features to create.
    """
    
    def __init__(self, texts: list, max_features: int = None) -> None:
        
        """
        Initialize Semantic_Feature_Engineering with texts and parameters.
        
        Arguments:
        ----------
            texts        : List of preprocessed text documents.
            
            max_features : Maximum number of features (None for no limit).
            
        Raises:
        -------
            ValueError   : If texts is empty or parameters are invalid.
        """
        
        if not texts:
            raise ValueError("Input texts cannot be empty")
            
        self.texts        = texts
        self.max_features = max_features
    
    # ----- WORD2VEC MODEL -----
    
    def word2vec_cbow(self, vector_size: int = None, window: int = 5, min_count: int = 1, workers: int = 4) -> tuple:
        """
        Generate semantic features using Word2Vec (CBOW) and return the vectorizer and feature matrix.
    
        Arguments:
        ----------
            vector_size      : Dimensionality of word embeddings (default: 100).
            window           : Context window size (default: 5).
            min_count        : Ignores words with frequency lower than this (default: 1).
            workers          : Number of worker threads to train the model (default: 4).
    
        Returns:
        --------
            tuple:
                - Word2Vec   : The trained Word2Vec model (vectorizer).
                - np.ndarray : Document-level feature matrix (each document represented as the average of its word vectors).
        """
        try:
            print("Creating Word2Vec (CBOW) features")
            
            if vector_size is None:
                vector_size = self.max_features
            
            tokenized_texts         = [doc.split() for doc in self.texts]
            
            max_features            = self.max_features

            w2v_model               = Word2Vec(sentences   = tokenized_texts,
                                               vector_size = vector_size,
                                               window      = window,
                                               min_count   = min_count,
                                               workers     = workers,
                                               sg          = 0
                                               )
        
            features                = []
            
            for tokens in tokenized_texts:
                vectors             = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
            
                if vectors:
                    document_vector = np.mean(vectors, axis=0)
                else:
                    document_vector = np.zeros(vector_size)
            
                features.append(document_vector)
        
            feature_matrix          = np.array(features, dtype=np.float32)

            if self.max_features is not None and self.max_features < vector_size:
                feature_matrix      = feature_matrix[:, :self.max_features]
        
            print(f"Created {MAX_FEATURES} Word2Vec (CBOW) features with shape: {feature_matrix.shape}")
        
            return w2v_model, feature_matrix

        except Exception as e:
            raise Exception(f"Error in creating Word2Vec (CBOW) features: {str(e)}")
    
      # ----- GLOVE EMBEDDING -----
    
    def glove(self, glove_path: str, embedding_dim: int = 100, desired_features: int = None) -> tuple:
        """
        Generate semantic features using GloVe and return the feature matrix and embedding dictionary.
    
        Arguments:
        ----------
            glove_path        : Path to the GloVe embeddings file.
            embedding_dim     : Dimensionality of GloVe embeddings (default: 100).
            desired_features  : Number of features to extract (default: 10000).
    
        Returns:
        --------
            tuple:
                - np.ndarray  : Document-level feature matrix (each document represented as the average of its word vectors).
                - dict        : The GloVe embedding dictionary.
        """

        try:
            print("Creating GloVe features")
            
            if desired_features is None:
                desired_features = self.max_features
        
            glove_embeddings               = {}
            
            with open(glove_path, 'r', encoding='utf-8') as f:
                for line in f:
                    values                 = line.split()
                    word                   = values[0]
                    vector                 = np.asarray(values[1:], dtype='float32')
                    glove_embeddings[word] = vector

            tokenized_texts                = [doc.split() for doc in self.texts]
        
            features                       = []
        
            for tokens in tokenized_texts:
                vectors                    = [glove_embeddings[word] for word in tokens if word in glove_embeddings]
            
                if vectors:
                 document_vector           = np.mean(vectors, axis=0)
                
                else:
                   document_vector         = np.zeros(embedding_dim)
            
            features.append(document_vector)
        
            feature_matrix                 = np.array(features)
        
            if desired_features < embedding_dim:
                feature_matrix             = feature_matrix[:, :desired_features]
        
            print(f"{MAX_FEATURES} GloVe features created with shape: {feature_matrix.shape}")
        
            return glove_embeddings, feature_matrix

        except Exception as e:
            raise Exception(f"Error in creating GloVe features: {str(e)}")
    
    # ----- FAST-TEXT VECTORIZER ------
    
    def fasttext(self, vector_size: int = None, window: int = 5, min_count: int = 1, workers: int = 4, precision: type = np.float32) -> tuple:
        """
        Generate semantic features using the FastText model (skip-gram) and return the vectorizer and feature matrix.
    
        Arguments:
        ----------
            vector_size       : Dimensionality of word embeddings (default: 10000).
            window            : Context window size (default: 5).
            min_count         : Ignores words with frequency lower than this (default: 1).
            workers           : Number of worker threads to train the model (default: 4).
            precision         : Data type for the feature matrix (default: np.float32).
    
        Returns:
        --------
            tuple:
                - FastText    : The trained FastText model (vectorizer).
                - np.ndarray  : Document-level feature matrix (each document represented as the average of its word vectors).
        """
        try:
            print("Creating FastText (skip-gram) features")
            
            if vector_size is None:
                vector_size = self.max_features
        
            tokenized_texts         = [doc.split() for doc in self.texts]
        
            max_features            = self.max_features

            fasttext_model          = FastText(sentences   = tokenized_texts, 
                                               vector_size = vector_size, 
                                               window      = window, 
                                               min_count   = min_count, 
                                               workers     = workers, 
                                               sg          = 1
                                               )

            features                = []
        
            for tokens in tokenized_texts:
                vectors             = [fasttext_model.wv[word] for word in tokens if word in fasttext_model.wv]
            
                if vectors:
                    document_vector = np.mean(vectors, axis = 0)
                else:
                    document_vector = np.zeros(vector_size)
            
                features.append(document_vector)
        
            feature_matrix           = np.array(features, precision)

            if self.max_features is not None and self.max_features < vector_size:
                feature_matrix       = feature_matrix[:, :self.max_features]
        
            print(f"Created {MAX_FEATURES} FastText (skip-gram) features with shape: {feature_matrix.shape}")
        
            return fasttext_model, feature_matrix

        except Exception as e:
            raise Exception(f"Error in creating FastText (skip-gram) features: {str(e)}")
    
    
    # ----- CONTEXTUAL EMBEDDING -----
    class Contextual_Embedding:
        """
        
        Class to generate Contextual Embeddings using various models using ELMo, BERT, GPT.
        
        """
        
        def __init__(self, texts: list):
            
            """
            Initialize Semantic_Feature_Engineering with texts and parameters.
        
            Arguments:
            ----------
                texts        : List of preprocessed text documents.
            
            Raises:
            -------
                ValueError   : If texts is empty or parameters are invalid.
            """
            if not texts:
                raise ValueError("Input texts cannot be empty")
            
            self.texts        = texts
            self.max_features = None
        
        # ----- ELMO EMBEDDING -----
        
        def elmo(self, model_url: str, batch_size: int = 32) -> np.ndarray:
            
            """
            Generate contextual embeddings using ELMo model and return the feature matrix.
            
            Arguments:
            ----------
                options_file   : Path to the ELMo options file.
                weight_file    : Path to the ELMo pre-trained weights file.
                batch_size     : Batch size for processing text (default: 32).
        
            Returns:
            --------
                - np.ndarray   : Document-level feature matrix (average ELMo embeddings for each document).
                - elmo         : The trained ELMo model (vectorizer).
            """
            
            try:
                print("Creating ELMo Model Features")
    
                elmo_model = hub.load(model_url)

            
                document_embeddings  = []
            
                for text in self.texts:
                    tokens           = text.split()
                
                    if len(tokens) == 0:
                        document_embeddings.append(np.zeros(1024)) 
                        continue
                
                    embeddings       = elmo_model.signatures["default"](input={"tokens": tokens})["output"]
                
                    document_vector  = np.mean(embeddings, axis=0)
                
                    document_embeddings.append(document_vector)
            
                feature_matrix       = np.array(document_embeddings, dtype = np.float32)

                if self.max_features is not None and self.max_features < 1024:
                    feature_matrix   = feature_matrix[:, :self.max_features]
                    
                print(f"Created {MAX_FEATURES} ELMo Semantic Features: {feature_matrix.shape}")
            
                return feature_matrix, elmo_model
            
            except Exception as e:
                raise Exception(f"Error in creating ELMo features: {str(e)}")
    
    # ------ WORDNET FEATURES -----

    def wordnet(self) -> tuple:
        """
        Generate semantic features using WordNet, including synonyms, hypernyms, hyponyms, 
        and meronyms, and return the feature matrix and WordNet corpus.
    
        Arguments:
        ----------
            None
    
        Returns:
        --------
            tuple:
                - list                 : Document-level feature matrix where each document is represented 
                                         as aggregated WordNet-based features.
                - WordNetCorpusReader  : The WordNet corpus used for feature extraction.
        """
    
        try:
            print("Creating WordNet-based semantic features")
        
            wordnet_features_list = []

            for doc in self.texts:

                synonyms           = set()
                hypernyms          = set()
                hyponyms           = set()
                meronyms           = set()

                for word in doc.split():
                    synsets        = wn.synsets(word)
                    
                    # SYNONYMS
                    for synset in synsets:
                        synonyms.update(lemma.name() for lemma in synset.lemmas())
                    
                    # HYPERNYMS
                    for synset in synsets:
                        hypernyms.update(lemma.name() for hyper in synset.hypernyms() for lemma in hyper.lemmas())

                    # HYPONYMS
                    for synset in synsets:
                        hyponyms.update(lemma.name() for hypo in synset.hyponyms() for lemma in hypo.lemmas())

                    # MERONYMS
                    for synset in synsets:
                        meronyms.update(lemma.name() for mero in synset.part_meronyms() for lemma in mero.lemmas())

                document_features  = {"synonyms": list(synonyms),"hypernyms": list(hypernyms),"hyponyms": list(hyponyms),"meronyms": list(meronyms)}

                wordnet_features_list.append(document_features)

            feature_matrix         = np.array([len(doc_features["synonyms"]) 
                                               for doc_features in wordnet_features_list], 
                                              dtype = np.float32).reshape(-1, 1)

            print(f"Created {MAX_FEATURES} WordNet-based features with shape: {feature_matrix.shape}")

            return wn, feature_matrix

        except Exception as e:
            raise Exception(f"Error in creating WordNet-based features: {str(e)}")
    
    # ----- BERT LEVEL FEATURES -----

    def bert(self, max_seq_length: int = 128, max_features: int = None) -> tuple:
        """
        Generate semantic features using a pre-trained BERT model and return the transformer, feature matrix, and feature names.

        Arguments:
        ----------
            max_seq_length     : Maximum sequence length for BERT input (default: 128)
            max_features       : Number of features to reduce the embeddings and feature names to (default: None, uses MAX_FEATURES).

        Returns:
        --------
            tuple:
                - BertModel    : The loaded pre-trained BERT model.
                - np.ndarray   : Document-level feature matrix (each document represented as the CLS token embedding).
                - list         : List of reduced feature names (length equal to max_features).
        """
        
        try:
            if max_features is None:
                max_features = MAX_FEATURES

            print(f"Creating BERT-based features using pre-trained model")

            config                 = PretrainedConfig.from_json_file(BERT_CONFIG)
            
            tokenizer              = BertTokenizer.from_pretrained(BERT_TOKENIZER_CONFIG,
                                                                   tokenizer_file = BERT_TOKENIZER,
                                                                   vocab_file     = BERT_VOCABULARY,)
            
            model                  = BertModel.from_pretrained(BERT_MODEL_SAFETENSORS,
                                                               config           = config,
                                                               local_files_only = True,)
            
            model.eval()

            tokenized_texts        = [tokenizer
                                      (
                                          text,
                                          max_length     = max_seq_length,
                                          padding        = "max_length",
                                          truncation     = True,
                                          return_tensors = "pt",
                                          )
                                      for text in self.texts
                                      ]

            features               = []
            feature_names_set      = set()

            with torch.no_grad():
                for tokenized_text in tokenized_texts:
                    input_ids      = tokenized_text["input_ids"]
                    attention_mask = tokenized_text["attention_mask"]

                    outputs        = model(input_ids      = input_ids,
                                           attention_mask = attention_mask)
                    
                    cls_embedding  = outputs.last_hidden_state[:, 0, :].squeeze(0)
                    features.append(cls_embedding.numpy())

                tokens             = tokenizer.convert_ids_to_tokens(input_ids[0])
                feature_names_set.update([token for token in tokens if token not in ["[CLS]", "[SEP]", "[PAD]"]])

            feature_names          = sorted(feature_names_set)

            if len(feature_names) > max_features:
                feature_names      = feature_names[:max_features]

            feature_matrix         = np.array(features, dtype = np.float32)

            print(f"Reducing features to {max_features} dimensions using SVD...")
            svd                    = TruncatedSVD(n_components = max_features)
            reduced_feature_matrix = svd.fit_transform(feature_matrix)

            print(f"Created {max_features} BERT-based features with shape: {reduced_feature_matrix.shape}")
            return model, tokenizer, reduced_feature_matrix, feature_names

        except Exception as e:
            raise Exception(f"Error in creating BERT-based features: {str(e)}")
        
        
    
    # ----- DISTILBERT LEVEL FEATURES -----

    def distilbert(self, max_seq_length: int = 128, max_features: int = None) -> tuple:
        """
        Generate semantic features using a pre-trained DistilBERT model and return the transformer, feature matrix, and feature names.

        Arguments:
        ----------
            max_seq_length : Maximum sequence length for DistilBERT input (default: 128).
            max_features   : Number of features to reduce the embeddings to (default: None, uses MAX_FEATURES).

        Returns:
        --------
            tuple:
                - DistilBertModel : The loaded pre-trained DistilBERT model.
                - np.ndarray      : Document-level feature matrix (each document represented as the CLS token embedding).
                - list            : List of extracted feature names (unique tokens).
        """

        try:
            if max_features is None:
                max_features = MAX_FEATURES

            print("Creating DistilBERT-based features using pre-trained model")

            config                  = PretrainedConfig.from_json_file(DISTILBERT_CONFIG)
        
            tokenizer               = DistilBertTokenizer.from_pretrained(DISTILBERT_TOKENIZER_CONFIG,
                                                                          tokenizer_file = DISTILBERT_TOKENIZER,
                                                                          vocab_file     = DISTILBERT_VOCABULARY,)
        
            model                   = DistilBertModel.from_pretrained(DISTILBERT_MODEL_SAFETENSORS,
                                                                      config           = config,
                                                                      local_files_only = True,)

            model.eval()

            tokenized_texts         = [tokenizer(text,
                                                 max_length      = max_seq_length,
                                                 padding         = "max_length",
                                                 truncation      = True,
                                                 return_tensors  = "pt",
                                                 )
                                       for text in self.texts
                                       ]

            features                = []
            feature_names_set       = set()

            with torch.no_grad():
                for tokenized_text in tokenized_texts:
                    input_ids       = tokenized_text["input_ids"]
                    attention_mask  = tokenized_text["attention_mask"]

                    outputs         = model(input_ids      = input_ids, 
                                            attention_mask = attention_mask)
                    
                    cls_embedding   = outputs.last_hidden_state[:, 0, :].squeeze(0)
                    features.append(cls_embedding.numpy())

                    tokens          = tokenizer.convert_ids_to_tokens(input_ids[0])
                    feature_names_set.update([token for token in tokens if token not in ["[CLS]", "[SEP]", "[PAD]"]])

            feature_names           = sorted(feature_names_set)

            feature_matrix          = np.array(features, dtype=np.float32)

            print(f"Reducing features to {max_features} dimensions using SVD...")

            svd                     = TruncatedSVD(n_components = max_features)
            reduced_feature_matrix  = svd.fit_transform(feature_matrix)

            print(f"Created {max_features} DistilBERT-based features with shape: {reduced_feature_matrix.shape}")

            return model, tokenizer, reduced_feature_matrix, feature_names

        except Exception as e:
            raise Exception(f"Error in creating DistilBERT-based features: {str(e)}")

### TRANFORM VECTORIZER

In [12]:
## ----- DONE BY PRIYAM PAL -----

def vector_transform(texts, model, tokenizer = None, model_type = None, max_seq_length = 128):
    
    """
    Transform a list of texts into sentence embeddings using Word2Vec, FastText, or Transformer-based models.

    Arguments:
    ----------
    
        texts           : List of sentences (strings).
        model           : Pre-trained Word2Vec/FastText model or Transformer model (e.g., BERT).
        tokenizer       : Tokenizer for Transformer models (required if model_type is 'transformer').
        model_type      : Type of the model ('word2vec' or 'transformer').
        max_seq_length  : Maximum sequence length for Transformer models (default: 128).

    Returns:
    ----------
    
        NumPy array of sentence embeddings.
    """
    
    transformed = []

    if model_type == "word2vec" or model_type == "fasttext":
        
        for text in texts:
            words                = text.split()
            word_vectors         = [model.wv[word] for word in words if word in model.wv]

            if word_vectors:
                sentence_vector  = np.mean(word_vectors, axis=0)
            else:
                sentence_vector  = np.zeros(model.vector_size)

            transformed.append(sentence_vector)

    elif model_type == "distilbert" or model_type == "bert":
        
        if tokenizer is None:
            raise ValueError("Tokenizer is required for Transformer models.")

        model.eval()
        
        with torch.no_grad():
            for text in texts:
                encoded        = tokenizer(text, 
                                           max_length     = max_seq_length,
                                           padding        = "max_length",
                                           truncation     = True,
                                           return_tensors = "pt",)
                input_ids      = encoded["input_ids"]
                attention_mask = encoded["attention_mask"]

                outputs        = model(input_ids      = input_ids, 
                                       attention_mask = attention_mask)

                cls_embedding  = outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()
                
                transformed.append(cls_embedding)

    else:
        raise ValueError("Unsupported model_type. Use 'word2vec' or 'transformer'.")

    return np.array(transformed, dtype=np.float32)

### FEATURE SELECTOR

In [14]:
class TextFeatureSelector:
    """
    A class for implementing various feature selection techniques for text data
    
    Attributes:
    -----------
        X           { spmatrix } : Feature matrix
        
        y           { ndarray }  : Target labels

        feature_names { list }   : Names of features
        
        n_features    { int }    : Number of features to select
    """
    
    def __init__(self, X: spmatrix, y: np.ndarray, feature_names: list, n_features: int = None) -> None:
        """
        Initialize TextFeatureSelector with feature matrix and labels
        
        Arguments:
        ----------
            X             : Sparse feature matrix
            
            y             : Target labels
            
            feature_names : List of feature names
            
            n_features    : Number of features to select (default: 100% of input features)
            
        Raises:
        -------
            ValueError    : If inputs are invalid or incompatible
        """
        if (X.shape[0] != len(y)):
            raise ValueError("Number of samples in X and y must match")
            
        if (X.shape[1] != len(feature_names)):
            raise ValueError("Number of features must match length of feature_names")
            
        self.X             = X
        self.y             = y
        self.feature_names = feature_names
        self.n_features    = n_features or X.shape[1]  # Default 100% of the input features
        
        
    def chi_square_selection(self) -> tuple:
        """
        Perform chi-square feature selection
        
        Returns:
        --------
            { tuple } : Tuple containing: - Selected feature indices
                                          - Chi-square scores
        """
        try:
            print("Performing chi-square feature selection...")
            
            # Scale features to non-negative for chi-square
            scaler            = MinMaxScaler()
            X_scaled          = scaler.fit_transform(self.X.toarray())
            
            # Apply chi-square selection
            selector          = SelectKBest(score_func = chi2, 
                                            k          = self.n_features)
            
            selector.fit(X_scaled, self.y)
            
            # Get selected features and scores
            selected_features = np.where(selector.get_support())[0]
            scores            = selector.scores_
            
            # Sort features by importance
            sorted_idx        = np.argsort(scores)[::-1]
            selected_features = sorted_idx[:self.n_features]
            
            print(f"Selected {len(selected_features)} features using chi-square")
            
            return selected_features, scores
            
        except Exception as e:
            raise
            
    def information_gain_selection(self) -> tuple:
        """
        Perform information gain feature selection
        
        Returns:
        --------
            { tuple } : Tuple containing: - Selected feature indices
                                          - Information gain scores
        """
        try:
            print("Performing information gain selection...")
            
            # Calculate mutual information scores
            selector          = SelectKBest(score_func = mutual_info_classif, 
                                            k          = self.n_features)
            selector.fit(self.X, self.y)
            
            # Get selected features and scores
            selected_features = np.where(selector.get_support())[0]
            scores            = selector.scores_
            
            # Sort features by importance
            sorted_idx        = np.argsort(scores)[::-1]
            selected_features = sorted_idx[:self.n_features]
            
            print(f"Selected {len(selected_features)} features using information gain")
            
            return selected_features, scores
            
        except Exception as e:
            raise
            
    def correlation_based_selection(self, threshold: float = 0.8) -> np.ndarray:
        """
        Perform correlation-based feature selection
        
        Arguments:
        ----------
            threshold { float } : Correlation threshold for feature removal
            
        Returns:
        --------
               { ndarray }      :  Selected feature indices
        """
        try:
            print("Performing correlation-based selection...")
            
            # Convert sparse matrix to dense for correlation calculation
            X_dense         = self.X.toarray()
            
            # Calculate correlation matrix
            corr_matrix     = np.corrcoef(X_dense.T)
            
            # Find highly correlated feature pairs
            high_corr_pairs = np.where(np.abs(corr_matrix) > threshold)
            
            # Keep track of features to remove
            to_remove       = set()
            
            # For each pair of highly correlated features
            for i, j in zip(*high_corr_pairs):
                if ((i != j) and (i not in to_remove) and (j not in to_remove)):
                    # Calculate correlation with target for both features
                    corr_i = mutual_info_score(X_dense[:, i], self.y)
                    corr_j = mutual_info_score(X_dense[:, j], self.y)
                    
                    # Remove feature with lower correlation to target
                    if (corr_i < corr_j):
                        to_remove.add(i)
                        
                    else:
                        to_remove.add(j)
            
            # Get selected features
            all_features      = set(range(self.X.shape[1]))
            selected_features = np.array(list(all_features - to_remove))
            
            # Select top k features if more than n_features remain
            if (len(selected_features) > self.n_features):
                # Calculate mutual information for remaining features
                mi_scores         = mutual_info_classif(self.X[:, selected_features], self.y)
                top_k_idx         = np.argsort(mi_scores)[::-1][:self.n_features]
                selected_features = selected_features[top_k_idx]
            
            print(f"Selected {len(selected_features)} features using correlation-based selection")
            
            return selected_features
            
        except Exception as e:
            raise
            
    def recursive_feature_elimination(self, estimator = None, cv: int = 5) -> tuple:
        """
        Perform Recursive Feature Elimination with cross-validation
        
        Arguments:
        ----------
            estimator  : Classifier to use (default: LogisticRegression)

            cv         : Number of cross-validation folds
            
        Returns:
        --------
            { tuple }  : Tuple containing: - Selected feature indices
                                           - Feature importance rankings
        """
        try:
            print("Performing recursive feature elimination...")
            
            # Use logistic regression if no estimator provided
            if (estimator is None):
                estimator = LogisticRegression(max_iter=1000)
            
            # Perform RFE with cross-validation
            selector = RFECV(estimator              = estimator,
                             min_features_to_select = self.n_features,
                             cv                     = cv,
                             n_jobs                 = -1)
            
            selector.fit(self.X, self.y)
            
            # Get selected features and rankings
            selected_features = np.where(selector.support_)[0]
            rankings          = selector.ranking_
            
            print(f"Selected {len(selected_features)} features using RFE")
            
            return selected_features, rankings
            
        except Exception as e:
            raise
           
        
    def forward_selection(self, estimator = None, cv: int = 5) -> np.ndarray:
        """
        Perform forward feature selection
        
        Arguments:
        ----------
            estimator : Classifier to use (default: LogisticRegression)
            
            cv        : Number of cross-validation folds
            
        Returns:
        --------
            Selected feature indices
        """
        try:
            print("Performing forward selection...")
            
            if (estimator is None):
                estimator = LogisticRegression(max_iter=1000)
            
            selected_features  = list()
            remaining_features = list(range(self.X.shape[1]))
            
            for i in tqdm(range(self.n_features)):
                best_score   = -np.inf
                best_feature = None
                
                # Try adding each remaining feature
                for feature in remaining_features:
                    current_features = selected_features + [feature]
                    X_subset         = self.X[:, current_features]
                    
                    # Calculate cross-validation score
                    scores = cross_val_score(estimator, 
                                             X_subset, 
                                             self.y,
                                             cv      = cv, 
                                             scoring = 'accuracy')
                    
                    avg_score = np.mean(scores)
                    
                    if (avg_score > best_score):
                        best_score   = avg_score
                        best_feature = feature
                
                if (best_feature is not None):
                    selected_features.append(best_feature)
                    remaining_features.remove(best_feature)
                
            print(f"Selected {len(selected_features)} features using forward selection")
            
            return np.array(selected_features)
            
        except Exception as e:
            raise
            
    def backward_elimination(self, estimator = None, cv: int = 5) -> np.ndarray:
        """
        Perform backward feature elimination
        
        Arguments:
        ----------
            estimator : Classifier to use (default: LogisticRegression)
            
            cv        : Number of cross-validation folds
            
        Returns:
        --------
            Selected feature indices
        """
        try:
            print("Performing backward elimination...")
            
            if (estimator is None):
                estimator = LogisticRegression(max_iter=1000)
            
            remaining_features = list(range(self.X.shape[1]))
            
            while len(remaining_features) > self.n_features:
                best_score    = -np.inf
                worst_feature = None
                
                # Try removing each feature
                for feature in remaining_features:
                    current_features = [f for f in remaining_features if f != feature]
                    X_subset         = self.X[:, current_features]
                    
                    # Calculate cross-validation score
                    scores           = cross_val_score(estimator, 
                                                       X_subset, 
                                                       self.y,
                                                       cv      = cv, 
                                                       scoring = 'accuracy')
                
                    avg_score = np.mean(scores)
                    
                    if (avg_score > best_score):
                        best_score    = avg_score
                        worst_feature = feature
                
                if (worst_feature is not None):
                    remaining_features.remove(worst_feature)
            
            print(f"Selected {len(remaining_features)} features using backward elimination")
            return np.array(remaining_features)
            
        except Exception as e:
            raise
            

### SENTIMENT ANALYZER

In [16]:
class SentimentAnalyzer:
    """
    A class for training and evaluating sentiment analysis models, including testing on unseen data
    """

    def __init__(self, X, y, feature_eng, selected_feature_indices, test_size=0.2, random_state=42, vectorizers=None):
        """
        Initialize the SentimentAnalyzer by splitting the data

        Arguments:
        ----------
            X                        : Feature matrix (sparse matrix or ndarray)
            
            y                        : Target labels (array-like)
            
            feature_eng              : Instance of TextFeatureEngineering
            
            vectorizers              : Tuple of vectorizers used for feature transformation
            
            selected_feature_indices : Indices of selected features after feature selection
            
            test_size                : Proportion of data to use for testing (default: 0.2)
            
            random_state             : Random seed for reproducibility
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, 
                                                                                y, 
                                                                                test_size    = test_size, 
                                                                                random_state = random_state)
        
        self.feature_eng                                     = feature_eng
        self.vectorizers                                     = vectorizers
        self.selected_feature_indices                        = selected_feature_indices

        
    def train_model(self, model_type:str = "logistic_regression", kernel:str = None, **kwargs):
        """
        Train a sentiment analysis model

        Arguments:
        ----------
            model_type { str } : Type of model to train (e.g: "logistic_regression", "svm", "random_forest")
            
            kernel     { str } : Kernel type for SVM (e.g., "linear", "poly", "rbf", "sigmoid")
            
            kwargs             : Additional arguments for the model initialization

        Returns:
        --------
            Trained model
        """
        if (model_type == "logistic_regression"):
            model = LogisticRegression(max_iter = MODEL_PARAMS_DICT['max_iter'], 
                                       **kwargs)
            
        elif (model_type == "svm"):
            
            if (kernel is None):
                # Default kernel
                kernel = "rbf"  
                
            model = SVC(kernel = kernel, **kwargs)
            
        elif (model_type == "random_forest"):
            model = RandomForestClassifier(**kwargs)

        elif (model_type == "gaussian_naive_bayes"):
            if issparse(self.X_train):
                self.X_train = self.X_train.toarray()

            model = GaussianNB(**kwargs)
            
        elif (model_type == "multinomial_naive_bayes"):
            model = MultinomialNB(**kwargs)

        elif (model_type == "adaboost"):
            model = AdaBoostClassifier(**kwargs)

        elif (model_type == "gradient_boost"):
            model = GradientBoostingClassifier(**kwargs)

        elif (model_type == "lightgbm"):
            model = LGBMClassifier(**kwargs)

        elif (model_type == 'label_propagation'):
            model = LabelPropagation(kernel      = 'knn',
                                     n_neighbors = MODEL_PARAMS_DICT['n_neighbors'], 
                                     max_iter    = MODEL_PARAMS_DICT['max_iter'], 
                                     tol         = MODEL_PARAMS_DICT['tol'],
                                     **kwargs
                                    )

        elif (model_type == "multilayer_perceptron"):
            model = MLPClassifier(hidden_layer_sizes = MODEL_PARAMS_DICT['hidden_layer_size'], 
                                  max_iter           = MODEL_PARAMS_DICT['max_iter'], 
                                  **kwargs)

        elif (model_type == 'hist_gradient_boosting_classifier'):
            if issparse(self.X_train):
                self.X_train = self.X_train.toarray()

            model = HistGradientBoostingClassifier(loss              = MODEL_PARAMS_DICT['loss'], 
                                                   learning_rate     = MODEL_PARAMS_DICT['learning_rate'], 
                                                   max_iter          = MODEL_PARAMS_DICT['max_iter'],
                                                   min_samples_leaf  = MODEL_PARAMS_DICT['min_samples_leaf'],
                                                   l2_regularization = MODEL_PARAMS_DICT['l2_regularization'],
                                                   max_features      = MODEL_PARAMS_DICT['max_features'],
                                                   **kwargs)


        elif (model_type == "logistic_decision_tree"):
            # Create a logistic regression model
            logistic_model      = LogisticRegression(max_iter = MODEL_PARAMS_DICT['max_iter'], 
                                                     penalty  = MODEL_PARAMS_DICT['penalty'], 
                                                     C        = MODEL_PARAMS_DICT['C'], 
                                                     solver   = MODEL_PARAMS_DICT['solver'],
                                                     **kwargs)

            # Create a decision tree model
            decision_tree_model = DecisionTreeClassifier(max_depth         = MODEL_PARAMS_DICT['max_depth'], 
                                                         min_samples_split = MODEL_PARAMS_DICT['min_samples_split'], 
                                                         min_samples_leaf  = MODEL_PARAMS_DICT['min_samples_leaf'],
                                                         **kwargs)

            # Combine them in a stacking model
            model               = StackingClassifier(estimators      = [('decision_tree', decision_tree_model)], 
                                                     final_estimator = logistic_model, 
                                                     stack_method    = 'predict_proba',
                                                     **kwargs)
        
        elif (model_type == "logistic_gaussian_naive_bayes"):
            # Create a logistic regression model
            logistic_model = LogisticRegression(max_iter = MODEL_PARAMS_DICT['max_iter'], 
                                                penalty  = MODEL_PARAMS_DICT['penalty'], 
                                                C        = MODEL_PARAMS_DICT['C'], 
                                                solver   = MODEL_PARAMS_DICT['solver'],
                                                **kwargs)

            # Gaussian Naive Bayes does not work with sparse matrices, so convert to dense if needed
            if issparse(self.X_train):
                self.X_train = self.X_train.toarray()

            # Create Gaussian Naive Bayes model
            gaussian_naive_bayes = GaussianNB()

            # Combine them in a stacking model (Logistic Regression as base model, Gaussian Naive Bayes as final estimator)
            model                = StackingClassifier(estimators      = [('logistic_regression', logistic_model)], 
                                                      final_estimator = gaussian_naive_bayes, 
                                                      stack_method    = 'predict_proba',
                                                      **kwargs)
        
        else:
            raise ValueError("Unsupported model_type. Choose from : 'logistic_regression', 'svm', 'random_forest', 'multinomial_naive_bayes', \
                             'gaussian_naive_bayes', 'adaboost', 'gradient_boost', 'lightgbm', 'logistic_decision_tree', 'logistic_gaussian_naive_bayes', 'multilayer_perceptron".replace('  ', ''))

        print(f"Training {model_type}...")
        model.fit(self.X_train, self.y_train)

        return model

    def evaluate_model(self, model):
        """
        Evaluate a trained model on the test set

        Arguments:
        ----------
            model : Trained model

        Returns:
        --------
            Dictionary containing evaluation metrics
        """
        print("Evaluating model...")

        if (isinstance(model, StackingClassifier)):
            if (isinstance(model.final_estimator_, GaussianNB)):
                # Handle dense conversion for GaussianNB final estimator in stacking model
                X_test_dense = self.X_test.toarray() if hasattr(self.X_test, "toarray") else self.X_test
                y_pred        = model.predict(X_test_dense)
        
        elif ((isinstance(model, GaussianNB)) or (isinstance(model, HistGradientBoostingClassifier))):
            # Directly handle GaussianNB or HistGradientBoostingClassifier
            X_test_dense = self.X_test.toarray() if hasattr(self.X_test, "toarray") else self.X_test
            y_pred       = model.predict(X_test_dense)
        
        else:
            y_pred = model.predict(self.X_test)
            
        accuracy = accuracy_score(self.y_test, y_pred)
        report   = classification_report(self.y_test, y_pred)
        cm       = confusion_matrix(self.y_test, y_pred)

        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(report)
        print("Confusion Matrix:")
        print(cm)

        return {"accuracy"              : accuracy,
                "classification_report" : report,
                "confusion_matrix"      : cm,
               }

    
    def test_on_unseen_data(self, model, unseen_texts, unseen_labels=None, **preprocessed_features):
        """
        Test the model on unseen data

        Arguments:
        ----------
            model                 : Trained model
            
            unseen_texts          : List of unseen text data

            unseen_labels         : True labels for the unseen data

            preprocessed_features : Preprocessed feature matrices (e.g., binary_features, tfidf_features, bm25_features, etc.)

        Returns:
        --------
            Predictions for the unseen data
        """
        print("Processing unseen data...")

        # Dynamically combine all passed feature matrices
        unseen_combined_features = hstack([preprocessed_features[key] for key in preprocessed_features])

        # Select features using the indices chosen during feature selection
        unseen_selected_features = unseen_combined_features[:, self.selected_feature_indices]

        # Convert unseen features to dense for Gaussian Naive Bayes
        if ((isinstance(model, GaussianNB)) or (isinstance(model, HistGradientBoostingClassifier))):
            unseen_selected_features = unseen_selected_features.toarray() if hasattr(unseen_selected_features, "toarray") else unseen_selected_features
        
        elif (isinstance(model, StackingClassifier)):
            if (isinstance(model.final_estimator_, GaussianNB)):
                unseen_selected_features = unseen_selected_features.toarray() if hasattr(unseen_selected_features, "toarray") else unseen_selected_features

        # Predict sentiments
        predictions              = model.predict(unseen_selected_features)

        # Print predictions
        print("Predictions on Unseen Data:")
        for text, pred in zip(unseen_texts, predictions):
            print(f"Text: {text}\nPredicted Sentiment: {pred}\n")

        # Compute accuracy if unseen_labels are provided
        if unseen_labels is not None:
            print(f"Number of unseen_labels: {len(unseen_labels)}")

            if (len(unseen_labels) != len(predictions)):
                raise ValueError("The number of unseen_labels must match the number of predictions.")
                
            accuracy = accuracy_score(unseen_labels, predictions)
            print(f"Accuracy on Unseen Data : {accuracy:.4f}")
            return predictions, accuracy

        return predictions

### MAIN CONTROLLER FUNCTION

In [18]:
# LOAD THE DATA
imdb_ratings_data                            = load_csv_data(filepath = DATA_PATH)

# PREPROCESSING THE DATA
preprocessor                                 = TextPreprocessor()
imdb_ratings_data["clean_text"]              = imdb_ratings_data["review"].apply(preprocessor.clean_text)

In [19]:
# INITIALISING THE SEMANTIC FEATURE ENGINEERING CLASS
semantic_Feature_Eng                    = Semantic_Feature_Engineering(texts        = imdb_ratings_data['clean_text'].tolist(), 
                                                                       max_features = MAX_FEATURES
                                                                       )

# INITIALISING THE CONTEXTUAL EMBEDDING CLASS INSIDE SEMANTIC FEATURE ENGINEERING CLASS
contextual_Embedding                    = semantic_Feature_Eng.Contextual_Embedding(texts = imdb_ratings_data['clean_text'].tolist())

In [20]:
# ----------  CREATING THE FEATURES ----------


# ----- WORD - LEVEL FEATURES -----

# # count_vectorizer, count_features             = word_level_feature_eng.create_count_bow()
# freq_vectorizer, freq_features               = word_level_feature_eng.create_frequency_bow()
# # binary_vectorizer, binary_features           = word_level_feature_eng.create_binary_bow()
# # tfidf_vectorizer, tfidf_features             = word_level_feature_eng.create_tfidf()
# std_tfidf_vectorizer, std_tfidf_features     = word_level_feature_eng.create_standardized_tfidf()
# # bm25_transformer, bm25_features              = word_level_feature_eng.create_bm25()
# bm25f_transformer, bm25f_features            = word_level_feature_eng.create_bm25f()
# # bm25l_transformer, bm25l_features            = word_level_feature_eng.create_bm25l()
# # bm25t_transformer, bm25t_features            = word_level_feature_eng.create_bm25t()
# bm25_plus_transformer, bm25_plus_features    = word_level_feature_eng.create_bm25_plus()
# skipgrams_vectorizer, skipgram_features      = word_level_feature_eng.create_skipgrams()
# pos_ngram_vectorizer, pos_ngram_features     = word_level_feature_eng.create_positional_ngrams()

# # ----- CONTEXTUALS FEATURES -----

# window_vectorizer, window_features           = contextuals.window_based()
# # position_vectorizer, positional_features     = contextuals.position_based()
# ngram_vectorizer, trigrams                   = contextuals.generate_ngrams()
# # cross_doc_vectorizer, tfidf_matrix           = contextuals.cross_document()


# ----- SEMANTIC FEATURES -----

# w2v_model, w2v_features                         = semantic_Feature_Eng.word2vec_cbow()
# glove_embeddings, glove_model                   = semantic_Feature_Eng.glove(GLOVE_MODEL_PATH)
# fasttext_model, fasttext_features               = semantic_Feature_Eng.fasttext()
# wordnet_model, wordnet_features                 = semantic_Feature_Eng.wordnet()
# bert_model, bert_features, bert_feature_names   = semantic_Feature_Eng.bert()


# CONVERTING THE FEATURES INTO FEATURE MATRIX
# w2v_sparse                              = csr_matrix(w2v_features)
# glove_sparse                            = csr_matrix(glove_embeddings)
# fasttext_sparse                         = csr_matrix(fasttext_features)
# bert_sparse                             = csr_matrix(bert_features)

# # COMBINING THE SEMANTIC, WORD - LEVEL FEATURES, CONTEXTUAL FEATURES
# combined_features                       = hstack([w2v_sparse, 
#                                                   # glove_sparse, 
#                                                   # fasttext_sparse,
#                                                   freq_features, 
#                                                   std_tfidf_features,
#                                                   bm25f_features,
#                                                   bm25_plus_features,
#                                                   skipgram_features,
#                                                   pos_ngram_features,
#                                                   window_features,
#                                                   # positional_features,
#                                                   trigrams,
#                                                   # tfidf_matrix 
#                                                   ])

bert_model, bert_tokenizer, bert_features, bert_feature_names = semantic_Feature_Eng.bert()
bert_sparse                                   = csr_matrix(bert_features)

combined_features                             = hstack([bert_sparse])

print(f"Combined Feature Matrix Shape: {combined_features.shape}")

Creating BERT-based features using pre-trained model
Original number of tokens: 65
Reducing features to 500 dimensions using SVD...
Created 500 BERT-based features with shape: (50000, 500)
Combined Feature Matrix Shape: (50000, 500)


In [21]:
# ----- EXTRACTING THE FEATURE NAMES -----

feature_names                            = []

# w2v_feature_names                        = w2v_model.wv.index_to_key[:MAX_FEATURES]
# fasttext_feature_names                   = fasttext_model.wv.index_to_key[:MAX_FEATURES]

# # COMBINING THE FEATURE NAMES OF SEMANTIC, WORD-LEVEL, CONTEXTUAL FEATURES
# feature_names                            = (list(w2v_feature_names) + 
#                                             # list(fasttext_feature_names) + 
#                                             list(freq_vectorizer.get_feature_names_out()) +
#                                             list(std_tfidf_vectorizer.get_feature_names_out()) +
#                                             list(bm25f_transformer.count_vectorizer.get_feature_names_out()) +
#                                             list(bm25_plus_transformer.count_vectorizer.get_feature_names_out()) +
#                                             list(skipgrams_vectorizer.get_feature_names_out()) +
#                                             list(pos_ngram_vectorizer.get_feature_names_out()) + 
#                                             list(window_vectorizer.get_feature_names_out()) +
#                                             # list(position_vectorizer.get_feature_names_out()) +
#                                             # list(cross_doc_vectorizer.get_feature_names_out())
#                                             list(ngram_vectorizer.get_feature_names_out())
#                                            )

feature_names = list(bert_feature_names)
print(f"Extracted {len(bert_feature_names)} BERT feature names:")

print(f"Number of feature names extracted: {len(feature_names)}")

Extracted 65 BERT feature names:
Number of feature names extracted: 65


In [22]:
# ----- SELECTING THE FEATURES -----

# FEATURE SELECTOR
feature_selector                         = TextFeatureSelector(X             = combined_features,
                                                               y             = imdb_ratings_data['sentiment'].values,
                                                               feature_names = feature_names,
                                                               n_features    = MAX_FEATURES
                                                               )

# CHI-SQUARE SELECTION
chi_square_features, chi_square_scores   = feature_selector.chi_square_selection()

# COMBINING THE FEATURES
selected_combined_features               = combined_features[:, chi_square_features]


# # VECTORIZERS TUPLE
# vectorizers_tuple                        = (w2v_model,
#                                             # fasttext_model,
#                                             freq_vectorizer,
#                                             std_tfidf_vectorizer, 
#                                             bm25f_transformer,
#                                             bm25_plus_features, 
#                                             skipgrams_vectorizer, 
#                                             pos_ngram_vectorizer,
#                                             window_vectorizer,
#                                             # position_vectorizer,
#                                             ngram_vectorizer,
#                                             # cross_doc_vectorizer
#                                             )

vectorizers_tuple = (bert_model)

ValueError: Number of features must match length of feature_names

In [None]:
# ----- SENTIMENTAL ANALYSIS -----

sentiment_analyzer                       = SentimentAnalyzer(X                        = selected_combined_features, 
                                                             y                        = imdb_ratings_data["sentiment"].values,
                                                             feature_eng              = semantic_Feature_Eng,
                                                             vectorizers              = vectorizers_tuple,
                                                             selected_feature_indices = chi_square_features
                                                             )

In [None]:
# ----- MODEL FITTING ON TRAINING DATA -----


# TRAIN THE MODEL
trained_model                             = sentiment_analyzer.train_model(model_type = MODEL_NAME, kernel = KERNEL_NAME)

# EVALUATING THE RESULTS OF THE MODEL
evaluation_results                        = sentiment_analyzer.evaluate_model(trained_model)

In [None]:
test_data                                 = load_csv_data(filepath = TEST_DATA_PATH)

In [None]:
bert_features = vector_transform(list(test_data['Text'], bert_model, bert_tokenizer, model_type = "bert")

In [None]:
# COMBINING THE FEATURES
combined_features_transformed             = np.hstack([w2v_features_transformed])

# CONVERTING TO SPARSE MATRIX
combined_features_sparse                  = csr_matrix(combined_features_transformed)

In [None]:
# # ----- PREDICT THE TRAINED MODEL USING UNSEEN DATA USING SEMANTIC, WORD-LEVEL, CONTEXTUAL FEATURES -----

model_predictions, unseen_accuracy        = sentiment_analyzer.test_on_unseen_data(model               = trained_model, 
                                                                                   unseen_texts        = list(test_data['Text']),
                                                                                   unseen_labels       = list(test_data['Sentiment']),
                                                                                   combined_features   = combined_features_sparse,
                                                                                   )


all_test_data                              = {'texts'            : list(test_data['Text']), 
                                              'true_labels'      : list(test_data['Sentiment']), 
                                              'predicted_labels' : list(model_predictions)
                                              }

model_prediction_df                        = pd.DataFrame.from_dict(data   = all_test_data, 
                                                                    orient = 'index').T

model_prediction_df.to_csv(path_or_buf     = SAVE_PATH_VARIABLE,
                           index           = False)

print (f"Sentiment Analysis result by {MODEL_NAME} Model of Max Features {MAX_FEATURES} has been saved to : {SAVE_PATH_VARIABLE}")
