In [138]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import textstat
from sklearn.model_selection import cross_val_score, train_test_split
import tqdm
from scipy.sparse import spmatrix
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('~/Project/NLP_Tasks/data/IMDB_Dataset.csv')

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
class TextPreprocessor:
    """
    A class for preprocessing text data through cleaning, tokenization, and normalization
    
    Attributes:
    -----------
        lemmatizer : WordNetLemmatizer instance for word lemmatization
        
        stop_words : Set of stopwords to be removed from text
    """ 
    def __init__(self):
        """
        Initialize the TextPreprocessor with required NLTK resources
        
        Raises:
        -------
            LookupError : If required NLTK resources cannot be downloaded
        """
        try:
            # Download required NLTK data
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
            nltk.download('wordnet', quiet=True)
            nltk.download('punkt_tab', quiet=True)
            
            self.lemmatizer = WordNetLemmatizer()
            self.stop_words = set(stopwords.words('english'))
            
        except LookupError as e:
            raise
    
    def clean_text(self, text:str) -> str:
        """
        Clean and normalize input text by removing HTML tags, special characters,
        and applying text normalization techniques
        
        Arguments:
        ----------
            text { str }      : Input text to be cleaned
            
        Raises:
        -------
            ValueError        : If input text is None or empty
            
            TextCleaningError : If any error occurs at any step of text cleaning process
            
        Returns:
        --------
                { str }       : Cleaned and normalized text
        """
        if ((not text) or (not isinstance(text, str))):
            raise ValueError("Input text must be a non-empty string")
            
        try:
            # Remove HTML tags
            text   = re.sub('<[^>]*>', '', text)
            
            # Remove special characters and digits
            text   = re.sub('[^a-zA-Z\s]', '', text)
            
            # Convert to lowercase
            text   = text.lower()
            
            # Tokenization
            tokens = word_tokenize(text)
            
            # Remove stopwords and lemmatize
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]
            
            return ' '.join(tokens)
        
        except Exception as TextCleaningError:
            raise

  text   = re.sub('[^a-zA-Z\s]', '', text)


In [5]:
preprocessor = TextPreprocessor()
data['cleaned_review'] = data['review'].apply(preprocessor.clean_text)

In [6]:
data.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


# Statistical Feature Engineering

In [8]:
def document_statistics(df):
    df['char_count'] = df['cleaned_review'].apply(lambda x: len(x))
    df['word_count'] = df['cleaned_review'].apply(lambda x: len(x.split()))
    df['sent_count'] = df['review'].apply(lambda x: len(sent_tokenize(x)))
    df['AWL'] = df['char_count'].div(df['word_count'])
    df['ASL'] = df['word_count'].div(df['sent_count'])
    df['unique_word_count'] = df['cleaned_review'].apply(lambda x: len(set(word_tokenize(x))))
    df['UWR'] = df['unique_word_count'].div(df['word_count'])
    df['FRE'] = (206.835 - 1.015*df['ASL'] - 84.6*df['AWL'])
    return df

In [9]:
document_statistics(data)

Unnamed: 0,review,sentiment,cleaned_review,char_count,word_count,sent_count,AWL,ASL,unique_word_count,UWR,FRE
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...,1125,167,10,6.736527,16.700000,140,0.838323,-380.025680
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,640,84,7,7.619048,12.000000,76,0.904762,-449.916429
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,580,85,4,6.823529,21.250000,81,0.952941,-392.004338
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...,446,66,6,6.757576,11.000000,53,0.803030,-376.020909
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,851,125,9,6.808000,13.888889,101,0.808000,-383.219022
...,...,...,...,...,...,...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movie right good job wasnt creative or...,540,84,8,6.428571,10.500000,72,0.857143,-347.679643
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...,392,55,3,7.127273,18.333333,50,0.909091,-414.740606
49997,I am a Catholic taught in parochial elementary...,negative,catholic taught parochial elementary school nu...,805,115,6,7.000000,19.166667,96,0.834783,-404.819167
49998,I'm going to have to disagree with the previou...,negative,im going disagree previous comment side maltin...,819,114,8,7.184211,14.250000,109,0.956140,-415.412961


In [174]:
class Statistical_Feature_Engineering():
    """
    A class for statistical feature engineering.
    Attributes:
    ----------
    vectorizer : CountVectorizer instance for text vectorization.
    
    """
    def __init__(self,max_features=1000):
        """
        Intializes the Statistical_Feature_Engineering.
        """
        self.max_features = max_features
        self.vectorizer = CountVectorizer(max_features=self.max_features)

    def document_statistics(self,df):
        """
        Calculates basic documnet statistics i.e. Character Count, Word Count, Sentence Count, Average Word Length(AWL), 
        Average Sentence Length(ASL), Unique Word Ratio(UWR).

        Arguments:
        ----------
        df {DataFrame} : Input Data.

        Returns:
        --------
        df {DataFrame} : Output data with the calculated document statistics.
        
        """
        
        df['char_count'] = df['cleaned_review'].apply(lambda x: len(x))
        df['word_count'] = df['cleaned_review'].apply(lambda x: len(x.split()))
        df['sent_count'] = df['review'].apply(lambda x: len(sent_tokenize(x)))
        df['AWL'] = df['char_count'].div(df['word_count'])
        df['ASL'] = df['word_count'].div(df['sent_count'])
        df['unique_word_count'] = df['cleaned_review'].apply(lambda x: len(set(word_tokenize(x))))
        df['UWR'] = df['unique_word_count'].div(df['word_count'])
        return df


    def readability_score(self,df, score='FRE'):
        """
        Calculates the readability scores i.e. Flesch Readine Ease(FRE), Gunning Fog Index(GFI), SMOG Index(SMOG.

        Arguments:
        ----------
        df {DataFrame} : Input Data.

        score {str} : score type {'FRE', 'GFI', SMOG}.

        Returns:
        ---------
        fre {series} : FRE scores.

        gfi {series} : GFI scores.

        smog {series} : SMOG scores.
        
        """
        if(score == 'FRE'):
            fre = df['cleaned_review'].apply(textstat.flesch_reading_ease)
            return fre
        elif(score == 'GFI'):
            gfi = df['cleaned_review'].apply(textstat.gunning_fog)
            return gfi
        elif(score == 'SMOG'):
            smog = df['cleaned_review'].apply(textstat.smog_index)
            return smog
        else:
            raise ValueError("Unsupported score type. Choose from 'FRE', 'GFI', or 'SMOG'.")
        

    def frequency_distribution(self,df,column,fit_transform=False):
        """
        Calculates the word counts in each document.

        df {DataFrame} : Input Data.

        column {str} : Column name for calculating the frequency distribution.
        """
        if fit_transform:
            X = self.vectorizer.fit_transform(df[column])
        else:
            X = self.vectorizer.transform(df[column])
        bow = pd.DataFrame(X.toarray(), columns=self.vectorizer.get_feature_names_out())
        return bow

In [11]:
sfe = Statistical_Feature_Engineering()
data = sfe.document_statistics(data)
data['FRE'] = sfe.readability_score(data)
bow = sfe.frequency_distribution(data, column='cleaned_review')

In [12]:
data = pd.merge(left=data,right=bow,on=data.index).drop(columns=['key_0'])

In [60]:
data.head()

Unnamed: 0,review_x,sentiment,cleaned_review,char_count,word_count,sent_count,AWL,ASL,unique_word_count,UWR,...,year,yes,yet,york,youll,young,younger,youre,youve,zombie
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...,1125,167,10,6.736527,16.7,140,0.838323,...,0,0,0,0,1,0,0,0,0,0
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,640,84,7,7.619048,12.0,76,0.904762,...,0,0,0,0,0,0,0,0,0,0
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,580,85,4,6.823529,21.25,81,0.952941,...,1,0,0,0,0,1,0,0,0,0
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...,446,66,6,6.757576,11.0,53,0.80303,...,0,0,0,0,0,0,0,1,0,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,851,125,9,6.808,13.888889,101,0.808,...,0,0,0,1,0,0,0,0,0,0


In [13]:
class TextFeatureSelector:
    """
    A class for implementing various feature selection techniques for text data
    
    Attributes:
    -----------
        X           { spmatrix } : Feature matrix
        
        y           { ndarray }  : Target labels

        feature_names { list }   : Names of features
        
        n_features    { int }    : Number of features to select
    """
    
    def __init__(self, X: spmatrix, y: np.ndarray, feature_names: list, n_features: int = None) -> None:
        """
        Initialize TextFeatureSelector with feature matrix and labels
        
        Arguments:
        ----------
            X             : Sparse feature matrix
            
            y             : Target labels
            
            feature_names : List of feature names
            
            n_features    : Number of features to select (default: 100% of input features)
            
        Raises:
        -------
            ValueError    : If inputs are invalid or incompatible
        """
        if (X.shape[0] != len(y)):
            raise ValueError("Number of samples in X and y must match")
            
        if (X.shape[1] != len(feature_names)):
            raise ValueError("Number of features must match length of feature_names")
            
        self.X             = X
        self.y             = y
        self.feature_names = feature_names
        self.n_features    = n_features or X.shape[1]  # Default 100% of the input features
        
        
    def chi_square_selection(self) -> tuple:
        """
        Perform chi-square feature selection
        
        Returns:
        --------
            { tuple } : Tuple containing: - Selected feature indices
                                          - Chi-square scores
        """
        try:
            print("Performing chi-square feature selection...")
            
            # Scale features to non-negative for chi-square
            scaler            = MinMaxScaler()
            X_scaled          = scaler.fit_transform(self.X.toarray())
            
            # Apply chi-square selection
            selector          = SelectKBest(score_func = chi2, 
                                            k          = self.n_features)
            
            selector.fit(X_scaled, self.y)
            
            # Get selected features and scores
            selected_features = np.where(selector.get_support())[0]
            scores            = selector.scores_
            
            # Sort features by importance
            sorted_idx        = np.argsort(scores)[::-1]
            selected_features = sorted_idx[:self.n_features]
            
            print(f"Selected {len(selected_features)} features using chi-square")
            
            return selected_features, scores
            
        except Exception as e:
            raise
            
    def information_gain_selection(self) -> tuple:
        """
        Perform information gain feature selection
        
        Returns:
        --------
            { tuple } : Tuple containing: - Selected feature indices
                                          - Information gain scores
        """
        try:
            print("Performing information gain selection...")
            
            # Calculate mutual information scores
            selector          = SelectKBest(score_func = mutual_info_classif, 
                                            k          = self.n_features)
            selector.fit(self.X, self.y)
            
            # Get selected features and scores
            selected_features = np.where(selector.get_support())[0]
            scores            = selector.scores_
            
            # Sort features by importance
            sorted_idx        = np.argsort(scores)[::-1]
            selected_features = sorted_idx[:self.n_features]
            
            print(f"Selected {len(selected_features)} features using information gain")
            
            return selected_features, scores
            
        except Exception as e:
            raise
            
    def correlation_based_selection(self, threshold: float = 0.8) -> np.ndarray:
        """
        Perform correlation-based feature selection
        
        Arguments:
        ----------
            threshold { float } : Correlation threshold for feature removal
            
        Returns:
        --------
               { ndarray }      :  Selected feature indices
        """
        try:
            print("Performing correlation-based selection...")
            
            # Convert sparse matrix to dense for correlation calculation
            X_dense         = self.X.toarray()
            
            # Calculate correlation matrix
            corr_matrix     = np.corrcoef(X_dense.T)
            
            # Find highly correlated feature pairs
            high_corr_pairs = np.where(np.abs(corr_matrix) > threshold)
            
            # Keep track of features to remove
            to_remove       = set()
            
            # For each pair of highly correlated features
            for i, j in zip(*high_corr_pairs):
                if ((i != j) and (i not in to_remove) and (j not in to_remove)):
                    # Calculate correlation with target for both features
                    corr_i = mutual_info_score(X_dense[:, i], self.y)
                    corr_j = mutual_info_score(X_dense[:, j], self.y)
                    
                    # Remove feature with lower correlation to target
                    if (corr_i < corr_j):
                        to_remove.add(i)
                        
                    else:
                        to_remove.add(j)
            
            # Get selected features
            all_features      = set(range(self.X.shape[1]))
            selected_features = np.array(list(all_features - to_remove))
            
            # Select top k features if more than n_features remain
            if (len(selected_features) > self.n_features):
                # Calculate mutual information for remaining features
                mi_scores         = mutual_info_classif(self.X[:, selected_features], self.y)
                top_k_idx         = np.argsort(mi_scores)[::-1][:self.n_features]
                selected_features = selected_features[top_k_idx]
            
            print(f"Selected {len(selected_features)} features using correlation-based selection")
            
            return selected_features
            
        except Exception as e:
            raise
            
    def recursive_feature_elimination(self, estimator = None, cv: int = 5) -> tuple:
        """
        Perform Recursive Feature Elimination with cross-validation
        
        Arguments:
        ----------
            estimator  : Classifier to use (default: LogisticRegression)

            cv         : Number of cross-validation folds
            
        Returns:
        --------
            { tuple }  : Tuple containing: - Selected feature indices
                                           - Feature importance rankings
        """
        try:
            print("Performing recursive feature elimination...")
            
            # Use logistic regression if no estimator provided
            if (estimator is None):
                estimator = LogisticRegression(max_iter=1000)
            
            # Perform RFE with cross-validation
            selector = RFECV(estimator              = estimator,
                             min_features_to_select = self.n_features,
                             cv                     = cv,
                             n_jobs                 = -1)
            
            selector.fit(self.X, self.y)
            
            # Get selected features and rankings
            selected_features = np.where(selector.support_)[0]
            rankings          = selector.ranking_
            
            print(f"Selected {len(selected_features)} features using RFE")
            
            return selected_features, rankings
            
        except Exception as e:
            raise
           
        
    def forward_selection(self, estimator = None, cv: int = 5) -> np.ndarray:
        """
        Perform forward feature selection
        
        Arguments:
        ----------
            estimator : Classifier to use (default: LogisticRegression)
            
            cv        : Number of cross-validation folds
            
        Returns:
        --------
            Selected feature indices
        """
        try:
            print("Performing forward selection...")
            
            if (estimator is None):
                estimator = LogisticRegression(max_iter=1000)
            
            selected_features  = list()
            remaining_features = list(range(self.X.shape[1]))
            
            for i in tqdm(range(self.n_features)):
                best_score   = -np.inf
                best_feature = None
                
                # Try adding each remaining feature
                for feature in remaining_features:
                    current_features = selected_features + [feature]
                    X_subset         = self.X[:, current_features]
                    
                    # Calculate cross-validation score
                    scores = cross_val_score(estimator, 
                                             X_subset, 
                                             self.y,
                                             cv      = cv, 
                                             scoring = 'accuracy')
                    
                    avg_score = np.mean(scores)
                    
                    if (avg_score > best_score):
                        best_score   = avg_score
                        best_feature = feature
                
                if (best_feature is not None):
                    selected_features.append(best_feature)
                    remaining_features.remove(best_feature)
                
            print(f"Selected {len(selected_features)} features using forward selection")
            
            return np.array(selected_features)
            
        except Exception as e:
            raise
            
    def backward_elimination(self, estimator = None, cv: int = 5) -> np.ndarray:
        """
        Perform backward feature elimination
        
        Arguments:
        ----------
            estimator : Classifier to use (default: LogisticRegression)
            
            cv        : Number of cross-validation folds
            
        Returns:
        --------
            Selected feature indices
        """
        try:
            print("Performing backward elimination...")
            
            if (estimator is None):
                estimator = LogisticRegression(max_iter=1000)
            
            remaining_features = list(range(self.X.shape[1]))
            
            while len(remaining_features) > self.n_features:
                best_score    = -np.inf
                worst_feature = None
                
                # Try removing each feature
                for feature in remaining_features:
                    current_features = [f for f in remaining_features if f != feature]
                    X_subset         = self.X[:, current_features]
                    
                    # Calculate cross-validation score
                    scores           = cross_val_score(estimator, 
                                                       X_subset, 
                                                       self.y,
                                                       cv      = cv, 
                                                       scoring = 'accuracy')
                
                    avg_score = np.mean(scores)
                    
                    if (avg_score > best_score):
                        best_score    = avg_score
                        worst_feature = feature
                
                if (worst_feature is not None):
                    remaining_features.remove(worst_feature)
            
            print(f"Selected {len(remaining_features)} features using backward elimination")
            return np.array(remaining_features)
            
        except Exception as e:
            raise
            

In [14]:
X = data.drop(columns=['review_x','cleaned_review','sentiment'],axis=1)
y = data['sentiment']

In [15]:
X_sparse = csr_matrix(X.values)

In [16]:
selector = TextFeatureSelector(X=X_sparse, y=y, feature_names=X.columns.tolist(), n_features=700)

In [17]:
selected_features, scores = selector.chi_square_selection()

Performing chi-square feature selection...
Selected 700 features using chi-square


In [18]:
X = X.iloc[:,selected_features.tolist()]

In [104]:
class SentimentAnalyzer:
    """
    A class for training and evaluating sentiment analysis models, including testing on unseen data
    """

    def __init__(self, X, y, feature_eng, selected_feature_indices, test_size=0.2, random_state=42):
        """
        Initialize the SentimentAnalyzer by splitting the data

        Arguments:
        ----------
            X                        : Feature matrix (sparse matrix or ndarray)
            
            y                        : Target labels (array-like)
            
            feature_eng              : Instance of TextFeatureEngineering
            
            vectorizers              : Tuple of vectorizers used for feature transformation
            
            selected_feature_indices : Indices of selected features after feature selection
            
            test_size                : Proportion of data to use for testing (default: 0.2)
            
            random_state             : Random seed for reproducibility
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, 
                                                                                y, 
                                                                                test_size    = test_size, 
                                                                                random_state = random_state)
        
        self.feature_eng                                     = feature_eng
        #self.vectorizers                                     = vectorizers
        self.selected_feature_indices                        = selected_feature_indices

        
    def train_model(self, model_type:str = "logistic_regression", kernel=None, **kwargs):
        """
        Train a sentiment analysis model

        Arguments:
        ----------
            model_type { str } : Type of model to train (e.g: "logistic_regression", "svm", "random_forest")
            
            kernel     { str } : Kernel type for SVM (e.g., "linear", "poly", "rbf", "sigmoid")
            
            kwargs             : Additional arguments for the model initialization

        Returns:
        --------
            Trained model
        """
        if (model_type == "logistic_regression"):
            model = LogisticRegression(max_iter = 1000, **kwargs)
            
        elif (model_type == "svm"):
            
            if (kernel is None):
                # Default kernel
                kernel = "rbf"  
                
            model = SVC(kernel = kernel, **kwargs)
            
        elif (model_type == "random_forest"):
            model = RandomForestClassifier(**kwargs)
            
        elif model_type == "naive_bayes":
            model = MultinomialNB(**kwargs)

        elif model_type == "lightgbm":
            model = LGBMClassifier(**kwargs)

        elif model_type == "logistic_model_tree":
            # Create a logistic regression model
            logistic_model      = LogisticRegression(max_iter = 1000, **kwargs)

            # Create a decision tree model
            decision_tree_model = DecisionTreeClassifier(**kwargs)

            # Combine them in a stacking model
            model               = StackingClassifier(estimators      = [('decision_tree', decision_tree_model)], 
                                                     final_estimator = logistic_model, 
                                                     **kwargs)
        
        else:
            raise ValueError("Unsupported model_type. Choose from : 'logistic_regression', 'svm', 'random_forest', 'naive_bayes', 'lightgbm', 'logistic_model_tree'")

        print(f"Training {model_type}...")
        model.fit(self.X_train, self.y_train)

        return model

    def evaluate_model(self, model):
        """
        Evaluate a trained model on the test set

        Arguments:
        ----------
            model : Trained model

        Returns:
        --------
            Dictionary containing evaluation metrics
        """
        print("Evaluating model...")
        y_pred   = model.predict(self.X_test)

        accuracy = accuracy_score(self.y_test, y_pred)
        report   = classification_report(self.y_test, y_pred)
        cm       = confusion_matrix(self.y_test, y_pred)

        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(report)
        print("Confusion Matrix:")
        print(cm)

        return {"accuracy"              : accuracy,
                "classification_report" : report,
                "confusion_matrix"      : cm,
               }

    
    def test_on_unseen_data(self, model, unseen_texts, unseen_labels=None, **preprocessed_features):
        """
        Test the model on unseen data

        Arguments:
        ----------
            model                 : Trained model
            
            unseen_texts          : List of unseen text data

            unseen_labels         : True labels for the unseen data

            preprocessed_features : Preprocessed feature matrices (e.g., binary_features, tfidf_features, bm25_features, etc.)

        Returns:
        --------
            Predictions for the unseen data
        """
        print("Processing unseen data...")

        # Dynamically combine all passed feature matrices
        unseen_combined_features = hstack([preprocessed_features[key] for key in preprocessed_features])

        # Select features using the indices chosen during feature selection
        unseen_selected_features = unseen_combined_features[:, self.selected_feature_indices]

        # Predict sentiments
        predictions              = model.predict(unseen_selected_features)

        # Print predictions
        print("Predictions on Unseen Data:")
        for text, pred in zip(unseen_texts, predictions):
            print(f"Text: {text}\nPredicted Sentiment: {pred}\n")

        # Compute accuracy if unseen_labels are provided
        if unseen_labels is not None:
            print(f"Number of unseen_labels: {len(unseen_labels)}")

            if (len(unseen_labels) != len(predictions)):
                raise ValueError("The number of unseen_labels must match the number of predictions.")
                
            accuracy = accuracy_score(unseen_labels, predictions)
            print(f"Accuracy on Unseen Data : {accuracy:.4f}")
            return predictions, accuracy

        return predictions

In [106]:
sentiment_analyzer                            = SentimentAnalyzer(X                        = X, 
                                                                  y                        = y,
                                                                  feature_eng              = sfe,
                                                                  selected_feature_indices = selected_features)

In [108]:
logistic_model                                = sentiment_analyzer.train_model(model_type = "logistic_regression")

Training logistic_regression...


In [109]:
evaluation_results                            = sentiment_analyzer.evaluate_model(logistic_model)

Evaluating model...
Accuracy: 0.8588
Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.85      0.86      4961
    positive       0.85      0.87      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion Matrix:
[[4194  767]
 [ 645 4394]]


In [112]:
test_data = pd.read_csv('~/Project/NLP_Tasks/data/test_data.csv')

In [114]:
test_data = test_data.rename(columns={'Text':'review','Sentiment':'sentiment'})

In [116]:
preprocessor = TextPreprocessor()
test_data['cleaned_review'] = test_data['review'].apply(preprocessor.clean_text)

In [118]:
test_data.head()

Unnamed: 0,review,sentiment,cleaned_review
0,"This movie is speedy enough with plot twists, ...",negative,movie speedy enough plot twist hard understand...
1,"Seriously, this is the best movie I've ever wa...",positive,seriously best movie ive ever watched everythi...
2,"The storyline was okay, but the acting was jus...",positive,storyline okay acting mark
3,A complete disaster of a movie. Don't waste yo...,negative,complete disaster movie dont waste time
4,I can't believe how amazing this was. Totally ...,positive,cant believe amazing totally worth


In [120]:
sfe = Statistical_Feature_Engineering()
test_data = sfe.document_statistics(test_data)
test_data['FRE'] = sfe.readability_score(test_data)
bow = sfe.frequency_distribution(test_data, column='cleaned_review')

In [122]:
test_data = pd.merge(left=test_data,right=bow,on=test_data.index).drop(columns=['key_0'])

In [124]:
test_data.head()

Unnamed: 0,review,sentiment,cleaned_review,char_count,word_count,sent_count,AWL,ASL,unique_word_count,UWR,...,watched,watching,way,weekend,well,worth,wow,year,youll,youre
0,"This movie is speedy enough with plot twists, ...",negative,movie speedy enough plot twist hard understand...,62,9,1,6.888889,9.0,8,0.888889,...,0,0,0,0,0,0,0,0,0,0
1,"Seriously, this is the best movie I've ever wa...",positive,seriously best movie ive ever watched everythi...,57,8,2,7.125,4.0,8,1.0,...,1,0,0,0,0,0,0,0,0,0
2,"The storyline was okay, but the acting was jus...",positive,storyline okay acting mark,26,4,1,6.5,4.0,4,1.0,...,0,0,0,0,0,0,0,0,0,0
3,A complete disaster of a movie. Don't waste yo...,negative,complete disaster movie dont waste time,39,6,2,6.5,3.0,6,1.0,...,0,0,0,0,0,0,0,0,0,0
4,I can't believe how amazing this was. Totally ...,positive,cant believe amazing totally worth,34,5,2,6.8,2.5,5,1.0,...,0,0,0,0,0,1,0,0,0,0


In [126]:
X_test = test_data.drop(columns=['review','sentiment','cleaned_review'],axis=1)
X_test = csr_matrix(X_test.values)

In [128]:
sentiment_analyzer.test_on_unseen_data(model=logistic_model,
                                      unseen_texts=test_data['review'],
                                      unseen_labels=test_data['sentiment'],
                                      statistical_features = X_test)

Processing unseen data...


IndexError: index (1007) out of range

In [194]:
#Load Dataset
train_data = pd.read_csv("~/Project/NLP_Tasks/data/IMDB_Dataset.csv") 
test_data = pd.read_csv('~/Project/NLP_Tasks/data/test_data.csv')
test_data = test_data.rename(columns={'Text':'review','Sentiment':'sentiment'})

# TextPreprocessor
text_preprocessor = TextPreprocessor()

train_data['cleaned_review'] = train_data['review'].apply(text_preprocessor.clean_text)
test_data['cleaned_review'] = test_data['review'].apply(text_preprocessor.clean_text)

#Statistical_Feature_Engineering
feature_engineer = Statistical_Feature_Engineering()

#document_statistics
train_data  = feature_engineer.document_statistics(train_data)
test_data  = feature_engineer.document_statistics(test_data)

#frequency_distribution
train_bow = feature_engineer.frequency_distribution(train_data, 'cleaned_review',fit_transform=True)
test_bow = feature_engineer.frequency_distribution(test_data, 'cleaned_review',fit_transform=False)

# Model Training

In [225]:
stat_features = train_data[['char_count', 'word_count', 'sent_count', 'AWL', 'ASL', 'unique_word_count', 'UWR']].values
stat_features_sparse = csr_matrix(stat_features)
train_bow_sparse = csr_matrix(train_bow)
X_combined = hstack([stat_features_sparse, train_bow_sparse])

# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_data['sentiment'])

#Feature Selection
selector = TextFeatureSelector(X_combined, y, feature_names=list(train_bow.columns) + list(train_data.columns[-7:]), n_features=900)
selected_features, chi2_scores = selector.chi_square_selection()
X_combined_csr = csr_matrix(X_combined)
X_selected = X_combined_csr[:, selected_features]

# Model Training
sentiment_analyzer = SentimentAnalyzer(X_selected, y, feature_engineer, selected_feature_indices=selected_features)
model = sentiment_analyzer.train_model(model_type="logistic_regression")

# Model Evaluation
metrics = sentiment_analyzer.evaluate_model(model)

Performing chi-square feature selection...
Selected 900 features using chi-square
Training logistic_regression...
Evaluating model...
Accuracy: 0.8597
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.86      4961
           1       0.85      0.87      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion Matrix:
[[4189  772]
 [ 631 4408]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Test on unseen data

In [239]:
test_stat_features = test_data[['char_count', 'word_count', 'sent_count', 'AWL', 'ASL', 'unique_word_count', 'UWR']].values
test_stat_features_sparse = csr_matrix(test_stat_features)
test_bow_sparse = csr_matrix(test_bow)
test_combined = hstack([test_stat_features_sparse, test_bow_sparse])

# Label Encoding
label_encoder = LabelEncoder()
test_data['sentiment'] = label_encoder.fit_transform(test_data['sentiment'])

# test_on_unseen_data
predictions, accuracy = sentiment_analyzer.test_on_unseen_data(
    model, 
    unseen_texts=test_data['review'].tolist(),  # List of the raw review texts
    unseen_labels=test_data['sentiment'].values,  # Actual sentiment labels, if available
    statistical_features = test_stat_features_sparse,
    bow_feature = test_bow_sparse
)

print(f"Accuracy on Test Data: {accuracy:.4f}")
print("\nPredictions:")
print(predictions)

Processing unseen data...
Predictions on Unseen Data:
Text: This movie is speedy enough with plot twists, but hard to understand the connection between plots.
Predicted Sentiment: 0

Text: Seriously, this is the best movie I've ever watched! Everything was flawless!
Predicted Sentiment: 1

Text: The storyline was okay, but the acting was just not up to the mark.
Predicted Sentiment: 0

Text: A complete disaster of a movie. Don't waste your time.
Predicted Sentiment: 0

Text: I can't believe how amazing this was. Totally worth it!
Predicted Sentiment: 1

Text: The movie had its moments, but overall, it felt like something was missing.
Predicted Sentiment: 0

Text: I absolutely loved the cinematography, but the acting was subpar.
Predicted Sentiment: 1

Text: The film is an excellent example of how not to make a movie.
Predicted Sentiment: 1

Text: It's hard to imagine how anyone could dislike this masterpiece!
Predicted Sentiment: 1

Text: The trailer was better than the actual movie. F