In [2]:
# Dependencies
import re
import nltk
import spacy
import emoji
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from pathlib import Path
from sklearn.svm import SVC
from textblob import TextBlob
from scipy.sparse import hstack
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
from scipy.sparse import spmatrix
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore all runtime warnings
warnings.filterwarnings('ignore')

In [3]:
# Downloading Packages from NLTK
nltk.download('punkt')
nltk.download('words')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/it012314/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/it012314/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/it012314/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/it012314/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load Dataset

In [4]:
# Load Dataset
imdb_data = pd.read_csv('../data/IMDB_Dataset.csv')

In [5]:
# Dataset Preview
imdb_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Text Preprocessing

In [6]:
class TextPreprocessor:
    """
    A class for preprocessing text data through cleaning, tokenization, and normalization
    
    Attributes:
    -----------
        lemmatizer : WordNetLemmatizer instance for word lemmatization
        
        stop_words : Set of stopwords to be removed from text
    """ 
    def __init__(self):
        """
        Initialize the TextPreprocessor with required NLTK resources
        
        Raises:
        -------
            LookupError : If required NLTK resources cannot be downloaded
        """
        try:
            # Download required NLTK data
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
            nltk.download('wordnet', quiet=True)
            nltk.download('punkt_tab', quiet=True)
            
            self.lemmatizer = WordNetLemmatizer()
            self.stop_words = set(stopwords.words('english'))
            
        except LookupError as e:
            raise
    
    def clean_text(self, text:str) -> str:
        """
        Clean and normalize input text by removing HTML tags, special characters,
        and applying text normalization techniques
        
        Arguments:
        ----------
            text { str }      : Input text to be cleaned
            
        Raises:
        -------
            ValueError        : If input text is None or empty
            
            TextCleaningError : If any error occurs at any step of text cleaning process
            
        Returns:
        --------
                { str }       : Cleaned and normalized text
        """
        if ((not text) or (not isinstance(text, str))):
            raise ValueError("Input text must be a non-empty string")
            
        try:
            # Remove HTML tags
            text   = re.sub('<[^>]*>', '', text)
            
            # Remove special characters and digits
            text   = re.sub('[^a-zA-Z\s]', '', text)
            
            # Convert to lowercase
            text   = text.lower()
            
            # Tokenization
            tokens = word_tokenize(text)
            
            # Remove stopwords and lemmatize
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]
            
            return ' '.join(tokens)
        
        except Exception as TextCleaningError:
            raise

In [7]:
# Initialize the preprocessor
preprocessor                  = TextPreprocessor()

# Add a new column to the original DataFrame to store the cleaned texts
imdb_data['clean_text'] = imdb_data['review'].apply(preprocessor.clean_text)

In [8]:
# Ratings After Cleaning
imdb_data.head(10)

Unnamed: 0,review,sentiment,clean_text
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...
5,"Probably my all-time favorite movie, a story o...",positive,probably alltime favorite movie story selfless...
6,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection dated seahunt...
7,"This show was an amazing, fresh & innovative i...",negative,show amazing fresh innovative idea first aired...
8,Encouraged by the positive comments about this...,negative,encouraged positive comment film looking forwa...
9,If you like original gut wrenching laughter yo...,positive,like original gut wrenching laughter like movi...


## Contextual Feature Engineering

In [9]:
class Contextual_Features:
    """
    A class for implementing various text feature engineering techniques
    
    Attributes:
    -----------
        texts        { list }  : List of preprocessed text documents
        
        max_features  { int }  : Maximum number of features to create
        
        ngram_range  { tuple } : Range of n-grams to consider
    """
    
    def __init__(self, texts: list, max_features: int = None, ngram_range: tuple = (1, 3)) -> None:
        """
        Initialize TextFeatureEngineering with texts and parameters
        
        Arguments:
        ----------
            texts        : List of preprocessed text documents
            
            max_features : Maximum number of features (None for no limit)
            
            ngram_range  : Range of n-grams to consider (min_n, max_n)
            
        Raises:
        -------
            ValueError   : If texts is empty or parameters are invalid
        """
        if not texts:
            raise ValueError("Input texts cannot be empty")
            
        self.texts        = texts
        self.max_features = max_features
        self.ngram_range  = ngram_range
        
    def window_based(self):
        """
        Create Window Based Feature Engineering with texts and parameters

        Arguments:
        ----------
        texts             : List of preprocessed text documents
        """
        try:
            print("Creating Window-Based Contextual Features:...")
            vectorizer = CountVectorizer(max_features = self.max_features,
                                         ngram_range  = self.ngram_range)
            ngrams_features     = vectorizer.fit_transform(self.texts)
            
            return vectorizer, ngrams_features
            
        except Exception as e:
            raise

    def position_based(self):
        """
        Create Position Based Feature Engineering with texts and parameters

        Arguments:
        ----------
        texts              : List of preprocessed text documents
        """
        try:
            print("Creating Position-Based Contextual Features:...")
            position_features = []
            
            for doc in self.texts:
                words = doc.split() 
            
                position_features.extend([{"word": word, "position": idx} for idx, word in enumerate(words)])

            return position_features

        except Exception as e:
            raise

    def generate_ngrams(self, n=3):
        """
        Generate N-Grams

        Arguments:
        ----------
        words         : List of words taken individually from the preprocessed text documents
        n             : Individual words from the list
        """
        print("Generating N-Grams:...")
        ngrams = []

        for doc in self.texts:
            words = doc.split() 
            ngrams.extend([tuple(words[i:i+n]) for i in range(len(words)-n+1)]) 

        return ngrams

    def cross_document(self):
        """
        Create Cross Document Feature Engineering with texts and parameters

        Arguments:
        ----------
        texts             : List of preprocessed text documents
        """
        try: 
            print("Creating Cross Document Contextual Feature Engineering:...")
            vectorizer   = TfidfVectorizer(max_features = self.max_features, 
                                           ngram_range  = self.ngram_range)
            tfidf_matrix = vectorizer.fit_transform(self.texts)
            return vectorizer, tfidf_matrix

        except Exception as e:
            raise

In [10]:
contextuals = Contextual_Features(texts        = imdb_data['clean_text'].tolist(),
                                  max_features = 100,
                                  ngram_range  = (2, 2))

In [11]:
window_vectorizer, window_features = contextuals.window_based()
positional_features                = contextuals.position_based()
trigrams                           = contextuals.generate_ngrams()
cross_doc_vectorizer, tfidf_matrix = contextuals.cross_document()

Creating Window-Based Contextual Features:...
Creating Position-Based Contextual Features:...
Generating N-Grams:...
Creating Cross Document Contextual Feature Engineering:...


In [12]:
print("Vocabulary:\n", window_vectorizer.vocabulary_)
print("N-Gram Features:\n", window_features.toarray())

Vocabulary:
 {'worth watching': np.int64(95), 'well done': np.int64(91), 'make film': np.int64(43), 'real life': np.int64(75), 'new york': np.int64(64), 'would like': np.int64(96), 'waste time': np.int64(87), 'ive seen': np.int64(35), 'one worst': np.int64(69), 'like movie': np.int64(36), 'low budget': np.int64(41), 'film would': np.int64(19), 'saw movie': np.int64(78), 'year old': np.int64(99), 'year later': np.int64(98), 'see movie': np.int64(80), 'good thing': np.int64(26), 'ive ever': np.int64(34), 'ever seen': np.int64(9), 'high school': np.int64(30), 'special effect': np.int64(83), 'movie made': np.int64(55), 'story line': np.int64(84), 'movie could': np.int64(49), 'first film': np.int64(20), 'film also': np.int64(13), 'film like': np.int64(15), 'film one': np.int64(17), 'look like': np.int64(39), 'bad guy': np.int64(1), 'good movie': np.int64(25), 'seems like': np.int64(81), 'pretty much': np.int64(72), 'worst movie': np.int64(94), 'much better': np.int64(61), 'main character': 

In [13]:
print("Positional Features:\n", pd.DataFrame(positional_features))

Positional Features:
               word  position
0              one         0
1         reviewer         1
2        mentioned         2
3         watching         3
4               oz         4
...            ...       ...
5928987       even        63
5928988      cable        64
5928989    channel        65
5928990      avoid        66
5928991      movie        67

[5928992 rows x 2 columns]


In [14]:
for i in range (100):
    print("Trigrams:\n", trigrams[i])


Trigrams:
 ('one', 'reviewer', 'mentioned')
Trigrams:
 ('reviewer', 'mentioned', 'watching')
Trigrams:
 ('mentioned', 'watching', 'oz')
Trigrams:
 ('watching', 'oz', 'episode')
Trigrams:
 ('oz', 'episode', 'youll')
Trigrams:
 ('episode', 'youll', 'hooked')
Trigrams:
 ('youll', 'hooked', 'right')
Trigrams:
 ('hooked', 'right', 'exactly')
Trigrams:
 ('right', 'exactly', 'happened')
Trigrams:
 ('exactly', 'happened', 'methe')
Trigrams:
 ('happened', 'methe', 'first')
Trigrams:
 ('methe', 'first', 'thing')
Trigrams:
 ('first', 'thing', 'struck')
Trigrams:
 ('thing', 'struck', 'oz')
Trigrams:
 ('struck', 'oz', 'brutality')
Trigrams:
 ('oz', 'brutality', 'unflinching')
Trigrams:
 ('brutality', 'unflinching', 'scene')
Trigrams:
 ('unflinching', 'scene', 'violence')
Trigrams:
 ('scene', 'violence', 'set')
Trigrams:
 ('violence', 'set', 'right')
Trigrams:
 ('set', 'right', 'word')
Trigrams:
 ('right', 'word', 'go')
Trigrams:
 ('word', 'go', 'trust')
Trigrams:
 ('go', 'trust', 'show')
Trigrams:


In [15]:
print("TF-IDF Vectorizer:\n", cross_doc_vectorizer.get_feature_names_out())
print("TF-IDF Matrix", tfidf_matrix.toarray())

TF-IDF Vectorizer:
 ['bad acting' 'bad guy' 'bad movie' 'come across' 'dont get' 'dont know'
 'dont think' 'even though' 'ever made' 'ever seen' 'every time'
 'fall love' 'feel like' 'film also' 'film ever' 'film like' 'film made'
 'film one' 'film really' 'film would' 'first film' 'first movie'
 'first time' 'good film' 'good job' 'good movie' 'good thing'
 'great film' 'great movie' 'havent seen' 'high school' 'horror film'
 'horror movie' 'im sure' 'ive ever' 'ive seen' 'like movie' 'like one'
 'long time' 'look like' 'love story' 'low budget' 'main character'
 'make film' 'make movie' 'make sense' 'many people' 'many time'
 'movie bad' 'movie could' 'movie even' 'movie ever' 'movie good'
 'movie ive' 'movie like' 'movie made' 'movie make' 'movie movie'
 'movie one' 'movie really' 'movie would' 'much better' 'must see'
 'never seen' 'new york' 'one best' 'one film' 'one movie' 'one thing'
 'one worst' 'part movie' 'pretty good' 'pretty much' 'production value'
 'read book' 'real lif

In [16]:
wb = window_features.toarray()
print(wb[3])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [17]:
tfidf = tfidf_matrix.toarray()
print(tfidf[3])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.70427598 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.70992629 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]


In [27]:
np.array(positional_features).shape

(5928992,)