Natural language processing with BOW: IMDB Dataset of 50K Movie Reviews

In [1]:
# import libraries

import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Download the required NLTK data
nltk.download('stopwords') # For stopword removal
nltk.download('punkt')     # For word_tokenize
nltk.download('wordnet')   # For lemmatization 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# load the dataset into a dataframe
df= pd.read_csv("../data/IMDB Dataset.csv")
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [7]:
# Replace the sentiment labels with binary values
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
# Rename the sentiment column
df.rename(columns={'sentiment': 'positive_sentiment'}, inplace=True)
# Verify the change
df.head(10)

Unnamed: 0,review,positive_sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


In [8]:
# Check the distribution of positive and negative sentiment to confirm if the dataset is balanced.
df['positive_sentiment'].value_counts()

1    25000
0    25000
Name: positive_sentiment, dtype: int64

In [9]:
# Separate the feature (reviews) and the target variable (positive sentiment) for model training.
X = df["review"]
y = df["positive_sentiment"]

In [10]:

def text_preprocessing(text, apply_stemming=True, apply_lemmatization=True):
    """
    Preprocess a single review text by removing stopwords, punctuation and HTML tags, converting to lowercase,
    and applying stemming and/or lemmatization.
    
    Parameters:
    - text: A single text (string) to preprocess.
    - apply_stemming: Boolean. If True, apply stemming. Default is True.
    - apply_lemmatization: Boolean. If True, apply lemmatization. Default is True.
    
    Returns:
    - preprocessed_text: Preprocessed text as a string.
    """
    
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation (except for apostrophes in words like "don't")
    text = re.sub(r'[^\w\s\']', '', text)
    
    # Remove HTML tags like <br />
    text = re.sub(r'<br\s*/?>', ' ', text)
    # Remove any remaining occurrences of "br"
    text = re.sub(r'\bbr\b', '', text)

    # Tokenize the text into words
    words = word_tokenize(text)

    # Set of English stopwords
    stop_words = set(stopwords.words('english'))

    # List of negation words to exclude from stopwords
    negation_words = {'no', 'not',"don't","can't", "isn't", "mustn't", 'hasn', 'shan', 'mustn', 'neither', 'ain', 'haven', 'none', "hadn't", 'hadn', "haven't", 'wouldn', "shouldn't", 'didn', "couldn't", "didn't", "wasn't", "shan't", "aren't", 'isn', 'needn', 'weren', 'mightn', "weren't", 'shouldn', "won't", 'never', "hasn't", "needn't", 'nor', 'cannot', 'couldn', 'doesn', 'wasn', "mightn't", "wouldn't", "doesn't", 'aren', 'won'}
    # Remove the negation words from the stopwords list
    stop_words = stop_words.difference(negation_words)

    # Remove stopwords
    filtered_words = [word for word in words if word not in stop_words]

    # Initialize Stemmer and Lemmatizer
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    processed_words = filtered_words

    # Apply stemming and/or lemmatization as needed
    if apply_stemming:
        processed_words = [stemmer.stem(word) for word in processed_words]
    if apply_lemmatization:
        processed_words = [lemmatizer.lemmatize(word) for word in processed_words]
    
    # Join the processed words back into a string
    preprocessed_text = ' '.join(processed_words)
    
    return preprocessed_text


In [11]:
# Testing the function on a sample review
sample_review = X.iloc[1]   
# Preprocess the review using the text_preprocessing function
preprocessed_review = text_preprocessing(sample_review)

# Print the original and preprocessed review
print(f"Original Review:{sample_review}")
print(f"Preprocessed Review:{preprocessed_review}")

Original Review:A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell's murals decorating every surface) are terri

In [12]:
X_preprocessed = [text_preprocessing(text) for text in X]

In [13]:
#Split the data into train (80%) and test (20%)  sets. 

X_train, X_test, y_train, y_test = train_test_split(X_preprocessed,y, test_size= 0.2, random_state= 42)

In [14]:
# Initiate a CountVectorizer
vectorizer = CountVectorizer(stop_words=None, ngram_range=(1, 2)) # Use bigrams (1-grams and 2-grams)

# Fit and transform the training data
X_train_vector = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_vector = vectorizer.transform(X_test)

# To check the shape of the resulting BoW matrices
print(f"Shape of X_train_vector: {X_train_vector.shape}")
print(f"Shape of X_test_vector: {X_test_vector.shape}")

Shape of X_train_vector: (40000, 2458266)
Shape of X_test_vector: (10000, 2458266)


In [15]:
# Initialize and train the Logistic Regression model

model = LogisticRegression()
model.fit(X_train_vector, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
# Predict on the test set
y_pred = model.predict(X_test_vector)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')
print("\nClassification Report:\n", report)

Accuracy: 90.28%

Classification Report:
              precision    recall  f1-score   support

          0       0.91      0.89      0.90      4961
          1       0.90      0.91      0.90      5039

avg / total       0.90      0.90      0.90     10000



In [20]:
# Open a file where you want to store the model in write-binary mode and write the model to the file
with open('../ref/model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [19]:
# write the vectorizer to a file
with open('../ref/vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)