## Movie Reviews Sentiment Classification

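Classify movie reviews into positive, negative, and neutral classes. We use the IMDB 50K Movie Reviews dataset from Kaggle, whose `sentiment` column contains only `positive` and `negative` labels (25,000 reviews each).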

In [None]:
# !pip install contractions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import re
import contractions
import unicodedata
import html

from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Download the dataset from Kaggle

In [None]:
# !mkdir -p ~/.kaggle
# !mv kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

In [None]:
# !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 66% 17.0M/25.7M [00:00<00:00, 71.7MB/s]
100% 25.7M/25.7M [00:00<00:00, 88.5MB/s]


In [None]:
# !unzip imdb-dataset-of-50k-movie-reviews.zip -d /content/drive/MyDrive/Sentiment_analysis

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: /content/drive/MyDrive/Sentiment_analysis/IMDB Dataset.csv  


In [None]:
# Load the reviews CSV into a DataFrame.
# NOTE(review): the path looks like a mounted Google Drive folder — confirm the drive is mounted before running.
df = pd.read_csv('/content/drive/MyDrive/Sentiment_analysis/IMDB Dataset.csv')

In [None]:
# Check how many reviews fall under each sentiment label (class balance)
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
def preprocess(text):
    """Clean one raw review into lowercase, single-space-separated alphabetic words.

    Steps, in order: decode HTML entities and lowercase; turn ``<br>`` tags
    into spaces; drop URLs; split on ``/``; expand contractions; strip accents;
    replace every remaining non-letter with a space; drop the lone "s" left
    behind by possessives; collapse runs of whitespace.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string (empty when nothing alphabetic remains).
    """
    # Decode entities first ("&amp;" -> "&") so entity names do not survive as words
    text = html.unescape(text).lower()
    # Accept every spelling of the line-break tag: <br>, <br/>, <br />
    text = re.sub(r'<br\s*/?>', ' ', text)
    # Require a word boundary and a real URL prefix so stretched words such as
    # "awwww" are not mistaken for a "www" link
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text)  # remove links
    text = re.sub(r'/', ' ', text)  # convert / to space
    text = contractions.fix(text)  # expand contractions ("don't" -> "do not")
    # Decompose accented letters and drop the combining marks ("café" -> "cafe");
    # otherwise the next step would cut the word at the accented letter
    text = ''.join(
        ch for ch in unicodedata.normalize('NFD', text)
        if not unicodedata.combining(ch)
    )
    text = re.sub(r'[^a-z\s]', ' ', text)  # replace digits, punctuation and symbols with spaces
    text = re.sub(r'\bs\b', ' ', text)  # drop the stray "s" left by possessives ("movie's" -> "movie s")

    # split()/join collapses any run of whitespace into a single space
    return ' '.join(text.split())

In [None]:
# Clean every raw review; the result goes into a new column so 'review' stays untouched
df['preprocess_review'] = df['review'].apply(preprocess)

In [None]:
# One shared WordNet lemmatizer instance, reused for every token
lemmatizer = WordNetLemmatizer()

In [None]:
# Cap every run of a repeated character at two occurrences
def normalize_stretched_word(word):
    """Shorten stretched spellings, e.g. "sooooo" -> "soo".

    Any run of the same character (two or more in a row) is rewritten as
    exactly two copies of that character; everything else is left as is.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', word)

def lemmatize_and_normalize_text(text):
    """Tokenize, de-stretch and lemmatize ``text``; return the tokens joined by spaces.

    The text is lowercased and split with ``word_tokenize``. Each token then has
    its repeated-character runs capped by ``normalize_stretched_word`` and is
    passed through the shared ``lemmatizer``.
    """
    cleaned = []
    for raw_token in word_tokenize(text.lower()):
        shortened = normalize_stretched_word(raw_token)
        cleaned.append(lemmatizer.lemmatize(shortened))
    return ' '.join(cleaned)

In [None]:
# Lemmatize the cleaned reviews.
# NOTE: this overwrites 'preprocess_review' in place, so re-running only this cell
# feeds already-lemmatized text through the lemmatizer again.
df['preprocess_review'] = df['preprocess_review'].apply(lemmatize_and_normalize_text)

In [None]:
# Fit a Keras Tokenizer (default settings) on the cleaned reviews to build the vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['preprocess_review'])

In [None]:
word_index = tokenizer.word_index  # word -> integer index mapping built by fit_on_texts
vocab_size = len(word_index) + 1  # +1 presumably accounts for index 0, which the Keras Tokenizer reserves — confirm

In [None]:
# Display the vocabulary size computed above
vocab_size

89202

In [None]:
# Use glove embeddings from the file

def load_glove_embeddings(file_path, embedding_dim=None):
    """Read a GloVe text file into a ``{token: vector}`` dictionary.

    Each line has the form ``<token> <v1> ... <vN>`` with single spaces between
    fields. A token may itself contain spaces (e.g. ". . ."), so the vector is
    taken from the *last* ``embedding_dim`` fields and everything before them
    is the token. Splitting on every space and treating only the first field as
    the token would discard such lines.

    Args:
        file_path: Path to the UTF-8 encoded GloVe ``.txt`` file.
        embedding_dim: Number of vector components per line. When ``None``
            (default) it is inferred from the first non-blank line as its
            field count minus one, which assumes that line's token has no
            embedded spaces.

    Returns:
        dict mapping each token to a 1-D ``float32`` numpy array.

    Raises:
        ValueError: If ``embedding_dim`` is given but is not positive.
    """
    if embedding_dim is not None and embedding_dim < 1:
        raise ValueError("embedding_dim must be a positive integer")

    embeddings_index = {}
    skipped = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip()  # drop the newline and any trailing blanks
            if not line:
                continue  # tolerate blank lines instead of failing on them

            if embedding_dim is None:
                inferred = len(line.split(' ')) - 1
                if inferred < 1:
                    # a lone field carries no vector; try the next line
                    skipped += 1
                    continue
                embedding_dim = inferred

            # Split from the right so spaces inside the token are preserved
            parts = line.rsplit(' ', embedding_dim)
            if len(parts) != embedding_dim + 1:
                skipped += 1
                continue

            try:
                embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')
            except ValueError:
                # still best-effort: a non-numeric component skips just this line
                skipped += 1

    # One summary line instead of one message per bad line, to keep the cell output short
    if skipped:
        print(f"Skipped {skipped} malformed line(s) while reading {file_path}")
    return embeddings_index

# Placeholder location of the GloVe vectors file; must be edited before this cell can run
glove_file_path = '/path/to/glove.840B.300d.txt'  # Replace with your actual path to your GloVe file
embeddings_index = load_glove_embeddings(glove_file_path)  # word -> vector lookup built from the file

Skipping line due to error: could not convert string to float: '.'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: '.'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: '.'
Skipping line due to error: could not convert string to float: '.'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Skipping line due to error: could not convert string to float: 'Killerseats.com'
Skipping line due to error: could not convert string to float: 'name@domain.com'
Ski

In [None]:
# Build the embedding matrix for our vocabulary: row i holds the GloVe vector
# of the word whose tokenizer index is i

embedding_dim = 300
glove_embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row keeps the initial zeros
        continue
    glove_embedding_matrix[i] = embedding_vector

In [None]:
# Convert each cleaned review into its sequence of integer word indices
df['sequences'] = tokenizer.texts_to_sequences(df['preprocess_review'])

In [None]:
# Flag the all-zero rows of the embedding matrix, i.e. rows that never received a GloVe vector,
# then count them.
# NOTE(review): the count presumably also includes row 0, which no word index uses — confirm
empty_rows = (glove_embedding_matrix == 0).all(axis=1)
empty_rows.sum()

21525

In [None]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# Map only while the column still holds the raw strings: Series.map() turns every
# value missing from the dict into NaN, so re-running this cell on already-encoded
# labels would wipe the whole column.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
# Longest and median review length, measured in tokens
sequence_lengths = df['sequences'].apply(len)
print(sequence_lengths.max())
print(sequence_lengths.median())

2474
175.0


In [None]:
import pickle

# Bundle the processed DataFrame, the fitted tokenizer and the GloVe matrix,
# then write them to a single pickle file.
artifacts = {
    'data': df,
    'tokenizer': tokenizer,
    'embedding_matrix': glove_embedding_matrix
}

with open('/content/drive/MyDrive/Sentiment_analysis/data.pkl', 'wb') as f:
    pickle.dump(artifacts, f)