In [27]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
import zipfile

# The archive is obtained from Kaggle with:
# !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# Unpack all of its members into the current working directory.
with zipfile.ZipFile("imdb-dataset-of-50k-movie-reviews.zip") as archive:
    archive.extractall()

# Read the extracted CSV into a DataFrame and preview the first rows.
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [29]:
# Clean the raw text of the 'review' column in a single pass per review.

# Patterns are compiled once instead of being re-parsed for every row.
_HTML_TAG = re.compile(r'<[^>]*>')
_NON_WORD = re.compile(r'[^\w\s]')   # note: \w also matches "_", so underscores are kept
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_review(text):
    """Return `text` with HTML tags and special characters removed.

    Parameters
    ----------
    text : str
        One raw review.

    Returns
    -------
    str
        The review with every HTML tag replaced by a space, every character
        that is neither a word character nor whitespace removed, and runs of
        whitespace collapsed to a single space (leading/trailing whitespace
        stripped).
    """
    # Substitute a space, not an empty string: when a tag is the only thing
    # separating two words ("great.<br /><br />However"), deleting it would
    # glue them into one bogus token ("greatHowever") after punctuation removal.
    text = _HTML_TAG.sub(' ', text)
    text = _NON_WORD.sub('', text)
    # Tidy up the extra spaces introduced by the tag replacement above.
    return _WHITESPACE_RUN.sub(' ', text).strip()


df['review'] = df['review'].apply(clean_review)

# df.to_csv("IMDB Dataset(cleaned).csv")
# df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [30]:
# Preprocessing: shuffle the rows, tokenize each review, and lower-case the tokens.

# frac=1 samples every row, i.e. a full shuffle; the fixed seed keeps the order reproducible.
df_jumbled = df.sample(frac=1, random_state=42)

# Split each review into word tokens with NLTK.
# The tokenizer needs NLTK's 'punkt' data; run nltk.download('punkt') once if it is missing.
# Note: no explicit negation handling is performed in this cell.
df_tokenized_reviews = df_jumbled['review'].apply(nltk.word_tokenize)

# Lower-case every token so that later matching is case-insensitive.
df_processed_reviews = df_tokenized_reviews.apply(
    lambda tokens: list(map(str.lower, tokens))
)

# Pair the processed token lists with the original 'sentiment' labels
# (both share the shuffled index of df_jumbled).
df_processed_data = pd.DataFrame(
    dict(
        review=df_processed_reviews,
        sentiment=df_jumbled['sentiment'],
    )
)

df_processed_data.head()


Unnamed: 0,review,sentiment
33553,"[i, really, liked, this, summerslam, due, to, ...",positive
9427,"[not, many, television, shows, appeal, to, qui...",positive
199,"[the, film, quickly, gets, to, a, major, chase...",negative
12447,"[jane, austen, would, definitely, approve, of,...",positive
39489,"[expectations, were, somewhat, high, for, me, ...",negative


In [31]:
# Feature extraction: reviews become TF-IDF features (X), sentiments become integer labels (y).

# Each review is stored as a list of tokens; the vectorizer expects one string per
# document, so join every token list back into a single space-separated string.
X = df_processed_data['review']
X_reviews_as_strings = [' '.join(tokens) for tokens in X]

# Encode the text labels as integers the model can consume.
# The label-to-integer mapping is available afterwards in encoder.classes_.
y = df_processed_data['sentiment']
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Split the *text* before vectorizing. Fitting TF-IDF on the full corpus would let
# the vocabulary and IDF weights be learned from the test reviews (data leakage),
# which makes later test scores optimistic.
# stratify=y_encoded keeps the positive/negative proportions equal in both parts.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_reviews_as_strings,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Learn vocabulary and IDF weights from the training reviews only, then apply the
# fitted transformation unchanged to the held-out reviews.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Kept so existing references to X_vectorized keep working: the whole corpus encoded
# with the training-fitted vectorizer (no refit, so nothing leaks from the test part).
X_vectorized = vectorizer.transform(X_reviews_as_strings)


In [None]:
# Model Selection

# Data Analysis --> Data Preprocessing --> train_test_split --> Data Modeling w/ X_train, y_train