# Movie Review Sentiment Analysis 

In [145]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

# Pre-processing

In [146]:
# Load the IMDB review dataset from the project-local data directory.
imdb_data = pd.read_csv("./data/imdb_dataset.csv")
# Preview the first rows to check that the columns loaded as expected.
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [147]:
# Count missing values per column.
imdb_data.isna().sum()

review       0
sentiment    0
dtype: int64

The dataset does not contain any null values.

In [148]:
# Count how many reviews carry each sentiment label to check class balance.
imdb_data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

The dataset is perfectly balanced, with 25,000 reviews each for the positive and negative sentiments.

In [149]:
def preprocess_reviews(reviews: pd.Series) -> pd.Series:
    """Clean raw review text so it is ready for tokenization.

    Steps, in order:
      1. Replace null entries with the empty string.
      2. Cast every value to ``str``.
      3. Replace HTML tags (e.g. ``<br />``) with a single space.
      4. Collapse whitespace runs to one space and strip both ends.

    Args:
        reviews: Series of raw reviews; may contain nulls or non-string values.

    Returns:
        A new Series of cleaned strings with the same index as ``reviews``.
        The input Series is not modified.
    """
    # Fill nulls BEFORE casting: astype(str) on a null would produce the
    # literal text "nan"/"None". Chaining also ensures the cast is actually
    # kept (previously its result was overwritten by the fillna line).
    s = reviews.fillna("").astype(str)
    # Replace tags with a space (not "") so words separated only by a tag
    # such as "end.<br />Start" do not get glued together.
    s = s.str.replace(r"<[^>]+>", " ", regex=True)
    # Collapse whitespace runs (including those left by removed tags) and trim.
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()

    return s

In [150]:
# Keep the NLTK corpus inside the project's data directory.
NLTK_DATA_DIR = "./data"
nltk.download('stopwords', download_dir=NLTK_DATA_DIR)
# nltk.download() only writes the files; corpus readers look them up through
# nltk.data.path. A custom download_dir is not on that search path by default,
# so register it here, otherwise stopwords.words() can raise LookupError on a
# fresh environment. The membership check keeps re-runs of this cell idempotent.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Bag-of-words vectorizer shared by the cells below.
vectorizer = CountVectorizer(
                # Drop common English function words using NLTK's stop word list.
                stop_words=stopwords.words('english'),
                # Match any run of word characters between word boundaries, so
                # single-character tokens are matched as well.
                token_pattern=r"(?u)\b\w+\b"
                )

[nltk_data] Downloading package stopwords to ./data...
[nltk_data]   Package stopwords is already up-to-date!


In [151]:
def vectorize_reviews(reviews: pd.Series) -> tuple[csr_matrix, np.ndarray]:
    """Fit the notebook-level ``vectorizer`` on ``reviews`` and vectorize them.

    Each call refits the shared ``vectorizer``, so the vocabulary it holds
    afterwards reflects only the reviews passed to the most recent call.

    Args:
        reviews: Series of review texts to vectorize.

    Returns:
        A ``(document_term_matrix, vocab)`` pair: the matrix returned by
        ``vectorizer.fit_transform`` and the feature names returned by
        ``vectorizer.get_feature_names_out``.
    """
    counts = vectorizer.fit_transform(reviews)
    # The feature names are read only after fitting, once the vocabulary exists.
    return counts, vectorizer.get_feature_names_out()

In [152]:
# Clean the raw review text first, then build the term counts and the
# vocabulary from the cleaned text.
processed_reviews = preprocess_reviews(imdb_data["review"])
document_term_matrix, vocabulary = vectorize_reviews(processed_reviews)

In [153]:
# Compare shapes: the matrix's column count should match the vocabulary length.
document_term_matrix.shape, vocabulary.shape

((50000, 101784), (101784,))