In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
import itertools
import collections

In [25]:
# Ensure the NLTK stopword corpus is available locally (a no-op when the
# package is already up to date).
import nltk.corpus

nltk.download("stopwords")

# Build the English stopword set through the fully qualified corpus reader.
# This assignment rebinds the name `stopwords` from the imported corpus reader
# to a plain set, so calling `stopwords.words(...)` here would raise
# AttributeError the second time this cell is executed. Going through
# `nltk.corpus.stopwords` keeps the cell safe to re-run.
stopwords = set(nltk.corpus.stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Read the training split of the IMDB review dataset and preview its first rows.
TRAIN_CSV_PATH = "./IMDB_movie_review_dataset/Train.csv"
imdb_reviews = pd.read_csv(TRAIN_CSV_PATH)
imdb_reviews.head(n=5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [28]:
# Text pre-processing pipeline
# 1. Convert text to lowercase
# 2. Remove any links or URLs from the text
# 3. Replace punctuation (any run of non-alphanumeric characters) with a space
# 4. Remove stopwords
# 5. Correct misspellings [optional, time consuming, disabled]


def text_preprocessing_pipeline(corpus, stop_words=None):
    """Return a copy of ``corpus`` whose ``"text"`` column has been cleaned.

    The input frame is left untouched, so the raw text stays available and the
    call can be repeated safely.

    Args:
        corpus: DataFrame with a ``"text"`` column of review strings. Missing
            values in that column are treated as empty strings.
        stop_words: Optional iterable of lowercase words to drop. When omitted,
            the notebook-level ``stopwords`` set is used.

    Returns:
        A new DataFrame with the same columns as ``corpus``; only ``"text"``
        differs (lowercased, URLs removed, non-alphanumeric runs collapsed to
        a single space, stopwords removed).
    """
    if stop_words is None:
        # Fall back to the English stopword set defined at notebook level.
        stop_words = stopwords
    # A frozenset gives O(1) membership tests whatever iterable was passed in.
    stop_words = frozenset(stop_words)

    def _drop_stopwords(text):
        # Splitting and re-joining also normalises whitespace left by the
        # replacements above. The text is already lowercase at this point.
        return " ".join(word for word in text.split() if word not in stop_words)

    cleaned_text = (
        corpus["text"]
        .fillna("")  # a missing review would otherwise fail on .split()
        .str.lower()
        # Remove URLs / hyperlinks
        .str.replace(r"http\S+", "", regex=True)
        # Replace every run of non-alphanumeric characters with one space
        .str.replace("[^A-Za-z0-9]+", " ", regex=True)
        .apply(_drop_stopwords)
    )

    # Optional spelling correction (very slow on a full corpus):
    # cleaned_text = cleaned_text.apply(lambda x: str(TextBlob(x).correct()))

    # assign() builds a new frame instead of writing into the caller's one.
    return corpus.assign(text=cleaned_text)


# Run the cleaning pipeline on the loaded reviews and preview the first rows.
reviews = text_preprocessing_pipeline(corpus=imdb_reviews)
reviews.head(n=5)

Unnamed: 0,text,label
0,grew b 1965 watching loving thunderbirds mates...,0
1,put movie dvd player sat coke chips expectatio...,0
2,people know particular time past like feel nee...,0
3,even though great interest biblical movies bor...,0
4,im die hard dads army fan nothing ever change ...,1


In [29]:
# Download the WordNet lexical database, which WordNetLemmatizer needs in order
# to look up the base (dictionary) form of a word.
nltk.download("wordnet")
# Download the Punkt models used by NLTK's sentence/word tokenizers.
# NOTE(review): nothing in this cell uses Punkt (the WhitespaceTokenizer below
# does not need it) - presumably another cell relies on it; confirm before removing.
nltk.download("punkt")

# Tokenizer that splits text on whitespace (spaces, tabs, newlines) only.
# It does not treat punctuation or sentence boundaries specially.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

# Lemmatizer that uses WordNet to reduce a word to a valid dictionary form.
# NOTE: lemmatize() treats every word as a noun unless a `pos` tag is passed,
# so plural nouns are reduced ("chips" -> "chip") while verb and adjective
# forms are left as they are ("watching" stays "watching"). Getting
# "running" -> "run" or "better" -> "good" requires pos="v" / pos="a".
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text, pos="n"):
    """Split ``text`` on whitespace and lemmatize every token with WordNet.

    Args:
        text: String to tokenize; tokens are separated by whitespace only.
        pos: WordNet part-of-speech tag applied to every token ("n" noun,
            "v" verb, "a" adjective, "r" adverb). The default "n" is what the
            lemmatizer uses when no tag is given, so existing calls behave as
            before: plural nouns are reduced, verb forms such as "watching"
            are returned unchanged. Pass "v" to reduce verb forms instead.

    Returns:
        A list with one lemma string per whitespace-separated token.
    """
    return [
        lemmatizer.lemmatize(token, pos=pos)
        for token in w_tokenizer.tokenize(text)
    ]


# Store each review's list of lemmatized tokens in a new column and preview it.
lemmatized_column = reviews["text"].apply(lemmatize_text)
reviews["lemmatized_tokens"] = lemmatized_column
reviews.head(n=5)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Unnamed: 0,text,label,lemmatized_tokens
0,grew b 1965 watching loving thunderbirds mates...,0,"[grew, b, 1965, watching, loving, thunderbird,..."
1,put movie dvd player sat coke chips expectatio...,0,"[put, movie, dvd, player, sat, coke, chip, exp..."
2,people know particular time past like feel nee...,0,"[people, know, particular, time, past, like, f..."
3,even though great interest biblical movies bor...,0,"[even, though, great, interest, biblical, movi..."
4,im die hard dads army fan nothing ever change ...,1,"[im, die, hard, dad, army, fan, nothing, ever,..."


In [None]:
# Flatten the per-review token lists into one list of tokens and count how
# often each token occurs across all reviews.
lemmatized_tokens = list(reviews["lemmatized_tokens"])
# chain.from_iterable walks the outer list directly; chain(*...) would first
# unpack every review as a separate positional argument.
token_list = list(itertools.chain.from_iterable(lemmatized_tokens))
count_no = collections.Counter(token_list)