In [1]:
import pandas as pd
df = pd.read_csv('movies_dataset.csv')
df.head()

Unnamed: 0,Movie name,Description,Genre
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,Drama


---
## 1. Make The Columns into Lower Case

In [2]:
df['Description'] = df['Description'].str.lower()

In [3]:
df.head()

Unnamed: 0,Movie name,Description,Genre
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime"
3,Schindler's List,the true story of how businessman oskar schind...,"Drama, History, War"
4,12 Angry Men,the defense and the prosecution have rested an...,Drama


---
## 2. Remove HTML Tags and URLs

In [4]:
import re

# Precompile regex patterns
TAG_RE = re.compile(r'</?[A-Za-z][A-Za-z0-9-]*\b[^>]*>')
URL_RE = re.compile(r'https?://\S+|www\.\S+')

def clean_text(text):
    if not isinstance(text, str):
        return text   # skip if not string
    
    # Remove HTML tags
    no_html = TAG_RE.sub('', text)
    
    # Remove URLs
    no_url = URL_RE.sub('', no_html)
    
    return no_url.strip()


In [5]:
df['Description'] = df['Description'].apply(clean_text)

In [6]:
df.head()

Unnamed: 0,Movie name,Description,Genre
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime"
3,Schindler's List,the true story of how businessman oskar schind...,"Drama, History, War"
4,12 Angry Men,the defense and the prosecution have rested an...,Drama


In [7]:
# Remove rows where Description is NaN

df = df.dropna(subset=['Description'])

In [8]:
# Remove all rows that contain NaN in any column
df = df.dropna()

In [9]:
df.isnull().sum()

Movie name     0
Description    0
Genre          0
dtype: int64

---
## 3. Remove Punctuations

In [10]:
import string
import time

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
exclude = string.punctuation

In [12]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [13]:
df.tail(5)

Unnamed: 0,Movie name,Description,Genre
9375,The Watch,four everyday suburban guys come together as a...,"Comedy, Action, Science Fiction"
9376,Retreat,kate and martin escape from personal tragedy t...,"Horror, Drama, Thriller"
9377,The Sweetest Thing,christina's love life is stuck in neutral. aft...,"Romance, Comedy"
9378,The Volcano,the eruption of the icelandic volcano eyjafjal...,Comedy
9379,Taxi,a mouthy and feisty taxicab driver has hot tip...,"Action, Comedy, Thriller, Crime"


---
## 4. Spelling Correction

In [14]:
## very slow good for small text

#from textblob import TextBlob
# df['Description'] = df['Description'].apply(lambda x: str(TextBlob(x).correct()))

In [15]:
## fast good for large text

from autocorrect import Speller
spell = Speller(lang='en')

df['Description'] = df['Description'].apply(lambda x: spell(x))

In [16]:
df.head()

Unnamed: 0,Movie name,Description,Genre
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"Drama, Crime"
3,Schindler's List,the true story of how businessman oscar chandl...,"Drama, History, War"
4,12 Angry Men,the defense and the prosecution have rested an...,Drama


---
## 5. Removing Stop Words

In [17]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/atharparvezce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Now you can use it
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [20]:
def remove_stopwords(text):
    new_text = []

    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [21]:
df['Description'] = df['Description'].apply(remove_stopwords)

In [22]:
df.head()

Unnamed: 0,Movie name,Description,Genre
0,The Shawshank Redemption,imprisoned 1940s double murder wife lo...,"Drama, Crime"
1,The Godfather,"spanning years 1945 1955, chronicle ficti...","Drama, Crime"
2,The Godfather Part II,"continuing saga corleone crime family, yo...","Drama, Crime"
3,Schindler's List,true story businessman oscar chandler saved...,"Drama, History, War"
4,12 Angry Men,defense prosecution rested jury filing ...,Drama


In [23]:
# Save the current dataframe into csv file for further preprocessing

df.to_csv("cleaned_dataset.csv", index=False)

In [24]:
df = pd.read_csv("cleaned_dataset.csv")
df.head()

Unnamed: 0,Movie name,Description,Genre
0,The Shawshank Redemption,imprisoned 1940s double murder wife lo...,"Drama, Crime"
1,The Godfather,"spanning years 1945 1955, chronicle ficti...","Drama, Crime"
2,The Godfather Part II,"continuing saga corleone crime family, yo...","Drama, Crime"
3,Schindler's List,true story businessman oscar chandler saved...,"Drama, History, War"
4,12 Angry Men,defense prosecution rested jury filing ...,Drama


---
## 6. Tokenization

In [25]:
import spacy
print(spacy.__version__)

3.7.5


In [27]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")


# Apply spaCy tokenization to each row
df["Tokens"] = df["Description"].apply(lambda x: [t.text for t in nlp(x)])

In [28]:
df.head()

Unnamed: 0,Movie name,Description,Genre,Tokens
0,The Shawshank Redemption,imprisoned 1940s double murder wife lo...,"Drama, Crime","[imprisoned, , 1940s, , double, murder, ..."
1,The Godfather,"spanning years 1945 1955, chronicle ficti...","Drama, Crime","[spanning, , years, 1945, , 1955, ,, , chro..."
2,The Godfather Part II,"continuing saga corleone crime family, yo...","Drama, Crime","[ , continuing, saga, , corleone, crime, fa..."
3,Schindler's List,true story businessman oscar chandler saved...,"Drama, History, War","[ , true, story, , businessman, oscar, chand..."
4,12 Angry Men,defense prosecution rested jury filing ...,Drama,"[ , defense, , prosecution, , rested, , j..."


## 7. Lemmatization

In [29]:
import nltk
from nltk.stem import WordNetLemmatizer

# Make sure resources are available
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/atharparvezce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/atharparvezce/nltk_data...


In [32]:
# Function to lemmatize token list
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token, pos="v") for token in tokens if token.isalpha()]  
    # keep only words (removes numbers, punctuation)

# Apply to your Tokens column
df["Lemmas"] = df["Tokens"].apply(lemmatize_tokens)

In [33]:
df.head()

Unnamed: 0,Movie name,Description,Genre,Tokens,Lemmas
0,The Shawshank Redemption,imprisoned 1940s double murder wife lo...,"Drama, Crime","[imprisoned, , 1940s, , double, murder, ...","[imprison, double, murder, wife, lover, stand,..."
1,The Godfather,"spanning years 1945 1955, chronicle ficti...","Drama, Crime","[spanning, , years, 1945, , 1955, ,, , chro...","[span, years, chronicle, fictional, italian, a..."
2,The Godfather Part II,"continuing saga corleone crime family, yo...","Drama, Crime","[ , continuing, saga, , corleone, crime, fa...","[continue, saga, corleone, crime, family, youn..."
3,Schindler's List,true story businessman oscar chandler saved...,"Drama, History, War","[ , true, story, , businessman, oscar, chand...","[true, story, businessman, oscar, chandler, sa..."
4,12 Angry Men,defense prosecution rested jury filing ...,Drama,"[ , defense, , prosecution, , rested, , j...","[defense, prosecution, rest, jury, file, jury,..."
