# Assignment for Text preprocessing

#### 1) Create your own dataset with the columns movie name, description, and genre from the TMDb website
#### 2) Perform the necessary text preprocessing on the description column

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import time  # Used to pause briefly between requests so we don't overload the API.

import pandas as pd
import requests  # HTTP client used to call the TMDb REST API.

# SECURITY: the API key must not be stored inside the notebook. It is read from the
# TMDB_API_KEY environment variable instead. Any key that was previously published
# in this notebook should be treated as leaked and regenerated.
API_KEY = os.environ.get("TMDB_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "TMDB_API_KEY is not set. Export your TMDb API key as the TMDB_API_KEY "
        "environment variable before running this cell."
    )

# Seconds to wait for a response; requests waits indefinitely when no timeout is given.
REQUEST_TIMEOUT = 30


def _get_json(session, url, **params):
    """Send a GET request to a TMDb endpoint and return the decoded JSON body.

    Args:
        session: the `requests.Session` used to send the request.
        url: endpoint URL without a query string.
        **params: extra query parameters (for example `page=3`); the API key and
            the `en-US` language are always added.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    query = {"api_key": API_KEY, "language": "en-US", **params}
    response = session.get(url, params=query, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error status instead of silently collecting zero movies.
    response.raise_for_status()
    return response.json()


genre_url = "https://api.themoviedb.org/3/genre/movie/list"
movies_url = "https://api.themoviedb.org/3/movie/top_rated"

all_data = []
total_pages = 471  # Upper bound on the number of result pages to request.

with requests.Session() as session:
    # Step 1: Get genre mapping
    genre_response = _get_json(session, genre_url)

    # Map genre_id to genre_name
    genre_dict = {genre['id']: genre['name'] for genre in genre_response['genres']}

    # Step 2: Fetch all top-rated movies, one page at a time
    page = 1
    while page <= total_pages:
        response = _get_json(session, movies_url, page=page)

        # If the response reports how many pages exist, never request beyond that.
        reported_pages = response.get('total_pages')
        if isinstance(reported_pages, int):
            total_pages = min(total_pages, reported_pages)

        movies_list = response.get('results', [])
        if not movies_list:
            # An empty page means there is nothing further to collect.
            break

        for movie in movies_list:
            name = movie.get('title', "")
            description = movie.get('overview', "")
            # Skip ids missing from the genre list instead of raising KeyError.
            genres = [genre_dict[g] for g in (movie.get('genre_ids') or []) if g in genre_dict]

            all_data.append({
                "movie_name": name,
                "description": description,
                "genre": ", ".join(genres)
            })

        print(f"Page {page} fetched, total movies collected: {len(all_data)}")
        page += 1
        time.sleep(0.3)  # To avoid hitting API rate limits

# Create DataFrame and save CSV
# The columns are named explicitly so they exist even when no rows were collected.
df = pd.DataFrame(all_data, columns=["movie_name", "description", "genre"])
df.to_csv("/kaggle/working/top_rated_movies.csv", index=False)

print("Dataset saved as top_rated_movies.csv")
df.head(10)


Page 1 fetched, total movies collected: 20
Page 2 fetched, total movies collected: 40
Page 3 fetched, total movies collected: 60
Page 4 fetched, total movies collected: 80
Page 5 fetched, total movies collected: 100
Page 6 fetched, total movies collected: 120
Page 7 fetched, total movies collected: 140
Page 8 fetched, total movies collected: 160
Page 9 fetched, total movies collected: 180
Page 10 fetched, total movies collected: 200
Page 11 fetched, total movies collected: 220
Page 12 fetched, total movies collected: 240
Page 13 fetched, total movies collected: 260
Page 14 fetched, total movies collected: 280
Page 15 fetched, total movies collected: 300
Page 16 fetched, total movies collected: 320
Page 17 fetched, total movies collected: 340
Page 18 fetched, total movies collected: 360
Page 19 fetched, total movies collected: 380
Page 20 fetched, total movies collected: 400
Page 21 fetched, total movies collected: 420
Page 22 fetched, total movies collected: 440
Page 23 fetched, total 

Unnamed: 0,movie_name,description,genre
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,Drama
5,Spirited Away,"A young girl, Chihiro, becomes trapped in a st...","Animation, Family, Fantasy"
6,The Dark Knight,Batman raises the stakes in his war on crime. ...,"Drama, Action, Crime, Thriller"
7,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy, Drama, Romance"
8,The Green Mile,A supernatural tale set on death row in a Sout...,"Fantasy, Drama, Crime"
9,Parasite,"All unemployed, Ki-taek's family takes peculia...","Comedy, Thriller, Drama"


In [3]:
# Display the DataFrame built above; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,movie_name,description,genre
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,Drama
...,...,...,...
9415,White Girl,"Summer, New York City. A college girl falls ha...",Drama
9416,Playing for Keeps,A former sports star who's fallen on hard time...,"Comedy, Romance"
9417,Captive State,Nearly a decade after occupation by an extrate...,"Science Fiction, Action, Thriller"
9418,The Texas Chainsaw Massacre 2,A radio host is victimised by the notorious ca...,"Horror, Comedy"


In [4]:
# Inspect the description column before any preprocessing step is applied.
df['description']

0       Imprisoned in the 1940s for the double murder ...
1       Spanning the years 1945 to 1955, a chronicle o...
2       In the continuing saga of the Corleone crime f...
3       The true story of how businessman Oskar Schind...
4       The defense and the prosecution have rested an...
                              ...                        
9415    Summer, New York City. A college girl falls ha...
9416    A former sports star who's fallen on hard time...
9417    Nearly a decade after occupation by an extrate...
9418    A radio host is victimised by the notorious ca...
9419    A Chicago detective travels to Scotland after ...
Name: description, Length: 9420, dtype: object

In [5]:
# 1) Lowercasing
# NOTE: this overwrites df['description'] in place, so the original casing is not
# available to later cells unless the DataFrame is rebuilt.
df['description'] = df['description'].str.lower()
df['description']

0       imprisoned in the 1940s for the double murder ...
1       spanning the years 1945 to 1955, a chronicle o...
2       in the continuing saga of the corleone crime f...
3       the true story of how businessman oskar schind...
4       the defense and the prosecution have rested an...
                              ...                        
9415    summer, new york city. a college girl falls ha...
9416    a former sports star who's fallen on hard time...
9417    nearly a decade after occupation by an extrate...
9418    a radio host is victimised by the notorious ca...
9419    a chicago detective travels to scotland after ...
Name: description, Length: 9420, dtype: object

In [6]:
# 2) Checking if the column has html tags or not

import re

# Matches an opening, closing or self-closing tag (<p>, </div>, <br/>) or an HTML comment.
# A tag name must start with a letter, so plain comparisons such as "3 < 5 > 1"
# are not reported as HTML.
_HTML_TAG_PATTERN = re.compile(r'</?[A-Za-z][^<>]*>|<!--.*?-->', re.DOTALL)


def has_html(text):
    """Return True when `text` contains something that looks like an HTML tag.

    Args:
        text: a cell value; anything that is not a string (NaN, None, numbers)
            is treated as containing no HTML.

    Returns:
        bool: True if at least one HTML tag or HTML comment is found.
    """
    if not isinstance(text, str):  # Handles empty cells and other non-text values
        return False
    return bool(_HTML_TAG_PATTERN.search(text))


# Flag each description with True/False depending on whether has_html finds markup in it.
df['has_html'] = df['description'].map(has_html)

# Number of flagged rows (each True counts as 1 when summed).
html_count = df['has_html'].sum()
print(f"Number of rows with HTML tags: {html_count}")

# Keep only the flagged rows for inspection.
html_mask = df['has_html'].eq(True)
df_with_html = df[html_mask]
print(df_with_html)

# No HTML tags were found, so there is nothing to remove.

Number of rows with HTML tags: 0
Empty DataFrame
Columns: [movie_name, description, genre, has_html]
Index: []


In [7]:
# 3) Removing Url , but first checking if there are urls or not

def has_url(text):
    """Return True when `text` contains an http(s):// link or a www. address.

    Missing values (NaN/None) are reported as False.
    """
    if pd.isna(text):  # Empty cell: nothing to search
        return False
    found = re.search(r'https?://\S+|www\.\S+', text)
    return found is not None

# Create the has_url column: True if the description contains a URL, False if it does not.
df['has_url'] = df['description'].map(has_url)

# Number of flagged rows (each True counts as 1 when summed).
url_count = df['has_url'].sum()
print(f"Number of rows with URLs: {url_count}")

# Keep only the flagged rows for inspection.
url_mask = df['has_url'].eq(True)
df_with_urls = df[url_mask]
print(df_with_urls)


# No URLs were found, so there is nothing to remove.

Number of rows with URLs: 0
Empty DataFrame
Columns: [movie_name, description, genre, has_html, has_url]
Index: []


In [8]:
# 4) Removing Punctuations

import string
import unicodedata

# ASCII punctuation and symbol characters that are always removed.
exclude = string.punctuation


def remove_punc(text):
    """Return `text` with punctuation characters removed.

    A character is dropped when it is listed in `exclude` (ASCII punctuation)
    or when its Unicode category is a punctuation category ("P*"). The second
    rule covers typographic characters such as curly quotes, dashes and
    ellipses, which `string.punctuation` does not contain.

    Args:
        text: a cell value; missing values (NaN/None) yield an empty string.

    Returns:
        str: the text without punctuation. Characters are deleted, not replaced
        by spaces, so "who's" becomes "whos".
    """
    # .translate()/iteration cannot be applied to NaN, so missing cells are handled first.
    if pd.isna(text):
        return ""
    return "".join(
        ch for ch in text
        if ch not in exclude and not unicodedata.category(ch).startswith("P")
    )


# Store the punctuation-free text in its own column, leaving df['description'] unchanged.
df['description_clean'] = df['description'].map(remove_punc)

df['description_clean']

0       imprisoned in the 1940s for the double murder ...
1       spanning the years 1945 to 1955 a chronicle of...
2       in the continuing saga of the corleone crime f...
3       the true story of how businessman oskar schind...
4       the defense and the prosecution have rested an...
                              ...                        
9415    summer new york city a college girl falls hard...
9416    a former sports star whos fallen on hard times...
9417    nearly a decade after occupation by an extrate...
9418    a radio host is victimised by the notorious ca...
9419    a chicago detective travels to scotland after ...
Name: description_clean, Length: 9420, dtype: object

In [9]:
# 5) Chatwords handling

# Maps an upper-case chat abbreviation to its expansion.
# Apostrophes in the expansions are plain ASCII (') so every value is valid,
# consistently encoded text.
chatwords_dict = {
    'A3': 'Anytime, Anywhere, Anyplace',
    'ADIH': 'Another Day In Hell',
    'AFK': 'Away From Keyboard',
    'AFAIK': 'As Far As I Know',
    'ASAP': 'As Soon As Possible',
    'ASL': 'Age, Sex, Location',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'BAE': 'Before Anyone Else',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRUH': 'Bro',
    'BRT': 'Be Right There',
    'BSAAW': 'Big Smile And A Wink',
    'BTW': 'By The Way',
    'BWL': 'Bursting With Laughter',
    'CSL': "Can't Stop Laughing",
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'DM': 'Direct Message',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FIMH': 'Forever In My Heart',
    'FOMO': 'Fear Of Missing Out',
    'FR': 'For Real',
    'FWIW': "For What It's Worth",
    'FYP': 'For You Page',
    'FYI': 'For Your Information',
    'G9': 'Genius',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GMTA': 'Great Minds Think Alike',
    'GN': 'Good Night',
    'GOAT': 'Greatest Of All Time',
    'GR8': 'Great!',
    'HBD': 'Happy Birthday',
    'IC': 'I See',
    'ICQ': 'I Seek You',
    'IDC': "I Don't Care",
    'IDK': "I Don't Know",
    'IFYP': 'I Feel Your Pain',
    'ILU': 'I Love You',
    'ILY': 'I Love You',
    'IMHO': 'In My Honest/Humble Opinion',
    'IMU': 'I Miss You',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'IYKYK': 'If You Know, You Know',
    'JK': 'Just Kidding',
    'KISS': 'Keep It Simple, Stupid',
    'L': 'Loss',
    'L8R': 'Later',
    'LDR': 'Long Distance Relationship',
    'LMK': 'Let Me Know',
    'LMAO': 'Laughing My A** Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'M8': 'Mate',
    'MFW': 'My Face When',
    'MID': 'Mediocre',
    'MRW': 'My Reaction When',
    'MTE': 'My Thoughts Exactly',
    'NVM': 'Never Mind',
    'NRN': 'No Reply Necessary',
    'NPC': 'Non-Player Character',
    'OIC': 'Oh I See',
    'OP': 'Overpowered',
    'PITA': 'Pain In The A**',
    'POV': 'Point Of View',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My A** Off',
    'RN': 'Right Now',
    'SK8': 'Skate',
    'STATS': 'Your Sex And Age',
    'SUS': 'Suspicious',
    'TBH': 'To Be Honest',
    'TFW': 'That Feeling When',
    'THX': 'Thank You',
    'TIME': 'Tears In My Eyes',
    'TLDR': "Too Long, Didn't Read",
    'TNTL': 'Trying Not To Laugh',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'W': 'Win',
    'W8': 'Wait...',
    'WB': 'Welcome Back',
    'WTF': 'What The F**k',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'WYD': 'What You Doing?',
    'WYWH': 'Wish You Were Here',
    'ZZZ': 'Sleeping, Bored, Tired',
    'SMH': 'Shaking My Head',
    'AF': 'As F**k',
    'TMI': 'Too Much Information',
    'BFF': 'Best Friends Forever',
    'YOLO': 'You Only Live Once',
    'ICYMI': 'In Case You Missed It',
    'OTP': 'One True Pairing',
    'IKR': 'I Know, Right?',
    'NBD': 'No Big Deal',
    'FML': 'F**k My Life',
    'GTG': 'Got To Go',
    'IDTS': "I Don't Think So",
    'HBU': 'How About You?',
    'TTYT': 'Talk To You Tomorrow',
    'OMW': 'On My Way'
}


In [10]:
def replace_chatwords(text, chat_dict):
    """Expand chat abbreviations found in `text`.

    The text is split on whitespace and each word is upper-cased before being
    looked up in `chat_dict`, so the match ignores the word's case. Words
    without an entry are kept exactly as they were.

    Args:
        text: the sentence to process; missing values (NaN/None) yield "".
        chat_dict: mapping from upper-case abbreviation to its expansion.

    Returns:
        str: the words joined back together with single spaces.
    """
    # Missing values become an empty string.
    if pd.isna(text):
        return ""

    expanded = []
    for token in text.split():
        # dict.get falls back to the original token when the key is absent.
        expanded.append(chat_dict.get(token.upper(), token))

    # Re-assemble the sentence with one space between words.
    return " ".join(expanded)


In [11]:
# The descriptions were lowercased earlier and replace_chatwords upper-cases every
# word before the lookup, so an abbreviation key that is also an ordinary word,
# a single-letter initial or a common non-chat acronym would be expanded by mistake
# (for example "time" -> "Tears In My Eyes"). Those keys are left out here.
# Extend this set if other collisions show up in the data.
AMBIGUOUS_CHATWORD_KEYS = {
    'TIME', 'KISS', 'GOAT', 'MID', 'GAL', 'STATS', 'PITA', 'OP',  # ordinary words
    'L', 'U', 'W',                                                  # single letters / initials
    'ATM', 'FC',                                                    # common non-chat acronyms
}
chatwords_safe = {
    key: expansion
    for key, expansion in chatwords_dict.items()
    if key not in AMBIGUOUS_CHATWORD_KEYS
}

df['description_clean_chatwords'] = df['description_clean'].apply(lambda x: replace_chatwords(x, chatwords_safe))

df['description_clean_chatwords']


0       imprisoned in the 1940s for the double murder ...
1       spanning the years 1945 to 1955 a chronicle of...
2       in the continuing saga of the corleone crime f...
3       the true story of how businessman oskar schind...
4       the defense and the prosecution have rested an...
                              ...                        
9415    summer new york city a college girl falls hard...
9416    a former sports star whos fallen on hard times...
9417    nearly a decade after occupation by an extrate...
9418    a radio host is victimised by the notorious ca...
9419    a chicago detective travels to scotland after ...
Name: description_clean_chatwords, Length: 9420, dtype: object

In [12]:
# 6) TMDb movie descriptions are written professionally and almost never include
#    emojis (e.g. U+1F602, U+2764, U+1F525), but check whether any are present.
# The character class covers the main emoji blocks, not only the "Emoticons" block
# (U+1F600-U+1F64F), which would miss hearts, fire, flags and similar symbols.
# It is an approximation meant for a quick sanity check, not an exhaustive emoji matcher.
EMOJI_PATTERN = (
    r'['
    r'\U0001F300-\U0001FAFF'  # pictographs, emoticons, transport, supplemental symbols
    r'\U0001F1E6-\U0001F1FF'  # regional indicator letters (flags)
    r'\u2600-\u27BF'          # miscellaneous symbols and dingbats
    r']'
)

# na=False: a missing description counts as "no emoji" instead of putting NaN in the column.
df['has_emoji'] = df['description'].str.contains(EMOJI_PATTERN, regex=True, na=False)

df['has_emoji'].sum()




0

#### 7) Spelling correction slows processing and may distort movie titles or names (e.g. "Thor" -> corrected to "Thorn"), so it is not needed

In [13]:
# When you start ML modeling later, first do tokenization and then lemmatization. Stemming is not necessary for movie descriptions, as it makes phrases less meaningful.

In [14]:
# Tokenization
# Tokenization + Lemmatization ‚Üí Done together in ONE step ‚Üí Saved into ONE final column



import spacy
# Load the "en_core_web_sm" spaCy pipeline with its "parser" and "ner" components disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def spacy_tokenize_lemmatize(text):
    """Tokenize `text` with the module-level `nlp` pipeline and return its lemmas.

    Missing values and empty or whitespace-only strings yield an empty string.
    Whitespace tokens are left out; the remaining lemmas are joined by single spaces.
    """
    is_blank = pd.isna(text) or text.strip() == ""
    if is_blank:
        return ""

    lemmas = []
    for token in nlp(text):  # iterating the processed document yields its tokens
        if token.is_space:
            continue
        lemmas.append(token.lemma_)

    return " ".join(lemmas)

# Lemmatize the chat-word-expanded text row by row and keep the result in its own column.
df["description_final"] = df["description_clean_chatwords"].map(spacy_tokenize_lemmatize)

df["description_final"]


0       imprison in the 1940 for the double murder of ...
1       span the year 1945 to 1955 a chronicle of the ...
2       in the continue saga of the corleone crime fam...
3       the true story of how businessman oskar schind...
4       the defense and the prosecution have rest and ...
                              ...                        
9415    summer new york city a college girl fall hard ...
9416    a former sport star who s fall on hard time st...
9417    nearly a decade after occupation by an extrate...
9418    a radio host be victimise by the notorious can...
9419    a chicago detective travel to scotland after a...
Name: description_final, Length: 9420, dtype: object