In [1]:
import requests
import pandas as pd

# Step 1: Fetch genre list
genre_url = 'http://api.themoviedb.org/3/genre/movie/list?api_key=237af7c9dea012a4a6ec3f40e5f87c2a&language=en-US'
genre_response = requests.get(genre_url)

if genre_response.status_code == 200:
    genre_data = genre_response.json()['genres']  # Extract genre list
else:
    print("Failed to fetch genre data")
    genre_data = []

# Step 2: Create a dictionary to map genre_id to genre_name
genre_dict = {genre['id']: genre['name'] for genre in genre_data}

# Step 3: Initialize an empty list to store movie data
movie_data = []

# Step 4: Fetch movie data from multiple pages
for i in range(1, 499):  # Iterating through pages
    movie_url = f'http://api.themoviedb.org/3/movie/top_rated?api_key=237af7c9dea012a4a6ec3f40e5f87c2a&language=en-US&page={i}'
    response = requests.get(movie_url)

    if response.status_code == 200:
        results = response.json()['results']

        # Process each movie entry
        for movie in results:
            movie_entry = {
                'title': movie['title'],
                'overview': movie['overview'],
                'genres': ', '.join([genre_dict.get(genre_id, "Unknown") for genre_id in movie['genre_ids']])  # Map genre IDs to names
            }
            movie_data.append(movie_entry)
    else:
        print(f"Failed to fetch movie data for page {i}")

# Step 5: Convert the collected data into a Pandas DataFrame
df = pd.DataFrame(movie_data)

# Step 6: Save DataFrame as CSV file
df.to_csv("movies_with_genres.csv", index=False)

# Step 7: Display the first few rows
print(df.head(5))


                      title  \
0  The Shawshank Redemption   
1             The Godfather   
2     The Godfather Part II   
3          Schindler's List   
4              12 Angry Men   

                                            overview               genres  
0  Imprisoned in the 1940s for the double murder ...         Drama, Crime  
1  Spanning the years 1945 to 1955, a chronicle o...         Drama, Crime  
2  In the continuing saga of the Corleone crime f...         Drama, Crime  
3  The true story of how businessman Oskar Schind...  Drama, History, War  
4  The defense and the prosecution have rested an...                Drama  


In [5]:
from tabulate import tabulate

In [6]:
print(tabulate(df, headers="keys", tablefmt="grid"))

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [2]:
import numpy as np
import pandas as pd
import re

In [3]:
df = pd.read_csv("movies_with_genres.csv", encoding="cp1252", quoting=0, escapechar="\\", on_bad_lines="warn", dtype=str)

In [4]:
df.head(10)

Unnamed: 0,title,overview,genres
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,Drama
5,Spirited Away,"A young girl, Chihiro, becomes trapped in a st...","Animation, Family, Fantasy"
6,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy, Drama, Romance"
7,The Dark Knight,Batman raises the stakes in his war on crime. ...,"Drama, Action, Crime, Thriller"
8,The Green Mile,A supernatural tale set on death row in a Sout...,"Fantasy, Drama, Crime"
9,Parasite,"All unemployed, Ki-taek's family takes peculia...","Comedy, Thriller, Drama"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9960 entries, 0 to 9959
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     9960 non-null   object
 1   overview  9960 non-null   object
 2   genres    9956 non-null   object
dtypes: object(3)
memory usage: 233.6+ KB


In [5]:
df.shape

(9960, 3)

Lower Casing

In [6]:
df['title']= df['title'].str.lower()
df['title']

0       the shawshank redemption
1                  the godfather
2          the godfather part ii
3               schindler's list
4                   12 angry men
                  ...           
9955               baby geniuses
9956        hercules in new york
9957               the love guru
9958                  bloodrayne
9959          inspector gadget 2
Name: title, Length: 9960, dtype: object

In [7]:
df['overview']= df['overview'].str.lower()
df['overview']

0       imprisoned in the 1940s for the double murder ...
1       spanning the years 1945 to 1955, a chronicle o...
2       in the continuing saga of the corleone crime f...
3       the true story of how businessman oskar schind...
4       the defense and the prosecution have rested an...
                              ...                        
9955    babyco is the world's leading manufacturer in ...
9956    hercules is sent from mount olympus to modern-...
9957    born in america and raised in an indian ashram...
9958    in 18th-century romania, after spending much o...
9959    after capturing claw, all the criminals have g...
Name: overview, Length: 9960, dtype: object

In [8]:
df['genres']= df['genres'].str.lower()
df['genres']

0                            drama, crime
1                            drama, crime
2                            drama, crime
3                     drama, history, war
4                                   drama
                      ...                
9955      science fiction, comedy, family
9956           comedy, adventure, fantasy
9957                      comedy, romance
9958                      fantasy, horror
9959    action, adventure, comedy, family
Name: genres, Length: 9960, dtype: object

HTML Tags Removal

In [9]:
import re
def remove_html_tags(text):
    pattern= re.compile('<.*?>')
    return pattern.sub(r'', text)

In [10]:
df['overview'].apply(remove_html_tags)  #no need of this bc there are no html tags

0       imprisoned in the 1940s for the double murder ...
1       spanning the years 1945 to 1955, a chronicle o...
2       in the continuing saga of the corleone crime f...
3       the true story of how businessman oskar schind...
4       the defense and the prosecution have rested an...
                              ...                        
9955    babyco is the world's leading manufacturer in ...
9956    hercules is sent from mount olympus to modern-...
9957    born in america and raised in an indian ashram...
9958    in 18th-century romania, after spending much o...
9959    after capturing claw, all the criminals have g...
Name: overview, Length: 9960, dtype: object

URLs Removal

In [11]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')  # Corrected regex
    return pattern.sub('', text)

In [12]:
df['overview'].apply(remove_url)  # no need of this too

0       imprisoned in the 1940s for the double murder ...
1       spanning the years 1945 to 1955, a chronicle o...
2       in the continuing saga of the corleone crime f...
3       the true story of how businessman oskar schind...
4       the defense and the prosecution have rested an...
                              ...                        
9955    babyco is the world's leading manufacturer in ...
9956    hercules is sent from mount olympus to modern-...
9957    born in america and raised in an indian ashram...
9958    in 18th-century romania, after spending much o...
9959    after capturing claw, all the criminals have g...
Name: overview, Length: 9960, dtype: object

Remove Punctuation

In [13]:
import string

def remove_punc1(text):
    exclude = string.punctuation  # Define punctuation to remove
    return text.translate(str.maketrans('', '', exclude))



In [14]:
df['overview'] = df['overview'].apply(remove_punc1)


In [15]:
df['overview'][2]

'in the continuing saga of the corleone crime family a young vito corleone grows up in sicily and in 1910s new york in the 1950s michael corleone attempts to expand the family business into las vegas hollywood and cuba'

In [16]:
df['overview'][5]

'a young girl chihiro becomes trapped in a strange new world of spirits when her parents undergo a mysterious transformation she must call upon the courage she never knew she had to free her family'

In [17]:
# No need of chat word treatment also bc the dataset does not contain any chat word as of it

Spelling Correction

In [5]:
from textblob import TextBlob

In [6]:
# Function to correct spelling
def correct_spelling(text):
    if isinstance(text, str):  # Ensure input is a string
        return str(TextBlob(text).correct())
    return text  # Return the original value if not a string


In [7]:
df.loc[:4, 'overview'] = df.loc[:4, 'overview'].apply(correct_spelling)


In [8]:
print(df['overview'][3])

The true story of how businessman Scar Schindler saved over a thousand Jewish lives from the Paris while they worked as slaves in his factory during World War of.


Removing Stop Words

In [9]:
import nltk
from nltk.corpus import stopwords

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [11]:
stop_words = set(stopwords.words('english'))

In [12]:
# Function to remove stopwords
def remove_stopwords(text):
    if isinstance(text, str):  # Ensure input is a string
        return " ".join([word for word in text.split() if word.lower() not in stop_words])
    return text  # Return original text if it's not a string

In [13]:
df.loc[:4, 'title_cleaned'] = df.loc[:4, 'title'].apply(remove_stopwords)
df.loc[:4, 'overview_cleaned'] = df.loc[:4, 'overview'].apply(remove_stopwords)

In [15]:
print(df[['title', 'title_cleaned', 'overview','overview_cleaned']].head(1))

                      title         title_cleaned  \
0  The Shawshank Redemption  Shawshank Redemption   

                                            overview  \
0  Imprisoned in the 1940s for the double murder ...   

                                    overview_cleaned  
0  Imprisoned 1940s double murder wife lover, sta...  


In [16]:
# Handling Emojis , no need of this

Tokenization using spacy

In [17]:
import spacy

In [18]:
# Load English model
nlp = spacy.load("en_core_web_sm")

In [19]:
def tokenize_text(text):
    if isinstance(text, str):  # Ensure input is a string
        doc = nlp(text)
        return [token.text for token in doc]  # Extract tokenized words
    return text

In [20]:
df.loc[:3, 'tokenized_overview'] = df.loc[:3, 'overview'].apply(tokenize_text)

In [21]:
print(df[['overview', 'tokenized_overview']].head())

                                            overview  \
0  Imprisoned in the 1940s for the double murder ...   
1  Spanning the years 1945 to 1955, a chronicle o...   
2  In the continuing sage of the Corleone crime f...   
3  The true story of how businessman Scar Schindl...   
4  The defense and the prosecution have rested an...   

                                  tokenized_overview  
0  [Imprisoned, in, the, 1940s, for, the, double,...  
1  [Spanning, the, years, 1945, to, 1955, ,, a, c...  
2  [In, the, continuing, sage, of, the, Corleone,...  
3  [The, true, story, of, how, businessman, Scar,...  
4                                                NaN  


Stemming

In [22]:
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()

In [23]:
def stem_words(text):
    if isinstance(text, str): 
        return " ".join([ps.stem(word) for word in text.split()])
    return text

In [24]:
df.loc[:3, 'stemmed_overview'] = df.loc[:3, 'overview'].apply(stem_words)

In [25]:
print(df[['overview', 'stemmed_overview']].head())

                                            overview  \
0  Imprisoned in the 1940s for the double murder ...   
1  Spanning the years 1945 to 1955, a chronicle o...   
2  In the continuing sage of the Corleone crime f...   
3  The true story of how businessman Scar Schindl...   
4  The defense and the prosecution have rested an...   

                                    stemmed_overview  
0  imprison in the 1940 for the doubl murder of h...  
1  span the year 1945 to 1955, a chronicl of the ...  
2  in the continu sage of the corleon crime famil...  
3  the true stori of how businessman scar schindl...  
4                                                NaN  


Lemmatization

In [26]:
import nltk

In [27]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [28]:
wordnet_lemmatizer = WordNetLemmatizer()

In [29]:
def lemmatize_text(text):
    if isinstance(text, str): 
        words = word_tokenize(text)  # Tokenize text
        lemmatized_words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]  
        return " ".join(lemmatized_words) 
    return text

In [30]:
df.loc[:3,'lemmatized_overview'] = df.loc[:3, 'overview'].apply(lemmatize_text)

In [37]:
pd.set_option('display.max_colwidth', None)  # Allow full text display
print(df[['overview', 'lemmatized_overview']].head(1))


                                                                                                                                                                                                                                                                                                                                                                                               overview  \
0  Imprisoned in the 1940s for the double murder of his wife and her lover, standing banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skill to work for an moral garden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.   

                                                                                                                                                                                                                 