In [49]:
import os
import re

import pandas as pd
import requests

In [32]:
# Empty frame that the page-fetch loop below concatenates each page of results onto.
df = pd.DataFrame()

In [33]:
# Download the TMDB "top rated" listing one page at a time, keeping only the
# columns used later in the notebook.
# The API key is read from the environment so it is not stored in the notebook.
TMDB_API_KEY = os.environ['TMDB_API_KEY']
TOP_RATED_URL = 'https://api.themoviedb.org/3/movie/top_rated'
LAST_PAGE = 428  # pages 1..428 are requested

page_frames = []
for i in range(1, LAST_PAGE + 1):
    response = requests.get(
        TOP_RATED_URL,
        params={'api_key': TMDB_API_KEY, 'language': 'en-US', 'page': i},
        timeout=30,
    )
    # Fail on HTTP errors (bad key, rate limiting) here rather than with a
    # KeyError on 'results' further down.
    response.raise_for_status()
    page_frames.append(
        pd.DataFrame(response.json()['results'])[['id', 'title', 'overview', 'genre_ids']]
    )

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously collected rows on every iteration.
df = pd.concat([df, *page_frames], ignore_index=True)



In [34]:
df.head()

Unnamed: 0,id,title,overview,genre_ids
0,278,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[18, 80]"
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]"
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]"
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]"
4,389,12 Angry Men,The defense and the prosecution have rested an...,[18]


In [35]:
# Fetch TMDB's movie-genre list and build an id -> name lookup table.
# The API key is read from the environment instead of being embedded in the URL.
genre_url = 'https://api.themoviedb.org/3/genre/movie/list'
g_response = requests.get(
    genre_url,
    params={'api_key': os.environ['TMDB_API_KEY'], 'language': 'en-US'},
    timeout=30,
)
# Surface HTTP failures here instead of as a KeyError on 'genres'.
g_response.raise_for_status()
genres = g_response.json()['genres']

genre_dict = {g['id']: g['name'] for g in genres}

In [36]:
def _genre_names(genre_id_list):
    """Translate one row's list of genre ids into the matching genre names."""
    return [genre_dict[genre_id] for genre_id in genre_id_list]

df['genres'] = df['genre_ids'].apply(_genre_names)


In [37]:
# Collapse each row's list of genre names into one comma-separated string.
# Values that are already strings are left alone, so re-running this cell does
# not split an existing "Drama, Crime" into individual characters.
df['genres'] = df['genres'].apply(
    lambda x: x if isinstance(x, str) else ", ".join(x)
)


In [38]:
# The raw id columns are not needed once the genre names exist.
# errors='ignore' keeps the cell safe to re-run after the columns are gone, and
# reassigning avoids mutating the frame in place.
df = df.drop(columns=['genre_ids', 'id'], errors='ignore')

In [41]:
# Write the current frame to disk (pandas also writes the row index by default).
csv_path = "MovieDataset.csv"
df.to_csv(csv_path)

In [43]:
df.shape

(8560, 3)

# NLP Preprocessing Pipeline Steps
1) Lowercasing
2) Remove HTML tags
3) Remove URLs
4) Remove punctuation
5) Chat-word treatment (not needed: the movie overviews are already professionally written)
6) Spelling check (not needed: the movie overviews are already professionally written)
7) Remove stop words
8) Handle emojis (not needed)
9) Tokenization
10) Stemming

In [46]:
df.head()

Unnamed: 0,title,overview,genres
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,Drama


In [48]:
# Step 1: lowercase every overview.
overview_lower = df['overview'].str.lower()
df['overview'] = overview_lower

In [59]:
def remove_html_tags(text):
    """Strip tag-like markup (a '<', one or more non-'>' characters, then '>') from ``text``.

    Non-string values (for example NaN for a missing overview) are returned
    unchanged instead of raising ``TypeError``, matching the other cleaning
    helpers in this notebook.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'<[^>]+>', '', text)

# Step 2: strip HTML tags from every overview.
df['overview'] = df['overview'].map(remove_html_tags)

In [60]:
def remove_urls(text):
    """Delete ``http://``, ``https://`` and ``www.`` links from ``text``.

    Anything that is not a string is handed back untouched.
    """
    if not isinstance(text, str):
        return text
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

# Step 3: remove links from every overview.
df['overview'] = df['overview'].map(remove_urls)

In [63]:
import string

exclude = string.punctuation

# Deletion table for str.translate, built once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', exclude)

def remove_punc(text):
    """Delete every character of ``string.punctuation`` from ``text``.

    Characters are removed, not replaced by a space, so "well-known" becomes
    "wellknown". Non-string values (e.g. NaN) are returned unchanged, matching
    the other cleaning helpers in this notebook.
    """
    if not isinstance(text, str):
        return text
    return text.translate(_PUNCT_TABLE)

# Step 4: strip punctuation from every overview.
df['overview'] = df['overview'].map(remove_punc)

In [69]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus.
nltk.download('stopwords')

# Build the English stop-word set a single time; it is reused for every row.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def remove_stop_words(text):
    """Drop English stop words from ``text`` and re-join the rest with single spaces.

    Words are compared in lowercase against ``stop_words``; the words that
    survive keep their original casing. Non-string input is returned as-is.
    """
    if isinstance(text, str):
        kept = []
        for token in text.split():
            if token.lower() not in stop_words:
                kept.append(token)
        return " ".join(kept)
    return text

# Step 7: filter stop words out of every overview.
df['overview'] = df['overview'].map(remove_stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sukku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [72]:
import spacy
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    """Split ``text`` into a list of token strings using spaCy's tokenizer.

    Returns an empty list for anything that is not a string.
    """
    if not isinstance(text, str):
        return []
    # make_doc runs only the tokenizer. Calling nlp(text) would also run the
    # rest of the loaded pipeline on every row, and none of its annotations
    # are used here because only token.text is read.
    doc = nlp.make_doc(text)
    return [token.text for token in doc]
    
# Step 9: tokenize every cleaned overview into a list of strings.
df['tokens'] = df['overview'].map(tokenize)

In [74]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_tokens(token_list):
    """Return the Porter stem of each token in ``token_list``, in the same order."""
    stems = []
    for word in token_list:
        stems.append(ps.stem(word))
    return stems

# Step 10: stem every token list.
df['stemized'] = df['tokens'].map(stem_tokens)

In [75]:
df.head()

Unnamed: 0,title,overview,genres,tokens,stemized
0,The Shawshank Redemption,imprisoned 1940s double murder wife lover upst...,"Drama, Crime","[imprisoned, 1940s, double, murder, wife, love...","[imprison, 1940, doubl, murder, wife, lover, u..."
1,The Godfather,spanning years 1945 1955 chronicle fictional i...,"Drama, Crime","[spanning, years, 1945, 1955, chronicle, ficti...","[span, year, 1945, 1955, chronicl, fiction, it..."
2,The Godfather Part II,continuing saga corleone crime family young vi...,"Drama, Crime","[continuing, saga, corleone, crime, family, yo...","[continu, saga, corleon, crime, famili, young,..."
3,Schindler's List,true story businessman oskar schindler saved t...,"Drama, History, War","[true, story, businessman, oskar, schindler, s...","[true, stori, businessman, oskar, schindler, s..."
4,12 Angry Men,defense prosecution rested jury filing jury ro...,Drama,"[defense, prosecution, rested, jury, filing, j...","[defens, prosecut, rest, juri, file, juri, roo..."
