In [1]:
import numpy as np 
import pandas as pd
import warnings 
# Suppress every warning category for the remainder of the session.
warnings.simplefilter('ignore')

In [2]:
# Read the two TMDB CSV exports from the current working directory.
movies, credits = (
    pd.read_csv(csv_name)
    for csv_name in ("tmdb_5000_movies.csv", "tmdb_5000_credits.csv")
)

In [3]:
# Print the column names, non-null counts and dtypes of the movies table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [4]:
# Print the column names, non-null counts and dtypes of the credits table.
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [5]:
# Inner join: movies.id is matched against credits.movie_id.
merged_df = movies.merge(
    credits,
    how="inner",
    left_on="id",
    right_on="movie_id",
)

In [6]:
# (rows, columns) of the merged frame.
merged_df.shape

(4803, 24)

In [7]:
# Keep only the columns that feed the tag string (plus id and title).
kept_columns = [
    'id',
    'genres',
    'keywords',
    'overview',
    'title_x',
    'cast',
    'crew',
    'production_companies',
]
new_df = merged_df[kept_columns]

In [8]:
# Rename the suffixed title_x column to plain "title".
new_df = new_df.rename({'title_x': 'title'}, axis="columns")

In [9]:
import ast
def get_names(obj):
    """Extract the ``name`` fields from a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Python-literal text for a list of dicts, each carrying a ``"name"``
        key; it is parsed with ``ast.literal_eval``.

    Returns
    -------
    str
        The distinct, non-empty names in first-seen order, joined with ``","``.
        Each name is lower-cased, has its commas removed, and has its
        whitespace-separated words joined with ``"_"``.

    Raises
    ------
    ValueError, SyntaxError
        If ``obj`` cannot be parsed as a Python literal.
    KeyError
        If an entry has no ``"name"`` key.
    """
    entries = ast.literal_eval(obj)
    seen = set()  # O(1) membership test; `names` keeps first-seen order
    names = []
    for entry in entries:
        # "," is the delimiter of the returned string, so it must not survive
        # inside a name. Splitting on whitespace trims both ends and collapses
        # runs, so padded names do not pick up leading/trailing "_".
        name = "_".join(entry["name"].replace(",", " ").split()).lower()
        if name and name not in seen:
            seen.add(name)
            names.append(name)
    return ",".join(names)

In [10]:
# Replace the raw genres text with its comma-separated name list.
new_df["genres"] = new_df["genres"].map(get_names)

In [11]:
# Parse the raw keywords text; the result is stored in a separate "keyword" column.
new_df["keyword"] = new_df["keywords"].map(get_names)

In [12]:
# Replace the raw cast text with its comma-separated name list.
new_df["cast"] = new_df["cast"].map(get_names)


In [13]:
# Replace the raw crew text with its comma-separated name list.
new_df["crew"] = new_df["crew"].map(get_names)

In [14]:
# Replace the raw production_companies text with its comma-separated name list.
new_df["production_companies"] = new_df["production_companies"].map(get_names)

In [16]:
# The parsed keyword names live in the scratch "keyword" column while
# "keywords" still holds the unparsed text. Move the parsed values into
# "keywords" before discarding the scratch column; dropping it outright
# would leave the raw text to be concatenated into the tags.
new_df = new_df.assign(keywords=new_df["keyword"]).drop(columns="keyword")

In [18]:
# Inspect columns, non-null counts and dtypes after the name columns were parsed.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    4803 non-null   int64 
 1   genres                4803 non-null   object
 2   keywords              4803 non-null   object
 3   overview              4800 non-null   object
 4   title                 4803 non-null   object
 5   cast                  4803 non-null   object
 6   crew                  4803 non-null   object
 7   production_companies  4803 non-null   object
dtypes: int64(1), object(7)
memory usage: 300.3+ KB


In [20]:
# Turn each overview into a comma-separated, lower-cased word list.
# Missing overviews are filled with "" first: str(NaN) would otherwise inject
# the literal token "nan". split() with no argument collapses repeated
# whitespace, so no empty tokens are produced between words.
new_df["overview"] = new_df["overview"].fillna("").apply(
    lambda text: ",".join(str(text).replace(",", "").lower().split())
)


In [22]:
# Re-check non-null counts after the overview column was rewritten.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    4803 non-null   int64 
 1   genres                4803 non-null   object
 2   keywords              4803 non-null   object
 3   overview              4803 non-null   object
 4   title                 4803 non-null   object
 5   cast                  4803 non-null   object
 6   crew                  4803 non-null   object
 7   production_companies  4803 non-null   object
dtypes: int64(1), object(7)
memory usage: 300.3+ KB


In [24]:
# Build one comma-separated tag string per movie. A "," must sit between
# every pair of fields (crew and production_companies included); without it
# the last name of one field fuses with the first name of the next.
# As with "+", a missing value in any field makes that row's tags missing.
new_df["tags"] = new_df["genres"].str.cat(
    new_df[["keywords", "overview", "cast", "crew", "production_companies"]],
    sep=",",
)

In [26]:
# Take an explicit copy: final_df is modified in later cells, and assigning
# into a bare column selection of new_df is a chained-assignment hazard.
final_df = new_df[["id", "title", "tags"]].copy()

In [28]:
# Drop rows with any missing value; rebind instead of mutating in place so
# the cell does not modify a frame derived from new_df.
final_df = final_df.dropna()

In [30]:
# Remove fully duplicated (id, title, tags) rows; rebind rather than using
# inplace=True so the cell does not modify a frame derived from new_df.
final_df = final_df.drop_duplicates()

In [32]:
# NLTK setup for stop-word removal: imports plus resource downloads.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources into the local nltk_data directory.
# NOTE(review): only the 'stopwords' corpus is referenced by the visible
# cells; word_tokenize / punkt / punkt_tab appear unused here — confirm
# against the rest of the notebook before removing.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [34]:
def remove_stop_words(text, stop_words=None):
    """Drop stop words and empty tokens from a comma-separated token string.

    Parameters
    ----------
    text : str
        Tokens separated by ``","``.
    stop_words : collection of str, optional
        Lower-case words to remove. When omitted, NLTK's English stop-word
        list is used; it is loaded once and cached on the function so that
        applying this to a whole column does not re-read the corpus per row.

    Returns
    -------
    str
        The surviving tokens, in their original order and spelling, joined
        with ``","``. Matching against ``stop_words`` is case-insensitive and
        ignores surrounding whitespace.
    """
    if stop_words is None:
        cached = getattr(remove_stop_words, "_english_stop_words", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            remove_stop_words._english_stop_words = cached
        stop_words = cached

    kept = []
    for token in text.split(','):
        key = token.strip().lower()
        # An empty key comes from consecutive or trailing commas; it carries
        # no information, so it is discarded together with the stop words.
        if key and key not in stop_words:
            kept.append(token)
    return ",".join(kept)

In [36]:
# Filter stop words out of every tag string.
final_df["tags"] = final_df["tags"].map(remove_stop_words)

In [38]:
# Persist the id/title/tags table. With to_csv defaults the DataFrame index
# is written as the first, unnamed CSV column.
final_df.to_csv(path_or_buf="cleaned_data.csv")