## Prepare the movie data

In [79]:
import pandas as pd
import numpy as np
import json

In [80]:
# load dataset from the kaggle
import kagglehub

# Download latest version
path = kagglehub.dataset_download("tmdb/tmdb-movie-metadata")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\Development\.cache\kagglehub\datasets\tmdb\tmdb-movie-metadata\versions\2


In [81]:
# load dataset
movie_dataset = pd.read_csv("../dataset/tmdb_5000_movies.csv")
movie_dataset.head()

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/tmdb_5000_movies.csv'

In [None]:
# understand the dataset
movie_dataset.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [None]:
# understand the datatypes
movie_dataset.dtypes

budget                    int64
genres                   object
homepage                 object
id                        int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
dtype: object

## Understand the dataset and make propper colummns

### Genres in dataset

In [None]:
# lets look the important data points in the dataset
movie_dataset["genres"]

0       [{"id": 28, "name": "Action"}, {"id": 12, "nam...
1       [{"id": 12, "name": "Adventure"}, {"id": 14, "...
2       [{"id": 28, "name": "Action"}, {"id": 12, "nam...
3       [{"id": 28, "name": "Action"}, {"id": 80, "nam...
4       [{"id": 28, "name": "Action"}, {"id": 12, "nam...
                              ...                        
4798    [{"id": 28, "name": "Action"}, {"id": 80, "nam...
4799    [{"id": 35, "name": "Comedy"}, {"id": 10749, "...
4800    [{"id": 35, "name": "Comedy"}, {"id": 18, "nam...
4801                                                   []
4802                  [{"id": 99, "name": "Documentary"}]
Name: genres, Length: 4803, dtype: object

In [None]:
# that genre have multiple id numbers and genre names then need to filter all and get only needed values

def filter_genres(row):
    genres = json.loads(row["genres"])
    genre_names = [genre["name"] for genre in genres]

    return genre_names

In [None]:
movie_dataset["genre"] =  movie_dataset.apply(filter_genres, axis = 1)
movie_dataset["genre"].head()

0    [Action, Adventure, Fantasy, Science Fiction]
1                     [Adventure, Fantasy, Action]
2                       [Action, Adventure, Crime]
3                 [Action, Crime, Drama, Thriller]
4             [Action, Adventure, Science Fiction]
Name: genre, dtype: object

### Keywords in dataset

In [None]:
movie_dataset["keywords"]

0       [{"id": 1463, "name": "culture clash"}, {"id":...
1       [{"id": 270, "name": "ocean"}, {"id": 726, "na...
2       [{"id": 470, "name": "spy"}, {"id": 818, "name...
3       [{"id": 849, "name": "dc comics"}, {"id": 853,...
4       [{"id": 818, "name": "based on novel"}, {"id":...
                              ...                        
4798    [{"id": 5616, "name": "united states\u2013mexi...
4799                                                   []
4800    [{"id": 248, "name": "date"}, {"id": 699, "nam...
4801                                                   []
4802    [{"id": 1523, "name": "obsession"}, {"id": 224...
Name: keywords, Length: 4803, dtype: object

In [None]:
# keywords also behave as genre values, 
# There also need to filter all of keywords

def keyword_filter(row):
    keywords = json.loads(row["keywords"])
    keyword_names = [keyword["name"] for keyword in keywords]

    return keyword_names

In [None]:
movie_dataset["keyword"] = movie_dataset.apply(keyword_filter, axis = 1)
movie_dataset["keyword"].head()

0    [culture clash, future, space war, space colon...
1    [ocean, drug abuse, exotic island, east india ...
2    [spy, based on novel, secret agent, sequel, mi...
3    [dc comics, crime fighter, terrorist, secret i...
4    [based on novel, mars, medallion, space travel...
Name: keyword, dtype: object

### Production company details

In [None]:
def production_company_filter(row):
    companies = json.loads(row["production_companies"])
    company_names = [company["name"] for company in companies]

    return company_names

In [None]:
movie_dataset["production_by"] = movie_dataset.apply(production_company_filter, axis = 1)
movie_dataset["production_by"].head()

0    [Ingenious Film Partners, Twentieth Century Fo...
1    [Walt Disney Pictures, Jerry Bruckheimer Films...
2                     [Columbia Pictures, Danjaq, B24]
3    [Legendary Pictures, Warner Bros., DC Entertai...
4                               [Walt Disney Pictures]
Name: production_by, dtype: object

### Production language

In [None]:
def filter_production_language(row):
    languages = json.loads(row["spoken_languages"])
    language_names = [language["name"] for language in languages]

    return language_names

In [None]:
movie_dataset["production_language"] = movie_dataset.apply(filter_production_language, axis = 1)
movie_dataset["production_language"].head()

0                                 [English, Español]
1                                          [English]
2    [Français, English, Español, Italiano, Deutsch]
3                                          [English]
4                                          [English]
Name: production_language, dtype: object

## Makre propper dataset

In [None]:
movie_dataset.columns

processed_dataset = movie_dataset[["original_title", "overview", "tagline", "genre", "keyword", "production_by", "production_language"]]
processed_dataset.head()

Unnamed: 0,original_title,overview,tagline,genre,keyword,production_by,production_language
0,Avatar,"In the 22nd century, a paraplegic Marine is di...",Enter the World of Pandora.,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Ingenious Film Partners, Twentieth Century Fo...","[English, Español]"
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","At the end of the world, the adventure begins.","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Walt Disney Pictures, Jerry Bruckheimer Films...",[English]
2,Spectre,A cryptic message from Bond’s past sends him o...,A Plan No One Escapes,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Columbia Pictures, Danjaq, B24]","[Français, English, Español, Italiano, Deutsch]"
3,The Dark Knight Rises,Following the death of District Attorney Harve...,The Legend Ends,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Legendary Pictures, Warner Bros., DC Entertai...",[English]
4,John Carter,"John Carter is a war-weary, former military ca...","Lost in our world, found in another.","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",[Walt Disney Pictures],[English]


## Load prepared dataset

In [None]:
import os

saving_path = "../dataset/prepared_dataset"
os.makedirs(saving_path, exist_ok = True)

processed_dataset.to_csv(f"{saving_path}/movie_dataset.csv")