In [89]:
import numpy as np
import pandas as pd
import copy
import ast

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv
/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv


<h1 align='center'>Data Preporccessing<h1>

### Step 1: import Datasets

In [3]:
df1 = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
df2 = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')
print(df1.shape)
print(df2.shape)

(4803, 20)
(4803, 4)


### Step 2: Combining Datasets

In [None]:
df1.head(1)

In [None]:
df2.head(1)

we have to convert `movie_id` to `id` before merging these 2 dataset

In [5]:
df2['id'] = df2['movie_id']

In [None]:
df2.head(1)

now combine these two dataset with respect to `id`

In [6]:
df = df1.merge(df2, on='id')
print(df.shape)
print(df.info())

(4803, 24)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status    

In [None]:
# run these one by one to see the data 
df.nunique()
df['status'].value_counts()

In [None]:
df.head(1)

### Step 3: Taking only relevelnt column

In [90]:
movies = df[['id','title_x','overview','genres',
             'keywords','original_language','production_companies',
             'tagline','cast','crew']]
movies2 = movies.copy()

In [96]:
movies2.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Handling missing values

`movies.isnull().sum()` will show the total number of null values in each column

In [97]:
movies2.isnull().sum()

id                        0
title_x                   0
overview                  0
genres                    0
keywords                  0
original_language         0
production_companies      0
tagline                 844
cast                      0
crew                      0
dtype: int64

#### `overview` column

In [98]:
movies2['overview'].isnull().sum()

0

In [99]:
movies2['overview'] = movies2['overview'].fillna(" ")

In [100]:
movies2['overview'].isnull().sum()

0

#### `tagline` column

In [101]:
movies2['tagline'].isnull().sum()

844

In [102]:
movies2['tagline'] = movies2['tagline'].fillna(" ")

In [103]:
movies2['tagline'].isnull().sum()

0

In [106]:
movies2.isnull().sum()

id                        0
title_x                   0
overview                  3
genres                    0
keywords                  0
original_language         0
production_companies      0
tagline                 844
cast                      0
crew                      0
dtype: int64

# Feature engeneering

In [107]:
# first make a copy
movies3 = movies2.copy()

In [105]:
movies2.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


#### Making a funtion to convert `genres`, `keywords`

In [108]:
def convert_genres(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(f"gen_{i['name']}")
    text = ", ".join(List)
    text = text.lower().replace(" ","").replace(","," ")
    return text

def convert_keywords(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    text = ", ".join(List)
    text = text.lower().replace(" ","").replace(",", " ")
    return text

In [109]:
movies3['genres'] = movies3['genres'].apply(convert_genres)

In [110]:
movies3['keywords'] = movies3['keywords'].apply(convert_keywords)

In [111]:
movies2.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Working on `original language`

In [112]:
movies3['original_language'] = "ol_" + movies3['original_language'].astype(str)

In [114]:
movies3.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Working on `production_companies`

In [155]:
movies4 = movies3.copy()

In [156]:
def convert_pc(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    text = ", ".join(List)
    text = text.lower().replace(" ","").replace(","," ").replace("."," ").replace("  "," ")
    return text

In [157]:
movies4['production_companies'] = movies4['production_companies'].apply(convert_pc)

In [153]:
movies4.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,ingeniousfilmpartners twentiethcenturyfoxfilmc...,Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [158]:
# checking production companies
print(movies.iloc[3].title_x)
print(movies.iloc[3].production_companies)
print(movies4.iloc[3].title_x)
print(movies4.iloc[3].production_companies)
# so we will need all production companies name

The Dark Knight Rises
[{"name": "Legendary Pictures", "id": 923}, {"name": "Warner Bros.", "id": 6194}, {"name": "DC Entertainment", "id": 9993}, {"name": "Syncopy", "id": 9996}]
The Dark Knight Rises
legendarypictures warnerbros dcentertainment syncopy


## Working on `cast`

In [239]:
movies5 = movies4.copy()

In [166]:
movies5.head(2)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,ingeniousfilmpartners twentiethcenturyfoxfilmc...,Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",gen_adventure gen_fantasy gen_action,ocean drugabuse exoticisland eastindiatradingc...,ol_en,waltdisneypictures jerrybruckheimerfilms secon...,"At the end of the world, the adventure begins.","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [200]:
movies5.iloc[1].cast
# i need : character (first 5), name (first 5)

'[{"cast_id": 4, "character": "Captain Jack Sparrow", "credit_id": "52fe4232c3a36847f800b50d", "gender": 2, "id": 85, "name": "Johnny Depp", "order": 0}, {"cast_id": 5, "character": "Will Turner", "credit_id": "52fe4232c3a36847f800b511", "gender": 2, "id": 114, "name": "Orlando Bloom", "order": 1}, {"cast_id": 6, "character": "Elizabeth Swann", "credit_id": "52fe4232c3a36847f800b515", "gender": 1, "id": 116, "name": "Keira Knightley", "order": 2}, {"cast_id": 12, "character": "William \\"Bootstrap Bill\\" Turner", "credit_id": "52fe4232c3a36847f800b52d", "gender": 2, "id": 1640, "name": "Stellan Skarsg\\u00e5rd", "order": 3}, {"cast_id": 10, "character": "Captain Sao Feng", "credit_id": "52fe4232c3a36847f800b525", "gender": 2, "id": 1619, "name": "Chow Yun-fat", "order": 4}, {"cast_id": 9, "character": "Captain Davy Jones", "credit_id": "52fe4232c3a36847f800b521", "gender": 2, "id": 2440, "name": "Bill Nighy", "order": 5}, {"cast_id": 7, "character": "Captain Hector Barbossa", "credit_

In [240]:
def top_cast(obj):
    data = ast.literal_eval(obj)
    top5 = data[:5]
    top5_cast = [] 
    for member in top5:
        name = member['name'].replace(' ','').replace('"'," ").replace("."," ").replace("-","").lower()
        character = member['character'].replace(' ','').replace('"'," ").replace("."," ").replace("-","").lower()
        top5_cast.append(f"{name} {character}")
    return " ".join(top5_cast) 

In [241]:
movies5['cast'] = movies5['cast'].apply(top_cast)

In [242]:
movies5.iloc[1].cast

'johnnydepp captainjacksparrow orlandobloom willturner keiraknightley elizabethswann stellanskarsgård william bootstrapbill turner chowyunfat captainsaofeng'

## Working on `crew`

In [253]:
movies6 = movies5.copy()

In [245]:
movies6.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,ingeniousfilmpartners twentiethcenturyfoxfilmc...,Enter the World of Pandora.,samworthington jakesully zoesaldana neytiri si...,"[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [246]:
movies6.iloc[1].crew

'[{"credit_id": "52fe4232c3a36847f800b579", "department": "Camera", "gender": 2, "id": 120, "job": "Director of Photography", "name": "Dariusz Wolski"}, {"credit_id": "52fe4232c3a36847f800b4fd", "department": "Directing", "gender": 2, "id": 1704, "job": "Director", "name": "Gore Verbinski"}, {"credit_id": "52fe4232c3a36847f800b54f", "department": "Production", "gender": 2, "id": 770, "job": "Producer", "name": "Jerry Bruckheimer"}, {"credit_id": "52fe4232c3a36847f800b503", "department": "Writing", "gender": 2, "id": 1705, "job": "Screenplay", "name": "Ted Elliott"}, {"credit_id": "52fe4232c3a36847f800b509", "department": "Writing", "gender": 2, "id": 1706, "job": "Screenplay", "name": "Terry Rossio"}, {"credit_id": "52fe4232c3a36847f800b57f", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "52fe4232c3a36847f800b585", "department": "Editing", "gender": 2, "id": 1722, "job": "Editor", "name": "Craig Wood"}, {"credit_id": "52f

In [259]:
def fetch_director(obj):
    data = ast.literal_eval(obj)
    director = []
    for i in data:
        if i['job'] == 'Director':
            name = i['name'].replace(' ','').replace('"'," ").replace("."," ").replace("-","").lower()
            director.append(f"{name}")
    return " ".join(director)

In [260]:
movies6['crew'] = movies5['crew'].apply(fetch_director)

In [262]:
movies6.iloc[6].crew

'byronhoward nathangreno'

In [265]:
movies6.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,ingeniousfilmpartners twentiethcenturyfoxfilmc...,Enter the World of Pandora.,samworthington jakesully zoesaldana neytiri si...,jamescameron


## convert all column into list

In [280]:
movies7 = movies6.copy()

making a title for later

In [283]:
movies7['title'] = movies7['title_x']

In [285]:
columns = ['title_x','overview','genres','keywords',
           'original_language','production_companies',
           'tagline','cast','crew']
for i in columns:
    movies7[i] = movies7[i].apply(lambda x:x.lower().split())

In [286]:
movies7.head(2)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew,title
0,19995,[avatar],"[in, the, 22nd, century,, a, paraplegic, marin...","[gen_action, gen_adventure, gen_fantasy, gen_s...","[cultureclash, future, spacewar, spacecolony, ...",[ol_en],"[ingeniousfilmpartners, twentiethcenturyfoxfil...","[enter, the, world, of, pandora.]","[samworthington, jakesully, zoesaldana, neytir...",[jamescameron],Avatar
1,285,"[pirates, of, the, caribbean:, at, world's, end]","[captain, barbossa,, long, believed, to, be, d...","[gen_adventure, gen_fantasy, gen_action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[ol_en],"[waltdisneypictures, jerrybruckheimerfilms, se...","[at, the, end, of, the, world,, the, adventure...","[johnnydepp, captainjacksparrow, orlandobloom,...",[goreverbinski],Pirates of the Caribbean: At World's End


## Conatinating all Columns into `tags`

In [287]:
movies8 = movies7.copy()

In [289]:
movies8["tags"] = movies8['title_x'] + movies8['overview'] + movies8['genres'] + movies8['keywords'] + movies8['original_language'] + movies8['production_companies'] + movies8['tagline'] + movies8['cast'] + movies8['crew']

In [290]:
movies8.head()

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew,title,tags
0,19995,[avatar],"[in, the, 22nd, century,, a, paraplegic, marin...","[gen_action, gen_adventure, gen_fantasy, gen_s...","[cultureclash, future, spacewar, spacecolony, ...",[ol_en],"[ingeniousfilmpartners, twentiethcenturyfoxfil...","[enter, the, world, of, pandora.]","[samworthington, jakesully, zoesaldana, neytir...",[jamescameron],Avatar,"[avatar, in, the, 22nd, century,, a, paraplegi..."
1,285,"[pirates, of, the, caribbean:, at, world's, end]","[captain, barbossa,, long, believed, to, be, d...","[gen_adventure, gen_fantasy, gen_action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[ol_en],"[waltdisneypictures, jerrybruckheimerfilms, se...","[at, the, end, of, the, world,, the, adventure...","[johnnydepp, captainjacksparrow, orlandobloom,...",[goreverbinski],Pirates of the Caribbean: At World's End,"[pirates, of, the, caribbean:, at, world's, en..."
2,206647,[spectre],"[a, cryptic, message, from, bond’s, past, send...","[gen_action, gen_adventure, gen_crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",[ol_en],"[columbiapictures, danjaq, b24]","[a, plan, no, one, escapes]","[danielcraig, jamesbond, christophwaltz, blofe...",[sammendes],Spectre,"[spectre, a, cryptic, message, from, bond’s, p..."
3,49026,"[the, dark, knight, rises]","[following, the, death, of, district, attorney...","[gen_action, gen_crime, gen_drama, gen_thriller]","[dccomics, crimefighter, terrorist, secretiden...",[ol_en],"[legendarypictures, warnerbros, dcentertainmen...","[the, legend, ends]","[christianbale, brucewayne/batman, michaelcain...",[christophernolan],The Dark Knight Rises,"[the, dark, knight, rises, following, the, dea..."
4,49529,"[john, carter]","[john, carter, is, a, war-weary,, former, mili...","[gen_action, gen_adventure, gen_sciencefiction]","[basedonnovel, mars, medallion, spacetravel, p...",[ol_en],[waltdisneypictures],"[lost, in, our, world,, found, in, another.]","[taylorkitsch, johncarter, lynncollins, dejaht...",[andrewstanton],John Carter,"[john, carter, john, carter, is, a, war-weary,..."


In [292]:
movies9 = pd.DataFrame(movies8[['id', 'title', 'tags']])

In [293]:
movies9.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[avatar, in, the, 22nd, century,, a, paraplegi..."
1,285,Pirates of the Caribbean: At World's End,"[pirates, of, the, caribbean:, at, world's, en..."
2,206647,Spectre,"[spectre, a, cryptic, message, from, bond’s, p..."
3,49026,The Dark Knight Rises,"[the, dark, knight, rises, following, the, dea..."
4,49529,John Carter,"[john, carter, john, carter, is, a, war-weary,..."


In [294]:
movies9.iloc[6].tags


['tangled',
 'when',
 'the',
 "kingdom's",
 'most',
 'wanted-and',
 'most',
 'charming-bandit',
 'flynn',
 'rider',
 'hides',
 'out',
 'in',
 'a',
 'mysterious',
 'tower,',
 "he's",
 'taken',
 'hostage',
 'by',
 'rapunzel,',
 'a',
 'beautiful',
 'and',
 'feisty',
 'tower-bound',
 'teen',
 'with',
 '70',
 'feet',
 'of',
 'magical,',
 'golden',
 'hair.',
 "flynn's",
 'curious',
 'captor,',
 "who's",
 'looking',
 'for',
 'her',
 'ticket',
 'out',
 'of',
 'the',
 'tower',
 'where',
 "she's",
 'been',
 'locked',
 'away',
 'for',
 'years,',
 'strikes',
 'a',
 'deal',
 'with',
 'the',
 'handsome',
 'thief',
 'and',
 'the',
 'unlikely',
 'duo',
 'sets',
 'off',
 'on',
 'an',
 'action-packed',
 'escapade,',
 'complete',
 'with',
 'a',
 'super-cop',
 'horse,',
 'an',
 'over-protective',
 'chameleon',
 'and',
 'a',
 'gruff',
 'gang',
 'of',
 'pub',
 'thugs.',
 'gen_animation',
 'gen_family',
 'hostage',
 'magic',
 'horse',
 'fairytale',
 'musical',
 'princess',
 'animation',
 'tower',
 'blondewom

## Converting `tags` into string

In [295]:
movies9['tags'] = movies9['tags'].apply(lambda x: ' '.join(x))

In [306]:
movies9.iloc[6].tags

"tangled when the kingdom's most wanted-and most charming-bandit flynn rider hides out in a mysterious tower he's taken hostage by rapunzel a beautiful and feisty tower-bound teen with 70 feet of magical golden hair flynn's curious captor who's looking for her ticket out of the tower where she's been locked away for years strikes a deal with the handsome thief and the unlikely duo sets off on an action-packed escapade complete with a super-cop horse an over-protective chameleon and a gruff gang of pub thugs gen_animation gen_family hostage magic horse fairytale musical princess animation tower blondewoman selfishness healingpower basedonfairytale duringcreditsstinger healinggift animalsidekick ol_en waltdisneypictures waltdisneyanimationstudios they're taking adventure to new lengths zacharylevi flynnridervoice mandymoore rapunzelvoice donnamurphy mothergothelvoice ronperlman stabbingtonbrothervoice m c gainey captainoftheguardvoice byronhoward nathangreno"

#### Notice
we can see thre are still some symbols like `(voice)` `.` now we have to handle those

In [305]:
def clean_text(text):
    for ch in ['(', ')', '.', ',']:     # remove these characters completely
        text = text.replace(ch, '')
    text = ' '.join(text.split())       # replace multiple spaces with single space because it joinning only words with single space
    return text

In [303]:
movies9['tags'] = movies9['tags'].apply(clean_text)

In [301]:
movies9.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"avatar in the 22nd century, a paraplegic marin..."
1,285,Pirates of the Caribbean: At World's End,pirates of the caribbean: at world's end capta...
2,206647,Spectre,spectre a cryptic message from bond’s past sen...
3,49026,The Dark Knight Rises,the dark knight rises following the death of d...
4,49529,John Carter,"john carter john carter is a war-weary, former..."


## Handling simmiler words like `loved, loving` to `love, love` 

# Vectorizing

In [307]:
movies9.shape

(4803, 3)

In [308]:
movies_vec = movies9.copy()

In [323]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')

In [327]:
# covert into vector
vector = cv.fit_transform(movies_vec['tags']).toarray()

In [319]:
vector.shape

(4803, 62215)

In [334]:
feature_name =cv.get_feature_names_out()
print(feature_name[:500])

['00' '000' '007' '05' '06' '07am' '10' '100' '1000' '10000' '100000'
 '1000000' '1001' '100bares' '101' '10191' '1019entertainment'
 '101ststreetfilms' '102' '108' '10th' '10thholeproductions'
 '10weststudios' '10yearoldzoe' '11' '11000' '114' '117' '118' '119'
 '11th' '12' '1200' '120dbfilms' '120films' '121208' '1215' '1250' '125th'
 '127' '12th' '12years' '13' '130' '1300' '1350'
 '13ghostsproductionscanadainc' '13th' '14' '140' '14000' '1408' '141'
 '142' '1429' '148000' '1492pictures' '14pm' '14th' '15' '150' '150th'
 '1520s' '1536' '15th' '15thcentury' '16' '1600s' '161' '1630s' '1644'
 '1681' '1691' '16blockproductions' '16th' '16thcentury' '17' '170000'
 '1700s' '173rd' '1748' '1776' '17th' '17thcentury' '17years' '18' '180'
 '1800' '1812' '1812productions' '1818' '1820' '1820s' '1821pictures'
 '1824' '1831' '1834' '1836' '1838' '1839' '1841' '1845' '1850' '1850s'
 '1856' '1857' '1860' '1862' '1863' '1870s' '1875' '1876' '1879' '188'
 '1880s' '1882' '1885' '1889' '1890' '1890s