In [1]:
import numpy as np
import pandas as pd
import copy
import ast

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv
/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv


<h1 align='center'>Data Preporccessing<h1>

### Step 1: import Datasets

In [2]:
df1 = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
df2 = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')
print(df1.shape)
print(df2.shape)

(4803, 20)
(4803, 4)


### Step 2: Combining Datasets

In [None]:
df1.head(1)

In [None]:
df2.head(1)

we have to convert `movie_id` to `id` before merging these 2 dataset

In [4]:
df2['id'] = df2['movie_id']

In [None]:
df2.head(1)

now combine these two dataset with respect to `id`

In [5]:
df = df1.merge(df2, on='id')
print(df.shape)
print(df.info())

(4803, 24)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status    

In [None]:
# run these one by one to see the data 
df.nunique()
df['status'].value_counts()

In [None]:
df.head(1)

### Step 3: Taking only relevelnt column

In [6]:
movies = df[['id','title_x','overview','genres',
             'keywords','original_language','production_companies',
             'tagline','cast','crew']]
movies2 = movies.copy()

In [7]:
movies2.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Handling missing values

`movies.isnull().sum()` will show the total number of null values in each column

In [97]:
movies2.isnull().sum()

id                        0
title_x                   0
overview                  0
genres                    0
keywords                  0
original_language         0
production_companies      0
tagline                 844
cast                      0
crew                      0
dtype: int64

#### `overview` column

In [98]:
movies2['overview'].isnull().sum()

0

In [8]:
movies2['overview'] = movies2['overview'].fillna(" ")

In [100]:
movies2['overview'].isnull().sum()

0

#### `tagline` column

In [101]:
movies2['tagline'].isnull().sum()

844

In [9]:
movies2['tagline'] = movies2['tagline'].fillna(" ")

In [103]:
movies2['tagline'].isnull().sum()

0

In [10]:
movies2.isnull().sum()

id                      0
title_x                 0
overview                0
genres                  0
keywords                0
original_language       0
production_companies    0
tagline                 0
cast                    0
crew                    0
dtype: int64

# Feature engeneering

In [11]:
# first make a copy
movies3 = movies2.copy()

In [105]:
movies2.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


#### Making a funtion to convert `genres`, `keywords`

In [12]:
def convert_genres(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(f"gen_{i['name']}")
    text = ", ".join(List)
    text = text.lower().replace(" ","").replace(","," ")
    return text

def convert_keywords(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    text = ", ".join(List)
    text = text.lower().replace(" ","").replace(",", " ")
    return text

In [13]:
movies3['genres'] = movies3['genres'].apply(convert_genres)
movies3['keywords'] = movies3['keywords'].apply(convert_keywords)

In [15]:
movies3.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Working on `original language`

In [16]:
movies3['original_language'] = "ol_" + movies3['original_language'].astype(str)

In [17]:
movies3.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Working on `production_companies`

In [18]:
movies4 = movies3.copy()

In [19]:
def convert_pc(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    text = ", ".join(List)
    text = text.lower().replace(" ","").replace(","," ").replace("."," ").replace("  "," ")
    return text

In [21]:
movies4['production_companies'] = movies4['production_companies'].apply(convert_pc)

SyntaxError: invalid syntax (<unknown>, line 1)

In [22]:
movies4.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,ingeniousfilmpartners twentiethcenturyfoxfilmc...,Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [23]:
# checking production companies
print(movies.iloc[3].title_x)
print(movies.iloc[3].production_companies)
print(movies4.iloc[3].title_x)
print(movies4.iloc[3].production_companies)
# so we will need all production companies name

The Dark Knight Rises
[{"name": "Legendary Pictures", "id": 923}, {"name": "Warner Bros.", "id": 6194}, {"name": "DC Entertainment", "id": 9993}, {"name": "Syncopy", "id": 9996}]
The Dark Knight Rises
legendarypictures warnerbros dcentertainment syncopy


## Working on `cast`

In [25]:
movies5 = movies4.copy()

In [166]:
movies5.head(2)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,ingeniousfilmpartners twentiethcenturyfoxfilmc...,Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",gen_adventure gen_fantasy gen_action,ocean drugabuse exoticisland eastindiatradingc...,ol_en,waltdisneypictures jerrybruckheimerfilms secon...,"At the end of the world, the adventure begins.","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [200]:
movies5.iloc[1].cast
# i need : character (first 5), name (first 5)

'[{"cast_id": 4, "character": "Captain Jack Sparrow", "credit_id": "52fe4232c3a36847f800b50d", "gender": 2, "id": 85, "name": "Johnny Depp", "order": 0}, {"cast_id": 5, "character": "Will Turner", "credit_id": "52fe4232c3a36847f800b511", "gender": 2, "id": 114, "name": "Orlando Bloom", "order": 1}, {"cast_id": 6, "character": "Elizabeth Swann", "credit_id": "52fe4232c3a36847f800b515", "gender": 1, "id": 116, "name": "Keira Knightley", "order": 2}, {"cast_id": 12, "character": "William \\"Bootstrap Bill\\" Turner", "credit_id": "52fe4232c3a36847f800b52d", "gender": 2, "id": 1640, "name": "Stellan Skarsg\\u00e5rd", "order": 3}, {"cast_id": 10, "character": "Captain Sao Feng", "credit_id": "52fe4232c3a36847f800b525", "gender": 2, "id": 1619, "name": "Chow Yun-fat", "order": 4}, {"cast_id": 9, "character": "Captain Davy Jones", "credit_id": "52fe4232c3a36847f800b521", "gender": 2, "id": 2440, "name": "Bill Nighy", "order": 5}, {"cast_id": 7, "character": "Captain Hector Barbossa", "credit_

In [27]:
def top_cast(obj):
    data = ast.literal_eval(obj)
    top5 = data[:5]
    top5_cast = [] 
    for member in top5:
        name = member['name'].replace(' ','').replace('"'," ").replace("."," ").replace("-","").lower()
        character = member['character'].replace(' ','').replace('"'," ").replace("."," ").replace("-","").lower()
        top5_cast.append(f"{name} {character}")
    return " ".join(top5_cast) 

In [28]:
movies5['cast'] = movies5['cast'].apply(top_cast)

In [242]:
movies5.iloc[1].cast

'johnnydepp captainjacksparrow orlandobloom willturner keiraknightley elizabethswann stellanskarsgård william bootstrapbill turner chowyunfat captainsaofeng'

## Working on `crew`

In [30]:
movies6 = movies5.copy()

In [31]:
movies6.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,ingeniousfilmpartners twentiethcenturyfoxfilmc...,Enter the World of Pandora.,samworthington jakesully zoesaldana neytiri si...,"[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [246]:
movies6.iloc[1].crew

'[{"credit_id": "52fe4232c3a36847f800b579", "department": "Camera", "gender": 2, "id": 120, "job": "Director of Photography", "name": "Dariusz Wolski"}, {"credit_id": "52fe4232c3a36847f800b4fd", "department": "Directing", "gender": 2, "id": 1704, "job": "Director", "name": "Gore Verbinski"}, {"credit_id": "52fe4232c3a36847f800b54f", "department": "Production", "gender": 2, "id": 770, "job": "Producer", "name": "Jerry Bruckheimer"}, {"credit_id": "52fe4232c3a36847f800b503", "department": "Writing", "gender": 2, "id": 1705, "job": "Screenplay", "name": "Ted Elliott"}, {"credit_id": "52fe4232c3a36847f800b509", "department": "Writing", "gender": 2, "id": 1706, "job": "Screenplay", "name": "Terry Rossio"}, {"credit_id": "52fe4232c3a36847f800b57f", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "52fe4232c3a36847f800b585", "department": "Editing", "gender": 2, "id": 1722, "job": "Editor", "name": "Craig Wood"}, {"credit_id": "52f

In [32]:
def fetch_director(obj):
    data = ast.literal_eval(obj)
    director = []
    for i in data:
        if i['job'] == 'Director':
            name = i['name'].replace(' ','').replace('"'," ").replace("."," ").replace("-","").lower()
            director.append(f"{name}")
    return " ".join(director)

In [34]:
movies6['crew'] = movies5['crew'].apply(fetch_director)

In [36]:
movies6.iloc[6].crew

'byronhoward nathangreno'

In [265]:
movies6.head(1)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",gen_action gen_adventure gen_fantasy gen_scien...,cultureclash future spacewar spacecolony socie...,ol_en,ingeniousfilmpartners twentiethcenturyfoxfilmc...,Enter the World of Pandora.,samworthington jakesully zoesaldana neytiri si...,jamescameron


## convert all column into list

In [39]:
movies7 = movies6.copy()

making a title for later

In [41]:
movies7['title'] = movies7['title_x']

In [42]:
columns = ['title_x','overview','genres','keywords',
           'original_language','production_companies',
           'tagline','cast','crew']
for i in columns:
    movies7[i] = movies7[i].apply(lambda x:x.lower().split())

In [286]:
movies7.head(2)

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew,title
0,19995,[avatar],"[in, the, 22nd, century,, a, paraplegic, marin...","[gen_action, gen_adventure, gen_fantasy, gen_s...","[cultureclash, future, spacewar, spacecolony, ...",[ol_en],"[ingeniousfilmpartners, twentiethcenturyfoxfil...","[enter, the, world, of, pandora.]","[samworthington, jakesully, zoesaldana, neytir...",[jamescameron],Avatar
1,285,"[pirates, of, the, caribbean:, at, world's, end]","[captain, barbossa,, long, believed, to, be, d...","[gen_adventure, gen_fantasy, gen_action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[ol_en],"[waltdisneypictures, jerrybruckheimerfilms, se...","[at, the, end, of, the, world,, the, adventure...","[johnnydepp, captainjacksparrow, orlandobloom,...",[goreverbinski],Pirates of the Caribbean: At World's End


## Conatinating all Columns into `tags`

In [43]:
movies8 = movies7.copy()

In [44]:
movies8["tags"] = movies8['title_x'] + movies8['overview'] + movies8['genres'] + movies8['keywords'] + movies8['original_language'] + movies8['production_companies'] + movies8['tagline'] + movies8['cast'] + movies8['crew']

In [290]:
movies8.head()

Unnamed: 0,id,title_x,overview,genres,keywords,original_language,production_companies,tagline,cast,crew,title,tags
0,19995,[avatar],"[in, the, 22nd, century,, a, paraplegic, marin...","[gen_action, gen_adventure, gen_fantasy, gen_s...","[cultureclash, future, spacewar, spacecolony, ...",[ol_en],"[ingeniousfilmpartners, twentiethcenturyfoxfil...","[enter, the, world, of, pandora.]","[samworthington, jakesully, zoesaldana, neytir...",[jamescameron],Avatar,"[avatar, in, the, 22nd, century,, a, paraplegi..."
1,285,"[pirates, of, the, caribbean:, at, world's, end]","[captain, barbossa,, long, believed, to, be, d...","[gen_adventure, gen_fantasy, gen_action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[ol_en],"[waltdisneypictures, jerrybruckheimerfilms, se...","[at, the, end, of, the, world,, the, adventure...","[johnnydepp, captainjacksparrow, orlandobloom,...",[goreverbinski],Pirates of the Caribbean: At World's End,"[pirates, of, the, caribbean:, at, world's, en..."
2,206647,[spectre],"[a, cryptic, message, from, bond’s, past, send...","[gen_action, gen_adventure, gen_crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",[ol_en],"[columbiapictures, danjaq, b24]","[a, plan, no, one, escapes]","[danielcraig, jamesbond, christophwaltz, blofe...",[sammendes],Spectre,"[spectre, a, cryptic, message, from, bond’s, p..."
3,49026,"[the, dark, knight, rises]","[following, the, death, of, district, attorney...","[gen_action, gen_crime, gen_drama, gen_thriller]","[dccomics, crimefighter, terrorist, secretiden...",[ol_en],"[legendarypictures, warnerbros, dcentertainmen...","[the, legend, ends]","[christianbale, brucewayne/batman, michaelcain...",[christophernolan],The Dark Knight Rises,"[the, dark, knight, rises, following, the, dea..."
4,49529,"[john, carter]","[john, carter, is, a, war-weary,, former, mili...","[gen_action, gen_adventure, gen_sciencefiction]","[basedonnovel, mars, medallion, spacetravel, p...",[ol_en],[waltdisneypictures],"[lost, in, our, world,, found, in, another.]","[taylorkitsch, johncarter, lynncollins, dejaht...",[andrewstanton],John Carter,"[john, carter, john, carter, is, a, war-weary,..."


In [45]:
movies9 = pd.DataFrame(movies8[['id', 'title', 'tags']])

In [293]:
movies9.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[avatar, in, the, 22nd, century,, a, paraplegi..."
1,285,Pirates of the Caribbean: At World's End,"[pirates, of, the, caribbean:, at, world's, en..."
2,206647,Spectre,"[spectre, a, cryptic, message, from, bond’s, p..."
3,49026,The Dark Knight Rises,"[the, dark, knight, rises, following, the, dea..."
4,49529,John Carter,"[john, carter, john, carter, is, a, war-weary,..."


In [294]:
movies9.iloc[6].tags


['tangled',
 'when',
 'the',
 "kingdom's",
 'most',
 'wanted-and',
 'most',
 'charming-bandit',
 'flynn',
 'rider',
 'hides',
 'out',
 'in',
 'a',
 'mysterious',
 'tower,',
 "he's",
 'taken',
 'hostage',
 'by',
 'rapunzel,',
 'a',
 'beautiful',
 'and',
 'feisty',
 'tower-bound',
 'teen',
 'with',
 '70',
 'feet',
 'of',
 'magical,',
 'golden',
 'hair.',
 "flynn's",
 'curious',
 'captor,',
 "who's",
 'looking',
 'for',
 'her',
 'ticket',
 'out',
 'of',
 'the',
 'tower',
 'where',
 "she's",
 'been',
 'locked',
 'away',
 'for',
 'years,',
 'strikes',
 'a',
 'deal',
 'with',
 'the',
 'handsome',
 'thief',
 'and',
 'the',
 'unlikely',
 'duo',
 'sets',
 'off',
 'on',
 'an',
 'action-packed',
 'escapade,',
 'complete',
 'with',
 'a',
 'super-cop',
 'horse,',
 'an',
 'over-protective',
 'chameleon',
 'and',
 'a',
 'gruff',
 'gang',
 'of',
 'pub',
 'thugs.',
 'gen_animation',
 'gen_family',
 'hostage',
 'magic',
 'horse',
 'fairytale',
 'musical',
 'princess',
 'animation',
 'tower',
 'blondewom

## Converting `tags` into string

In [46]:
movies9['tags'] = movies9['tags'].apply(lambda x: ' '.join(x))

In [53]:
movies9.iloc[6].tags

"tangled when the kingdom's most wantedand most charmingbandit flynn rider hides out in a mysterious tower he's taken hostage by rapunzel a beautiful and feisty towerbound teen with 70 feet of magical golden hair flynn's curious captor who's looking for her ticket out of the tower where she's been locked away for years strikes a deal with the handsome thief and the unlikely duo sets off on an actionpacked escapade complete with a supercop horse an overprotective chameleon and a gruff gang of pub thugs gen_animation gen_family hostage magic horse fairytale musical princess animation tower blondewoman selfishness healingpower basedonfairytale duringcreditsstinger healinggift animalsidekick ol_en waltdisneypictures waltdisneyanimationstudios they're taking adventure to new lengths zacharylevi flynnridervoice mandymoore rapunzelvoice donnamurphy mothergothelvoice ronperlman stabbingtonbrothervoice m c gainey captainoftheguardvoice byronhoward nathangreno"

#### Notice
we can see thre are still some symbols like `(voice)` `.` now we have to handle those

In [51]:
def clean_text(text):
    for ch in ['(', ')', '.', ',','-']:     # remove these characters completely
        text = text.replace(ch, '')
    text = ' '.join(text.split())       # replace multiple spaces with single space because it joinning only words with single space
    return text

In [52]:
movies9['tags'] = movies9['tags'].apply(clean_text)

In [301]:
movies9.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"avatar in the 22nd century, a paraplegic marin..."
1,285,Pirates of the Caribbean: At World's End,pirates of the caribbean: at world's end capta...
2,206647,Spectre,spectre a cryptic message from bond’s past sen...
3,49026,The Dark Knight Rises,the dark knight rises following the death of d...
4,49529,John Carter,"john carter john carter is a war-weary, former..."


## Handling simmiler words like `loved, loving` to `love, love` 

it will require `nltk` library from natural language proccessing. if you are running this notbook on your local computer, you may intsll it by `!pip install nltk`

In [56]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [63]:
def stem(text):
    a = []
    for i in text.split():
        a.append(ps.stem(i))
    return " ".join(a)

In [65]:
# just test of `stem`
print("tangled when the kingdom's most wantedand most charmingbandit flynn rider hides out in a mysterious tower he's taken hostage by rapunzel a beautiful and feisty towerbound teen with 70 feet of magical golden hair flynn's curious captor who's looking for her ticket out of the tower where she's been locked away for years strikes a deal with the handsome thief and the unlikely duo sets off on an actionpacked escapade complete with a supercop horse an overprotective chameleon and a gruff gang of pub thugs gen_animation gen_family hostage magic horse fairytale musical princess animation tower blondewoman selfishness healingpower basedonfairytale duringcreditsstinger healinggift animalsidekick ol_en waltdisneypictures waltdisneyanimationstudios they're taking adventure to new lengths zacharylevi flynnridervoice mandymoore rapunzelvoice donnamurphy mothergothelvoice ronperlman stabbingtonbrothervoice m c gainey captainoftheguardvoice byronhoward nathangreno\n"
)
stem("tangled when the kingdom's most wantedand most charmingbandit flynn rider hides out in a mysterious tower he's taken hostage by rapunzel a beautiful and feisty towerbound teen with 70 feet of magical golden hair flynn's curious captor who's looking for her ticket out of the tower where she's been locked away for years strikes a deal with the handsome thief and the unlikely duo sets off on an actionpacked escapade complete with a supercop horse an overprotective chameleon and a gruff gang of pub thugs gen_animation gen_family hostage magic horse fairytale musical princess animation tower blondewoman selfishness healingpower basedonfairytale duringcreditsstinger healinggift animalsidekick ol_en waltdisneypictures waltdisneyanimationstudios they're taking adventure to new lengths zacharylevi flynnridervoice mandymoore rapunzelvoice donnamurphy mothergothelvoice ronperlman stabbingtonbrothervoice m c gainey captainoftheguardvoice byronhoward nathangreno"
)

tangled when the kingdom's most wantedand most charmingbandit flynn rider hides out in a mysterious tower he's taken hostage by rapunzel a beautiful and feisty towerbound teen with 70 feet of magical golden hair flynn's curious captor who's looking for her ticket out of the tower where she's been locked away for years strikes a deal with the handsome thief and the unlikely duo sets off on an actionpacked escapade complete with a supercop horse an overprotective chameleon and a gruff gang of pub thugs gen_animation gen_family hostage magic horse fairytale musical princess animation tower blondewoman selfishness healingpower basedonfairytale duringcreditsstinger healinggift animalsidekick ol_en waltdisneypictures waltdisneyanimationstudios they're taking adventure to new lengths zacharylevi flynnridervoice mandymoore rapunzelvoice donnamurphy mothergothelvoice ronperlman stabbingtonbrothervoice m c gainey captainoftheguardvoice byronhoward nathangreno



"tangl when the kingdom' most wantedand most charmingbandit flynn rider hide out in a mysteri tower he' taken hostag by rapunzel a beauti and feisti towerbound teen with 70 feet of magic golden hair flynn' curiou captor who' look for her ticket out of the tower where she' been lock away for year strike a deal with the handsom thief and the unlik duo set off on an actionpack escapad complet with a supercop hors an overprotect chameleon and a gruff gang of pub thug gen_anim gen_famili hostag magic hors fairytal music princess anim tower blondewoman selfish healingpow basedonfairytal duringcreditssting healinggift animalsidekick ol_en waltdisneypictur waltdisneyanimationstudio they'r take adventur to new length zacharylevi flynnridervoic mandymoor rapunzelvoic donnamurphi mothergothelvoic ronperlman stabbingtonbrothervoic m c gainey captainoftheguardvoic byronhoward nathangreno"

#### now applyting `stem` funcitin to our current dataset's tags column

In [66]:
movies_stem = movies9.copy()

In [69]:
movies_stem['tags'] = movies_stem['tags'].apply(stem)

# Vectorizing

In [70]:
movies_stem.shape

(4803, 3)

In [78]:
vector = movies_stem.copy()

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')

In [80]:
# covert into vector
vector = cv.fit_transform(movies_stem['tags']).toarray()

In [82]:
print(vector.shape)

(4803, 57809)


In [83]:
# check first few words
feature_name =cv.get_feature_names_out()
print(feature_name[:500])

['00' '000' '007' '05' '06' '07am' '10' '100' '1000' '10000' '100000'
 '1000000' '1000foot' '1001' '100bare' '10191' '1019entertain'
 '101ststreetfilm' '101yearold' '102' '108yearold' '10round' '10th'
 '10thholeproduct' '10weststudio' '10year' '10yearold' '10yearoldzo' '11'
 '11000' '117' '118' '119' '11th' '11year' '11yearold' '12' '1200'
 '120dbfilm' '120film' '121208' '1214' '1215' '1250' '125th' '127'
 '12foot' '12hour' '12th' '12thcenturi' '12year' '12yearold' '13' '130'
 '1300' '1350' '13ghostsproductionscanadainc' '13th' '13year' '13yearold'
 '14' '140' '14000' '1408' '141' '142' '1429' '148000' '1492pictur'
 '14foot' '14pm' '14th' '14year' '14yearold' '15' '150' '150th' '1520'
 '1536' '15mile' '15th' '15thcenturi' '15year' '15yearold' '16' '1600'
 '161' '1630s' '16441911' '1681' '1691' '16blockproduct' '16th'
 '16thcenturi' '16yearold' '17' '1700' '170000' '1718' '173rd' '1748'
 '1776' '17th' '17thcenturi' '17year' '17yearold' '18' '180' '1800' '1812'
 '1812product' '1818' '182

# Making `cosine distance`

In [85]:
from sklearn.metrics.pairwise import cosine_similarity

In [88]:
similarity = cosine_similarity(vector)

In [87]:
cosine_similarity(vector).shape

(4803, 4803)

In [91]:
similarity[:3]

array([[1.        , 0.08695652, 0.06552976, ..., 0.05285626, 0.02675241,
        0.01428717],
       [0.08695652, 1.        , 0.04914732, ..., 0.05285626, 0.04012862,
        0.02857434],
       [0.06552976, 0.04914732, 1.        , ..., 0.02987405, 0.01512031,
        0.01615005]])

now as we got similarity, we have to sort it in descending way. sothat we can get the top 5 or 10 similar movies according to the similarity value. but the problem is if we sort it, their index value will change. so we have to apply `enumerate` so that each sorted value keeps their orignal index position. 

there are two different `id` and `index`. id is for fetching the thumbnail and other things from TMDB site. they are usefull. but we will not use it right now. we will work on `index` value

In [130]:
# testing
print("ID: ", movies9[movies9['title'] == 'Batman Begins']['id'].iloc[0])
print("Index: ", movies9[movies9['title'] == 'Batman Begins'].index[0])

ID:  272
Index:  119


`enumerate` creats an object for each value : `enumerate(similarity[0])` we will put it into a list : `list(enumerate(similarity[0]))`.
here inside `[]` we can provide index position of any movie. and can see it's similarities with other movies. this is a `list` of `tupples`

In [None]:
list(enumerate(similarity[1]))

#### 1:
now we can sort it. like: `sorted(list(enumerate(similarity[1])),reverse=True)` but it will show the last movie first. because we used reverse whitch thought we are sorting based on index. so we need to specify the sorting position.
current output:
```
[(4802, 0.028574344469642157),
 (4801, 0.0401286176952564),
...
```
#### 2:
if we think it as a csv, here first column `4882` is the last number of the index. so can tell that it sorted by first column `[0]` but we need to sort by column 2 `[1]`. to specify we will use a `lambda` function to give to set 2nd column as key. `sorted(list(enumerate(similarity[1])),reverse=True, key = lambda x:x[1])`

#### 3:
but it still have a problem. it show the relation with it also. which index is `[0]` but we need 2nd index `[1]`. and we will take first 10 similar movies, which index will be `[11]`. so we will put a positio `[1:11]`

In [142]:
# sorted(list(enumerate(similarity[1])),reverse=True)
# sorted(list(enumerate(similarity[0])),reverse=True, key = lambda x:x[1])
sorted(list(enumerate(similarity[0])),reverse=True, key = lambda x:x[1])[1:11]

[(2403, 0.2153527608232662),
 (1213, 0.20348923188911994),
 (778, 0.19332886183313755),
 (1086, 0.1930468356263361),
 (1914, 0.19206937701335358),
 (507, 0.18725147156828453),
 (539, 0.18363965490589892),
 (1201, 0.17518714874213512),
 (47, 0.16447838793172298),
 (3158, 0.15890330854706755)]

## Final function

now lets build the final functin that will do all the steps

In [155]:
def recommend(movie):
    movie_index = movies9[movies9['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True, key = lambda x:x[1])[1:11] # we can remove [0] as we did it n `distance`

    for i in movies_list:
        print(movies9.iloc[i[0]].title)

In [156]:
recommend('Avatar')

Aliens
Aliens vs Predator: Requiem
Meet Dave
Aliens in the Attic
Lifeforce
Independence Day
Titan A.E.
Predators
Star Trek Into Darkness
Alien


In [None]:
import pickle
pickle.dump()

In [2]:
movies9.head()

NameError: name 'movies9' is not defined