In [60]:
import os
path = os.getcwd()



In [61]:
import numpy as np
import pandas as pd
import ast
import re
import datetime

from nltk import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [62]:
import warnings
warnings.filterwarnings('ignore')

### Import Datasets

In [63]:
credits = pd.read_csv(path + r'\data\credits.csv')
keywords = pd.read_csv(path + r'\data\keywords.csv')
links = pd.read_csv(path + r'\data\links.csv')
movies_metadata = pd.read_csv(path + r'\data\movies_metadata.csv')

In [64]:
# credits[['cast','crew','id']]
# keywords[['id','keywords']]
# links[['movieId', 'imdbId', 'tmdbId']]
# movies_metadata[['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
#        'imdb_id', 'original_language', 'original_title', 'overview',
#        'popularity', 'poster_path', 'production_companies',
#        'production_countries', 'release_date', 'revenue', 'runtime',
#        'spoken_languages', 'status', 'tagline', 'title', 'video',
#        'vote_average', 'vote_count']]
# ratings[['userId', 'movieId', 'rating', 'timestamp']]

### Data Overview

##### Movies Metadata

In [65]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [66]:
# ['id','title','genres','overview']

In [67]:
movies_metadata['belongs_to_collection'][0]

"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}"

In [68]:
movies_metadata['belongs_to_collection'].isnull().sum()

40972

In [69]:
movies_metadata['popularity']

0        21.946943
1        17.015539
2          11.7129
3         3.859495
4         8.387519
           ...    
45461     0.072051
45462     0.178241
45463     0.903007
45464     0.003503
45465     0.163015
Name: popularity, Length: 45466, dtype: object

In [70]:
movies_metadata['tagline'].head()

0                                                  NaN
1            Roll the dice and unleash the excitement!
2    Still Yelling. Still Fighting. Still Ready for...
3    Friends are the people who let you be yourself...
4    Just When His World Is Back To Normal... He's ...
Name: tagline, dtype: object

In [71]:
movies_metadata['video'].head()

0    False
1    False
2    False
3    False
4    False
Name: video, dtype: object

In [72]:
movies_metadata['video'].unique()

array([False, True, nan], dtype=object)

In [73]:
movies_metadata['vote_average'].isnull().sum()

6

In [74]:
movies_metadata['adult'].value_counts()

adult
False                                                                                                                             45454
True                                                                                                                                  9
 - Written by Ørnås                                                                                                                   1
 Rune Balot goes to a casino connected to the October corporation to try to wrap up her case once and for all.                        1
 Avalanche Sharks tells the story of a bikini contest that turns into a horrifying affair when it is hit by a shark avalanche.        1
Name: count, dtype: int64

In [75]:
movies_metadata[movies_metadata['adult'].isin(['False','True'])].shape

(45463, 24)

In [76]:
movies_metadata[~ movies_metadata['adult'].isin(['False','True'])].index

Index([19730, 29503, 35587], dtype='int64')

### Data Preprocessing

##### Movies Metadata

In [77]:
movies_metadata.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [78]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [79]:
movies_metadata['original_language'].unique()

array(['en', 'fr', 'zh', 'it', 'fa', 'nl', 'de', 'cn', 'ar', 'es', 'ru',
       'sv', 'ja', 'ko', 'sr', 'bn', 'he', 'pt', 'wo', 'ro', 'hu', 'cy',
       'vi', 'cs', 'da', 'no', 'nb', 'pl', 'el', 'sh', 'xx', 'mk', 'bo',
       'ca', 'fi', 'th', 'sk', 'bs', 'hi', 'tr', 'is', 'ps', 'ab', 'eo',
       'ka', 'mn', 'bm', 'zu', 'uk', 'af', 'la', 'et', 'ku', 'fy', 'lv',
       'ta', 'sl', 'tl', 'ur', 'rw', 'id', 'bg', 'mr', 'lt', 'kk', 'ms',
       'sq', nan, '104.0', 'qu', 'te', 'am', 'jv', 'tg', 'ml', 'hr', 'lo',
       'ay', 'kn', 'eu', 'ne', 'pa', 'ky', 'gl', '68.0', 'uz', 'sm', 'mt',
       '82.0', 'hy', 'iu', 'lb', 'si'], dtype=object)

In [80]:
movies_metadata['original_language'].value_counts()

original_language
en       32269
fr        2438
it        1529
ja        1350
de        1080
         ...  
zu           1
qu           1
104.0        1
la           1
si           1
Name: count, Length: 92, dtype: int64

In [81]:
movies_metadata = movies_metadata[movies_metadata['original_language'].isin(['en','hi','bn','gu','ml','mr','pa','ta','te','ur'])]
# movies_metadata = movies_metadata[movies_metadata['original_language'] == 'hi']

In [82]:
movies_metadata['release_date'].fillna(0 , inplace=True)

In [83]:
def convert(date_str):
    try:
        return pd.to_datetime(date_str)
    except (ValueError, TypeError):
        return pd.NaT

In [84]:
movies_metadata['release_date'] = movies_metadata['release_date'].apply(convert)

In [85]:
movies_metadata = movies_metadata[~ movies_metadata['release_date'].isna()]

In [86]:
movies_metadata = movies_metadata[movies_metadata['release_date']>'1990-01-01']

In [87]:
movies_metadata = movies_metadata.reset_index(drop=True)

In [88]:
#####################################################################################

In [89]:
eng = movies_metadata[movies_metadata['original_language'] == 'en']

In [90]:
index = eng[eng['release_date'] <= '2005-01-01'].index

In [91]:
filtered_movies = movies_metadata.drop(index , axis=0)

In [92]:
filtered_movies.shape

(14132, 24)

In [93]:
filtered_movies = filtered_movies[(filtered_movies['runtime']>=100) & (filtered_movies['vote_average']>=5)]

In [94]:
filtered_movies.shape

(4107, 24)

In [95]:
#############################################################################################

In [96]:
# Taking the required columns only for our analysis

In [97]:
movies = filtered_movies[['id','title','genres','overview','vote_average']].reset_index(drop=True)

In [98]:
movies.head()

Unnamed: 0,id,title,genres,overview,vote_average
0,79782,Venice,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",An atmospheric coming-of-age story featuring a...,7.5
1,480,Monsoon Wedding,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",From an exciting Indian wedding comes a relati...,6.8
2,4435,Asoka,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...","In India, about 260 BC in the Empire of Magadh...",5.8
3,7504,Earth,"[{'id': 18, 'name': 'Drama'}]",It's 1947 and the borderlines between India an...,6.6
4,19666,Lagaan: Once Upon a Time in India,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",Lagaan tells the tale of the Indian village Ch...,7.2


In [99]:
movies.isnull().sum()

id              0
title           0
genres          0
overview        2
vote_average    0
dtype: int64

In [100]:
movies = movies[~ movies['title'].isnull()]

In [101]:
movies.shape

(4107, 5)

In [102]:
duplicates = movies['title'].duplicated()

In [103]:
movies = movies[~ duplicates]

In [104]:
movies['id'].duplicated().sum()

0

In [105]:
movies.shape

(4076, 5)

In [106]:
movies['overview']

0       An atmospheric coming-of-age story featuring a...
1       From an exciting Indian wedding comes a relati...
2       In India, about 260 BC in the Empire of Magadh...
3       It's 1947 and the borderlines between India an...
4       Lagaan tells the tale of the Indian village Ch...
                              ...                        
4101    A sensitive university student unraveling whil...
4102    A man is forced to marry a tree to ward off il...
4104    A woman and a young girl in different cities a...
4105    Are we prepared for dealing with the prospect ...
4106    The bliss of a biology teacher’s family life i...
Name: overview, Length: 4076, dtype: object

In [107]:
movies['genres'][0]

"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]"

In [108]:
def genre_name(items):
    items = ast.literal_eval(items)
    g_list = []
    for i in items:
        g_list.append(i['name'])
    return g_list

genre_name(movies['genres'][0])


['Drama', 'Romance']

In [109]:
movies['genres'] = movies['genres'].apply(genre_name)

In [110]:
movies['overview'] = movies['overview'].apply(lambda x : str(x).split())

In [111]:
movies.head()

Unnamed: 0,id,title,genres,overview,vote_average
0,79782,Venice,"[Drama, Romance]","[An, atmospheric, coming-of-age, story, featur...",7.5
1,480,Monsoon Wedding,"[Comedy, Drama, Romance]","[From, an, exciting, Indian, wedding, comes, a...",6.8
2,4435,Asoka,"[Drama, History]","[In, India,, about, 260, BC, in, the, Empire, ...",5.8
3,7504,Earth,[Drama],"[It's, 1947, and, the, borderlines, between, I...",6.6
4,19666,Lagaan: Once Upon a Time in India,"[Adventure, Drama, Music, Romance]","[Lagaan, tells, the, tale, of, the, Indian, vi...",7.2


##### Credits

In [112]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [113]:
credits.isnull().sum()

cast    0
crew    0
id      0
dtype: int64

In [114]:
credits['id'].duplicated().sum()

44

In [115]:
credits = credits[~ credits['id'].duplicated()]

In [116]:
# credits['cast'][0]

In [117]:
def cast_name(items):
    items = ast.literal_eval(items)
    cast_list = []
    count = 0
    for i in items:
        if count != 4:
            cast_list.append(i['name'])
            count += 1
        else:
            break
    return cast_list

cast_name(credits['cast'][0])

['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim Varney']

In [118]:
credits['cast'] = credits['cast'].apply(cast_name)

In [119]:
# credits['crew'][0]

In [120]:
def director_name(items):
    items = ast.literal_eval(items)
    director_list = []
    for i in items:
        if i['job'] == 'Director':
            director_list.append(i['name'])
        else:
            break
    return director_list

director_name(credits['crew'][0])

['John Lasseter']

In [121]:
credits['crew'] = credits['crew'].apply(director_name)

In [122]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney]",[John Lasseter],862
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",[],8844
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[Howard Deutch],15602
3,"[Whitney Houston, Angela Bassett, Loretta Devi...",[Forest Whitaker],31357
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...",[],11862


##### Keywords

In [123]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [124]:
keywords['keywords'][0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [125]:
keywords['id'].duplicated().sum()

987

In [126]:
keywords.drop_duplicates(subset='id',inplace=True)

In [127]:
def keyword_name(items):
    items = ast.literal_eval(items)
    key_list = []
    for i in items:
        key_list.append(i['name'])
    return key_list

keyword_name(keywords['keywords'][0])

['jealousy',
 'toy',
 'boy',
 'friendship',
 'friends',
 'rivalry',
 'boy next door',
 'new toy',
 'toy comes to life']

In [128]:
keywords['keywords'] = keywords['keywords'].apply(keyword_name)

In [129]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."


##### Links

In [130]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [131]:
links = links[['tmdbId','imdbId']]

In [132]:
links.columns = ['id','imdbid']

In [133]:
links.head()

Unnamed: 0,id,imdbid
0,862.0,114709
1,8844.0,113497
2,15602.0,113228
3,31357.0,114885
4,11862.0,113041


In [134]:
links.isnull().sum()

id        219
imdbid      0
dtype: int64

In [135]:
links.dropna(inplace = True)

In [136]:
links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45624 entries, 0 to 45842
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      45624 non-null  float64
 1   imdbid  45624 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 1.0 MB


In [137]:
links['id'] = links['id'].astype(int)

In [138]:
links.head()

Unnamed: 0,id,imdbid
0,862,114709
1,8844,113497
2,15602,113228
3,31357,114885
4,11862,113041


In [139]:
links['id'].duplicated().sum()

30

In [140]:
links.drop_duplicates(subset = 'id' , inplace = True)

In [141]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4076 entries, 0 to 4106
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            4076 non-null   object 
 1   title         4076 non-null   object 
 2   genres        4076 non-null   object 
 3   overview      4076 non-null   object 
 4   vote_average  4076 non-null   float64
dtypes: float64(1), object(4)
memory usage: 320.1+ KB


In [142]:
movies['id'] = movies['id'].astype(int)

In [143]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4076 entries, 0 to 4106
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            4076 non-null   int32  
 1   title         4076 non-null   object 
 2   genres        4076 non-null   object 
 3   overview      4076 non-null   object 
 4   vote_average  4076 non-null   float64
dtypes: float64(1), int32(1), object(3)
memory usage: 304.2+ KB


In [144]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45432 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45432 non-null  object
 1   crew    45432 non-null  object
 2   id      45432 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.4+ MB


In [145]:
movies.shape

(4076, 5)

In [146]:
data = pd.merge(movies,credits,on='id')

In [147]:
data.shape

(4076, 7)

In [148]:
data = data.merge(keywords , on='id')

In [149]:
data.shape

(4076, 8)

In [150]:
data = data.merge(links , on='id')

In [151]:
data.shape

(4076, 9)

In [152]:
#

In [153]:
data.head()

Unnamed: 0,id,title,genres,overview,vote_average,cast,crew,keywords,imdbid
0,79782,Venice,"[Drama, Romance]","[An, atmospheric, coming-of-age, story, featur...",7.5,"[Marcin Walewski, Magdalena Cielecka, Mariusz ...",[Jan Jakub Kolski],[],105729
1,480,Monsoon Wedding,"[Comedy, Drama, Romance]","[From, an, exciting, Indian, wedding, comes, a...",6.8,"[Naseeruddin Shah, Lillete Dubey, Vijay Raaz, ...",[Mira Nair],"[child abuse, adultery, arranged marriage, bol...",265343
2,4435,Asoka,"[Drama, History]","[In, India,, about, 260, BC, in, the, Empire, ...",5.8,"[Shah Rukh Khan, Kareena Kapoor, Ajith Kumar, ...",[],"[buddhism, emperor, commander, reue, india, an...",249371
3,7504,Earth,[Drama],"[It's, 1947, and, the, borderlines, between, I...",6.6,"[Aamir Khan, Nandita Das, Maia Sethna, Kitu Gi...",[Deepa Mehta],"[based on novel, war of independence, period d...",150433
4,19666,Lagaan: Once Upon a Time in India,"[Adventure, Drama, Music, Romance]","[Lagaan, tells, the, tale, of, the, Indian, vi...",7.2,"[Aamir Khan, Gracy Singh, Rachel Shelley, Paul...",[],"[sport, british, bollywood, arrogance, based o...",282674


In [154]:
data = data[['id','title','genres','cast','crew','keywords','overview','vote_average','imdbid']]

In [155]:
data.head()

Unnamed: 0,id,title,genres,cast,crew,keywords,overview,vote_average,imdbid
0,79782,Venice,"[Drama, Romance]","[Marcin Walewski, Magdalena Cielecka, Mariusz ...",[Jan Jakub Kolski],[],"[An, atmospheric, coming-of-age, story, featur...",7.5,105729
1,480,Monsoon Wedding,"[Comedy, Drama, Romance]","[Naseeruddin Shah, Lillete Dubey, Vijay Raaz, ...",[Mira Nair],"[child abuse, adultery, arranged marriage, bol...","[From, an, exciting, Indian, wedding, comes, a...",6.8,265343
2,4435,Asoka,"[Drama, History]","[Shah Rukh Khan, Kareena Kapoor, Ajith Kumar, ...",[],"[buddhism, emperor, commander, reue, india, an...","[In, India,, about, 260, BC, in, the, Empire, ...",5.8,249371
3,7504,Earth,[Drama],"[Aamir Khan, Nandita Das, Maia Sethna, Kitu Gi...",[Deepa Mehta],"[based on novel, war of independence, period d...","[It's, 1947, and, the, borderlines, between, I...",6.6,150433
4,19666,Lagaan: Once Upon a Time in India,"[Adventure, Drama, Music, Romance]","[Aamir Khan, Gracy Singh, Rachel Shelley, Paul...",[],"[sport, british, bollywood, arrogance, based o...","[Lagaan, tells, the, tale, of, the, Indian, vi...",7.2,282674


In [156]:
data.isnull().sum()

id              0
title           0
genres          0
cast            0
crew            0
keywords        0
overview        0
vote_average    0
imdbid          0
dtype: int64

In [157]:
app_data = data.copy()

In [158]:
# app_data.to_csv('data.csv')

In [159]:
data['genres'] = data['genres'].apply(lambda x : [y.replace(' ','') for y in x])

In [160]:
data['cast'] = data['cast'].apply(lambda x : [y.replace(' ','') for y in x])

In [161]:
data['crew'] = data['crew'].apply(lambda x : [y.replace(' ','') for y in x])

In [162]:
data['keywords'] = data['keywords'].apply(lambda x : [y.replace(' ','') for y in x])

In [163]:
data.head()

Unnamed: 0,id,title,genres,cast,crew,keywords,overview,vote_average,imdbid
0,79782,Venice,"[Drama, Romance]","[MarcinWalewski, MagdalenaCielecka, MariuszBon...",[JanJakubKolski],[],"[An, atmospheric, coming-of-age, story, featur...",7.5,105729
1,480,Monsoon Wedding,"[Comedy, Drama, Romance]","[NaseeruddinShah, LilleteDubey, VijayRaaz, Til...",[MiraNair],"[childabuse, adultery, arrangedmarriage, bolly...","[From, an, exciting, Indian, wedding, comes, a...",6.8,265343
2,4435,Asoka,"[Drama, History]","[ShahRukhKhan, KareenaKapoor, AjithKumar, Dann...",[],"[buddhism, emperor, commander, reue, india, an...","[In, India,, about, 260, BC, in, the, Empire, ...",5.8,249371
3,7504,Earth,[Drama],"[AamirKhan, NanditaDas, MaiaSethna, KituGidwani]",[DeepaMehta],"[basedonnovel, warofindependence, perioddrama,...","[It's, 1947, and, the, borderlines, between, I...",6.6,150433
4,19666,Lagaan: Once Upon a Time in India,"[Adventure, Drama, Music, Romance]","[AamirKhan, GracySingh, RachelShelley, PaulBla...",[],"[sport, british, bollywood, arrogance, basedon...","[Lagaan, tells, the, tale, of, the, Indian, vi...",7.2,282674


In [164]:
data['tokens'] = data['cast'] + data['crew'] + data['genres'] + data['keywords'] + data['overview']

In [165]:
data.head()

Unnamed: 0,id,title,genres,cast,crew,keywords,overview,vote_average,imdbid,tokens
0,79782,Venice,"[Drama, Romance]","[MarcinWalewski, MagdalenaCielecka, MariuszBon...",[JanJakubKolski],[],"[An, atmospheric, coming-of-age, story, featur...",7.5,105729,"[MarcinWalewski, MagdalenaCielecka, MariuszBon..."
1,480,Monsoon Wedding,"[Comedy, Drama, Romance]","[NaseeruddinShah, LilleteDubey, VijayRaaz, Til...",[MiraNair],"[childabuse, adultery, arrangedmarriage, bolly...","[From, an, exciting, Indian, wedding, comes, a...",6.8,265343,"[NaseeruddinShah, LilleteDubey, VijayRaaz, Til..."
2,4435,Asoka,"[Drama, History]","[ShahRukhKhan, KareenaKapoor, AjithKumar, Dann...",[],"[buddhism, emperor, commander, reue, india, an...","[In, India,, about, 260, BC, in, the, Empire, ...",5.8,249371,"[ShahRukhKhan, KareenaKapoor, AjithKumar, Dann..."
3,7504,Earth,[Drama],"[AamirKhan, NanditaDas, MaiaSethna, KituGidwani]",[DeepaMehta],"[basedonnovel, warofindependence, perioddrama,...","[It's, 1947, and, the, borderlines, between, I...",6.6,150433,"[AamirKhan, NanditaDas, MaiaSethna, KituGidwan..."
4,19666,Lagaan: Once Upon a Time in India,"[Adventure, Drama, Music, Romance]","[AamirKhan, GracySingh, RachelShelley, PaulBla...",[],"[sport, british, bollywood, arrogance, basedon...","[Lagaan, tells, the, tale, of, the, Indian, vi...",7.2,282674,"[AamirKhan, GracySingh, RachelShelley, PaulBla..."


In [166]:
data['tokens'] = data['tokens'].apply(lambda x: ' '.join(x))

In [167]:
data['tokens'] = data['tokens'].apply(lambda x: x.lower())

In [168]:
data['tokens'][45]

"ewanmcgregor natalieportman haydenchristensen ianmcdiarmid sciencefiction adventure action showdown deathstar vision cultfigure hatred dreamsequence expectantmother spaceopera chancel childbirth galacticwar years after the onset of the clone wars, the noble jedi knights lead a massive clone army into a galaxy-wide battle against the separatists. when the sinister sith unveil a thousand-year-old plot to rule the galaxy, the republic crumbles and from its ashes rises the evil galactic empire. jedi hero anakin skywalker is seduced by the dark side of the force to become the emperor's new apprentice – darth vader. the jedi are decimated, as obi-wan kenobi and jedi master yoda are forced into hiding. the only hope for the galaxy are anakin's own offspring – the twin children born in secrecy who will grow up to become heroes."

In [169]:
X = data['tokens']

In [170]:
X.head()

0    marcinwalewski magdalenacielecka mariuszbonasz...
1    naseeruddinshah lilletedubey vijayraaz tillota...
2    shahrukhkhan kareenakapoor ajithkumar dannyden...
3    aamirkhan nanditadas maiasethna kitugidwani de...
4    aamirkhan gracysingh rachelshelley paulblackth...
Name: tokens, dtype: object

In [171]:
def clean_text(token):
    
    token = re.sub(r'\[[0-9]*\]', ' ',token)
    token = re.sub(r'\s+', ' ', token)
    token = re.sub('[^a-zA-Z]', ' ', token )
    token = re.sub(r'\s+', ' ', token)
    
    return token

In [172]:
X = X.apply(clean_text)

In [173]:
word = word_tokenize
stemmer = PorterStemmer()

In [174]:
def stem(token):
    token = word(token)
    token = [stemmer.stem(x) for x in token]
    return ' '.join(token)

In [175]:
import nltk
# nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [176]:
X = X.apply(stem)

#### Vectorization

In [177]:
tfidf = TfidfVectorizer(max_features=10000 , stop_words='english')

In [178]:
vectors = tfidf.fit_transform(X).toarray()

In [179]:
vectors.shape

(4076, 10000)

In [180]:
similarity = cosine_similarity(vectors)

In [188]:
similarity.shape

(4076, 4076)

In [181]:
def recommender(name):

    index = data[data['title'] == name].index[0]
    top10 = sorted(enumerate(similarity[index]) , reverse=True , key= lambda x : x[1])[1:11]
    
    for i in top10:
        movie = data.iloc[i[0]].title
        print(movie)

In [182]:
def movie(name):
    
    name = name.lower()
    new_title = [re.sub(r'[^\w\s]', '', title.lower()) for title in data['title']]

    if name in new_title:
        index = new_title.index(name)
        recommender(data.iloc[index].title)
    else:
        print(f'No movie available for "{name}"')

In [183]:
movie('thor')

Thor: The Dark World
Avengers: Age of Ultron
Doctor Strange
The Avengers
Kon-Tiki
Jack the Giant Slayer
Man of Steel
Guardians of the Galaxy
Hellboy II: The Golden Army
Iron Man 2


In [184]:
movie('dilwale')

Dor
Har Dil Jo Pyar Karega...
unINDIAN
Dilwale Dulhania Le Jayenge
Indian
Baghban
Kismat Konnection
Mujhse Dosti Karoge!
Chori Chori Chupke Chupke
Heropanti


In [185]:
data_dict = app_data.to_dict()

In [186]:
pickle.dump(data_dict, open('deployment/movie_dict.pkl', 'wb'))

In [187]:
pickle.dump(similarity, open('deployment/similarity.pkl', 'wb'))