In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/Users/abhijitdeshpande/Documents/ML Projects/Movie Recommendation/movie_metadata.csv')

In [3]:
df.isnull().sum()

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

In [4]:
df['title_year'].value_counts().sort_index()

1916.0      1
1920.0      1
1925.0      1
1927.0      1
1929.0      2
         ... 
2012.0    221
2013.0    237
2014.0    252
2015.0    226
2016.0    106
Name: title_year, Length: 91, dtype: int64

In [5]:
data = df[['director_name','actor_1_name','actor_2_name','actor_3_name',
           'genres','movie_title','title_year','plot_keywords']]

In [6]:
print(data.isnull().sum())
data.replace(np.nan,'', inplace=True)

director_name    104
actor_1_name       7
actor_2_name      13
actor_3_name      23
genres             0
movie_title        0
title_year       108
plot_keywords    153
dtype: int64


In [7]:
data['genres'] = data['genres'].apply(lambda x:x.replace('|',' '))
data['plot_keywords'] = data['plot_keywords'].apply(lambda x:x.replace('|',' '))

In [8]:
data['movie_title'] = data['movie_title'].apply(lambda x: x.lower())
data['movie_title'] = data['movie_title'].apply(lambda x: x[:-1])

In [9]:
meta = pd.read_csv('/Users/abhijitdeshpande/Documents/ML Projects/Movie Recommendation/movies_metadata.csv')
credits = pd.read_csv('/Users/abhijitdeshpande/Documents/ML Projects/Movie Recommendation/credits.csv')

In [10]:
meta.info()
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [19]:
meta['release_date'] = pd.to_datetime(meta['release_date'], errors = 'coerce')

In [22]:
meta['year'] = meta['release_date'].dt.year

In [23]:
(meta['year'].value_counts().sort_index())

1874.0       1
1878.0       1
1883.0       1
1887.0       1
1888.0       2
          ... 
2015.0    1905
2016.0    1604
2017.0     532
2018.0       5
2020.0       1
Name: year, Length: 135, dtype: int64

In [24]:
meta['overview'].iloc[0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [25]:
meta.rename(columns={'overview':'plot_keywords'},inplace=True)

In [26]:
new_meta = meta.loc[meta.year==2017,['id','genres','title','year','plot_keywords']]

In [27]:
new_meta['id'] = new_meta['id'].astype(int)

In [28]:
data_1 = pd.merge(new_meta,credits,on='id')

In [29]:
data_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 531 entries, 0 to 530
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             531 non-null    int64  
 1   genres         531 non-null    object 
 2   title          531 non-null    object 
 3   year           531 non-null    float64
 4   plot_keywords  519 non-null    object 
 5   cast           531 non-null    object 
 6   crew           531 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 33.2+ KB


In [30]:
data_1.rename(columns={'title':'movie_title','year':'title_year','overview':''},inplace=True)

In [32]:
import ast
data_1['genres'] = data_1['genres'].apply(lambda x:ast.literal_eval(x))
data_1['crew'] = data_1['crew'].apply(lambda x:ast.literal_eval(x))
data_1['cast'] = data_1['cast'].apply(lambda x: ast.literal_eval(x))

In [33]:
def genre(x):
    gen = []
    srt = ' '
    for i in x:
        gen.append(i.get('name'))
    return srt.join(gen)

def actor(x,k):
    gen = []
    for i in x:
        gen.append(i.get('name'))
    if gen == [] or len(gen)<=k:
        return np.nan
    else:
        return gen[k]

def directors(x):
    gen = []
    for i in x:
        if i.get('job')=='Director':
            gen.append(i.get('name'))
    return ' '.join(gen)

In [34]:
data_1['genres'] = data_1['genres'].apply(lambda x: genre(x))
data_1['actor_1_name'] = data_1['cast'].apply(lambda x: actor(x,0))
data_1['actor_2_name'] = data_1['cast'].apply(lambda x: actor(x,1))
data_1['actor_3_name'] = data_1['cast'].apply(lambda x: actor(x,2))
data_1['director_name'] = data_1['crew'].apply(lambda x: directors(x))
data_1['movie_title'] = data_1['movie_title'].apply(lambda x:x.lower())

In [35]:
data_2 = data_1[['genres','actor_1_name','actor_2_name','actor_3_name','director_name','movie_title','title_year','plot_keywords']]

In [38]:
data['Comb'] = data['director_name']+' '+data['actor_1_name']+' '+data['actor_2_name']+' '+data['actor_3_name']+' '+data['genres']+' '+data['movie_title']+' '+data['plot_keywords']

In [39]:
data_2['Comb'] = data_2['director_name']+' '+data_2['actor_1_name']+' '+data_2['actor_2_name']+' '+data_2['actor_3_name']+' '+data_2['genres']+' '+data_2['movie_title']+''+data_2['plot_keywords']

In [40]:
data_2.isnull().sum()

genres            0
actor_1_name     22
actor_2_name     55
actor_3_name     70
director_name     0
movie_title       0
title_year        0
plot_keywords    12
Comb             80
dtype: int64

In [41]:
data_2.dropna(how='any',inplace=True)

In [42]:
movie = data.append(data_2)

In [43]:
movie.shape,data.shape

((5494, 9), (5043, 9))

In [44]:
movie.drop_duplicates(subset='movie_title',keep='last',inplace=True)

In [45]:
movie['Comb'].iloc[4230]

'Vera Farmiga Donna Murphy Bill Irwin Michael Chernus Drama higher ground 1960s belief in the afterlife domestic violence faith minister'

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer()

In [47]:
matrix = tfid.fit_transform(movie['Comb'])

In [48]:
matrix.shape

(5357, 20235)

In [49]:
from sklearn.metrics.pairwise import linear_kernel

In [50]:
cosine_similarity = linear_kernel(matrix,matrix)

In [51]:
indices = pd.Series(movie.index,index=movie['movie_title'])

In [52]:
import re

In [53]:
movie[['Comb']]

Unnamed: 0,Comb
0,James Cameron CCH Pounder Joel David Moore Wes...
1,Gore Verbinski Johnny Depp Orlando Bloom Jack ...
2,Sam Mendes Christoph Waltz Rory Kinnear Stepha...
3,Christopher Nolan Tom Hardy Christian Bale Jos...
4,Doug Walker Doug Walker Rob Walker Documentar...
...,...
523,Anshai Lal Anushka Sharma Diljit Dosanjh Suraj...
524,Jim Strouse Jessica Williams Chris O'Dowd Keit...
525,Farhad Mann Adelaide Kane Benjamin Hollingswor...
527,Jonathan A. Rosenbaum Lou Diamond Phillips Wal...


In [54]:
meta['original_language'].unique()

array(['en', 'fr', 'zh', 'it', 'fa', 'nl', 'de', 'cn', 'ar', 'es', 'ru',
       'sv', 'ja', 'ko', 'sr', 'bn', 'he', 'pt', 'wo', 'ro', 'hu', 'cy',
       'vi', 'cs', 'da', 'no', 'nb', 'pl', 'el', 'sh', 'xx', 'mk', 'bo',
       'ca', 'fi', 'th', 'sk', 'bs', 'hi', 'tr', 'is', 'ps', 'ab', 'eo',
       'ka', 'mn', 'bm', 'zu', 'uk', 'af', 'la', 'et', 'ku', 'fy', 'lv',
       'ta', 'sl', 'tl', 'ur', 'rw', 'id', 'bg', 'mr', 'lt', 'kk', 'ms',
       'sq', nan, '104.0', 'qu', 'te', 'am', 'jv', 'tg', 'ml', 'hr', 'lo',
       'ay', 'kn', 'eu', 'ne', 'pa', 'ky', 'gl', '68.0', 'uz', 'sm', 'mt',
       '82.0', 'hy', 'iu', 'lb', 'si'], dtype=object)

In [58]:
def recommend(title,cosine_similarity=cosine_similarity):
    try:
        idx = indices[title.lower()]
        sim_score = list(enumerate(cosine_similarity[idx]))
        sim_score = sorted(sim_score, key=lambda x:x[1], reverse=True)
        sim_score = sim_score[1:11]
        movie_indices = [i[0] for i in sim_score]
        return movie['movie_title'].iloc[movie_indices].apply(lambda x: x.title())
    except:
        print('Opps! The  Movie you looking not in list, please try different movies.')

In [59]:
recommend("Inception")

64      The Chronicles Of Narnia: The Lion, The Witch ...
16               The Chronicles Of Narnia: Prince Caspian
4254                          Mad Max 2: The Road Warrior
339         The Lord Of The Rings: The Return Of The King
829                                           Dragonheart
230                             The Chronicles Of Riddick
737                                     The Scorpion King
1455                                  Queen Of The Damned
270     The Lord Of The Rings: The Fellowship Of The Ring
4019                In The Name Of The King: The Last Job
Name: movie_title, dtype: object

In [60]:
movie

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,title_year,plot_keywords,Comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,2009.0,avatar future marine native paraplegic,James Cameron CCH Pounder Joel David Moore Wes...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,2007.0,goddess marriage ceremony marriage proposal pi...,Gore Verbinski Johnny Depp Orlando Bloom Jack ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,2015.0,bomb espionage sequel spy terrorist,Sam Mendes Christoph Waltz Rory Kinnear Stepha...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,2012.0,deception imprisonment lawlessness police offi...,Christopher Nolan Tom Hardy Christian Bale Jos...
4,Doug Walker,Doug Walker,Rob Walker,,Documentary,star wars: episode vii - the force awakens ...,,,Doug Walker Doug Walker Rob Walker Documentar...
...,...,...,...,...,...,...,...,...,...
523,Anshai Lal,Anushka Sharma,Diljit Dosanjh,Suraj Sharma,Fantasy Comedy Romance Drama,phillauri,2017.0,A man is forced to marry a tree to ward off il...,Anshai Lal Anushka Sharma Diljit Dosanjh Suraj...
524,Jim Strouse,Jessica Williams,Chris O'Dowd,Keith Stanfield,Romance Comedy,the incredible jessica james,2017.0,"Burned by a bad breakup, a struggling New York...",Jim Strouse Jessica Williams Chris O'Dowd Keit...
525,Farhad Mann,Adelaide Kane,Benjamin Hollingsworth,Jean Louisa Kelly,Romance,can't buy my love,2017.0,"Lilly, a hard working EMT, is only focused on ...",Farhad Mann Adelaide Kane Benjamin Hollingswor...
527,Jonathan A. Rosenbaum,Lou Diamond Phillips,Wallace Shawn,Gina Holden,Crime Comedy Action Family,cop and a half: new recruit,2017.0,In this family-friendly action reboot of the 1...,Jonathan A. Rosenbaum Lou Diamond Phillips Wal...


In [61]:
movie.reset_index(drop=True,inplace=True)

In [62]:
movie[4321:4329]

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,title_year,plot_keywords,Comb
4321,David Worth,Jessica Szohr,Jonathan Mangum,Angela Jones,Horror Thriller,house at the end of the drive,2014.0,ghost story manson family paranormal real life...,David Worth Jessica Szohr Jonathan Mangum Ange...
4322,Leslie H. Martinson,Burgess Meredith,Cesar Romero,Burt Ward,Adventure Comedy Family,batman: the movie,1966.0,black cat catwoman dc comics penguin riddler,Leslie H. Martinson Burgess Meredith Cesar Rom...
4323,Guy Ritchie,Jason Statham,Jason Flemyng,Dexter Fletcher,Comedy Crime,"lock, stock and two smoking barrels",1998.0,antique cockney accent hatchet money shotgun,Guy Ritchie Jason Statham Jason Flemyng Dexter...
4324,Robert M. Young,Barry Corbin,Bruce McGill,Ned Beatty,Western,the ballad of gregorio cortez,1982.0,,Robert M. Young Barry Corbin Bruce McGill Ned ...
4325,,Karl Malden,Michael Douglas,,Action Crime Drama Mystery,the streets of san francisco,,city name in series title homicide older man y...,Karl Malden Michael Douglas Action Crime Dra...
4326,Thomas Vinterberg,Ulrich Thomsen,Paprika Steen,Trine Dyrholm,Drama,the celebration,1998.0,dogme 95 family secret haunted by the past sec...,Thomas Vinterberg Ulrich Thomsen Paprika Steen...
4327,Steve Buscemi,Steve Buscemi,Debi Mazar,Carol Kane,Comedy Drama,trees lounge,1996.0,alcoholic drink bar drink drinking drunk,Steve Buscemi Steve Buscemi Debi Mazar Carol K...
4328,Ham Tran,Long Nguyen,Kieu Chinh,Cat Ly,Drama,journey from the fall,2006.0,1970s 1980s nonlinear timeline rescue vietnam war,Ham Tran Long Nguyen Kieu Chinh Cat Ly Drama j...


In [63]:
import jedi

In [64]:
import jupyter_contrib_nbextensions