In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

from scipy.spatial.distance import euclidean, pdist, squareform

import warnings; warnings.simplefilter('ignore')

In [3]:
md = pd.read_csv("./data/movies_metadata.csv")
rt= pd.read_csv("./data/ratings_small.csv")
links= pd.read_csv("./data/links.csv")   #MoivieIds
keywords= pd.read_csv("./data/keywords.csv")
credits= pd.read_csv("./data/credits.csv")

In [4]:
keywords.shape


(46419, 2)

In [5]:
pd.options.display.max_colwidth = 500  
keywords.iloc[[1]]['keywords']

1    [{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'name': "based on children's book"}, {'id': 33467, 'name': 'new home'}, {'id': 158086, 'name': 'recluse'}, {'id': 158091, 'name': 'giant insect'}]
Name: keywords, dtype: object

In [6]:
#keywords.head()

In [7]:
#links.head()

In [8]:
#md[['id','title']]

In [9]:
#md[['id','title']].loc[md['id']=='15602']

In [10]:
pd.options.display.max_colwidth=100

In [11]:
md.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [12]:
#md['vote_count'].head()

In [13]:
c = md[(md['vote_average'].notnull())]['vote_average'].mean()
m = md['vote_count'].quantile(0.85)
m,c

(82.0, 5.618207215133889)

In [14]:
md['year']=md['release_date'].apply(lambda x: str(x).split('-')[0])
md['vote_count']=md[(md['vote_count'].notnull())]['vote_count'].astype('int')


In [15]:
quantised = md.loc[(md['vote_count'] >= m) & md['vote_average'].notnull()][['id','title','year','vote_average','vote_count','genres','popularity']]

In [16]:
quantised['vote_count']=quantised[quantised['vote_count'].notnull()]['vote_count'].astype('int')

In [17]:
quantised.shape

(6832, 7)

In [18]:
#quantised.sort_values('vote_average',ascending=False)

In [19]:
def weightedMean(df):
    v=df['vote_count']
    r=df['vote_average']
    wr=(v*r+m*c)/(v+m)
    return wr
    

In [20]:
quantised['wr']=quantised.apply(weightedMean,axis=1)

In [21]:
quantised.sort_values('wr',ascending=False).head(10) 

Unnamed: 0,id,title,year,vote_average,vote_count,genres,popularity,wr
10309,19404,Dilwale Dulhania Le Jayenge,1995,9.1,661,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",34.457,8.715738
314,278,The Shawshank Redemption,1994,8.5,8358,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}]",51.6454,8.472002
834,238,The Godfather,1972,8.5,6024,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}]",41.1093,8.461299
40251,372058,Your Name.,2016,8.5,1030,"[{'id': 10749, 'name': 'Romance'}, {'id': 16, 'name': 'Animation'}, {'id': 18, 'name': 'Drama'}]",34.461252,8.287494
12481,155,The Dark Knight,2008,8.3,12269,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}, {'id': 80, 'name': 'Crime'}, {'id': ...",123.167,8.282195
2843,550,Fight Club,1999,8.3,9678,"[{'id': 18, 'name': 'Drama'}]",63.8696,8.277469
292,680,Pulp Fiction,1994,8.3,8670,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'name': 'Crime'}]",140.95,8.274874
522,424,Schindler's List,1993,8.3,4436,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name': 'History'}, {'id': 10752, 'name': 'War'}]",41.7251,8.251326
23673,244786,Whiplash,2014,8.3,4376,"[{'id': 18, 'name': 'Drama'}]",64.3,8.250671
5481,129,Spirited Away,2001,8.3,3968,"[{'id': 14, 'name': 'Fantasy'}, {'id': 12, 'name': 'Adventure'}, {'id': 16, 'name': 'Animation'}...",41.0489,8.245702


In [22]:
quantised[quantised['genres'].str.contains('Romance')].sort_values('wr',ascending=False).head(10) 

Unnamed: 0,id,title,year,vote_average,vote_count,genres,popularity,wr
10309,19404,Dilwale Dulhania Le Jayenge,1995,9.1,661,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",34.457,8.715738
40251,372058,Your Name.,2016,8.5,1030,"[{'id': 10749, 'name': 'Romance'}, {'id': 16, 'name': 'Animation'}, {'id': 18, 'name': 'Drama'}]",34.461252,8.287494
351,13,Forrest Gump,1994,8.2,8147,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",48.3072,8.174273
1132,11216,Cinema Paradiso,1988,8.2,834,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",14.177,7.968879
40882,313369,La La Land,2016,7.9,4745,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10402, 'name': 'Music'}, {'id...",19.681686,7.861237
22168,152601,Her,2013,7.9,4215,"[{'id': 10749, 'name': 'Romance'}, {'id': 878, 'name': 'Science Fiction'}, {'id': 18, 'name': 'D...",13.8295,7.856456
7208,38,Eternal Sunshine of the Spotless Mind,2004,7.9,3758,"[{'id': 878, 'name': 'Science Fiction'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Rom...",12.9063,7.851274
876,426,Vertigo,1958,8.0,1162,"[{'id': 9648, 'name': 'Mystery'}, {'id': 10749, 'name': 'Romance'}, {'id': 53, 'name': 'Thriller'}]",18.2082,7.843001
3189,901,City Lights,1931,8.2,444,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",10.8915,7.797515
15530,31011,Mr. Nobody,2009,7.9,1616,"[{'id': 878, 'name': 'Science Fiction'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Rom...",11.8171,7.789807


In [23]:
#md['overview']

In [24]:

# -----------------------------------------------
# ####Content Based Recommender ###############
# -----------------------------------------------


In [25]:
links_sm=pd.read_csv('./data/links_small.csv')
#links_sm.head(20)


In [26]:
links['tmdbId']=links[links['tmdbId'].notnull()]['tmdbId'].astype('int')
links_sm['tmdbId']=links_sm[links_sm['tmdbId'].notnull()]['tmdbId'].astype('int')

In [27]:
links_sm.shape
md.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year'],
      dtype='object')

md.columns

In [28]:
#md.loc[md['id'].notull()]['id'].astype(int)
#md[md['id'].isin(links_sm['movieId'])]
#md.columns
md = md.drop([19730, 29503, 35587])
md['id']=md[md['id'].notnull()]['id'].astype(int)

In [29]:
####SMALL MD CERATED

smd =md[md['id'].isin(links_sm['movieId'])]


In [30]:
#md.head()

md.shape

In [31]:

md['tagline']= md['tagline'].fillna(' ')
smd['tagline']= smd['tagline'].fillna(' ')

In [32]:
md['overview']= md['overview'].fillna(' ')
smd['overview']= smd['overview'].fillna(' ')

In [33]:
md['descript']= md['tagline']+ md['overview']
md['descript']=md['descript'].fillna(' ')
smd['descript']= smd['tagline']+ smd['overview']
smd['descript']=smd['descript'].fillna(' ')
pd.options.display.max_colwidth=500


In [34]:
###ON SMD

In [35]:
tf=TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

In [36]:
tfidfmat=tf.fit_transform(smd['descript'])

In [37]:
tfidfmat.shape

(2840, 95316)

In [38]:
cosine_sim=linear_kernel(tfidfmat,tfidfmat)

In [39]:
cosine_sim.shape

(2840, 2840)

In [40]:
#smd = smd.reset_index()
pd.options.display.max_colwidth=50
smd = smd.reset_index()

In [41]:
titles=smd['title']

In [42]:
#smd.head()

In [43]:
indices=pd.Series(smd.index,index=titles)
#indices

In [44]:
idx=1
args=np.argsort(cosine_sim[1])
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:31]
movie_indices = [i[0] for i in sim_scores]
#titles.iloc[movie_indices]


In [45]:
#idx=1
#args=np.argsort(cosine_sim[1])
#args=args[-31:]
#for i in args:
 #   print(titles.iloc[i])

In [46]:
#FULLL MDB
#tf=TfidfVectorizer(analyzer='word',min_df=0, stop_words='english', ngram_range=(1,2))
tf=TfidfVectorizer(analyzer='word',min_df=0, stop_words='english', ngram_range=(1,2))

In [47]:
pd.options.display.max_colwidth=150
md['short_overview']= md['overview'].apply(lambda x: x[0:100])


In [48]:
md_top=md.loc[md['vote_average']>6]
#md_top=md_top.reset_index()
#Droppung 2nd batman
md_top.loc[md_top.title=='Batman']
md_top=md_top.drop([5108])

In [49]:
tfidfmat=tf.fit_transform(md_top['overview'])


(21774, 21774)

In [50]:
tfidfmat_sma=tf.fit_transform(md_top['short_overview'])


In [51]:
tfidfmat_sma.shape

(21774, 193379)

In [52]:
cosine_sim=linear_kernel(tfidfmat_sma,tfidfmat_sma)

In [55]:
###DOES NOT WORK
def movie_rec_old(title):
    mov=md_top.loc[md_top['title'].str.lower()==title.lower()]
    args=np.argsort(cosine_sim[mov.index[0]])
    args=args[-31:]
    for i in args:
        print(md_top.loc[md_top.index==i]['title'])

In [56]:
md_top=md_top.reset_index()

In [57]:
#titles.columns=['movie']
titles=md_top['title']


In [58]:
indices=pd.Series(md_top.index,index=md_top['title'].str.lower())
#indices

In [59]:
#MOVIE RECOMMENDER FUNCTION
def movie_rec(title):
    idx=indices[title.lower()]
    if isinstance(idx, pd.Series):
        idx=idx.values[0]
    args=np.argsort(cosine_sim[idx])
    print(cosine_sim[idx])
    args=args[-31:]
    args=reversed(args)
    inds=[i for i in args]
    return titles.iloc[inds]
        

In [60]:
#movie_rec('the dark knight')
#titles.loc[titles.str.contains('batman')]
#md_top.loc[md_top['title'].str.contains('Batman')]


-----------------------Metadata Based Reccommender-------------------------

In [61]:
credits=pd.read_csv("./data/credits.csv")
#credits.head()

In [62]:
keywords=pd.read_csv("./data/keywords.csv")
pd.options.display.max_colwidth=500
#keywords.head()

In [63]:
md_top['id']=md_top['id'].astype(int)
md_top.shape

(21774, 28)

In [64]:
md_top=md_top.merge(credits,on='id')
md_top=md_top.merge(keywords,on='id')
md_top.shape

(22286, 31)

In [65]:
md_top['cast']=md_top['cast'].apply(literal_eval)

In [66]:
md_top['crew']=md_top['crew'].apply(literal_eval)


In [67]:
md_top['keywords']=md_top['keywords'].apply(literal_eval)

In [68]:
#md_top['crew'][0]

In [75]:
def getdirector(x):
    for i in x:
        if i['job']=='Director':
            return i['name']

In [76]:
getdirector(md_top['crew'][0])

'John Lasseter'

In [153]:
md_top['director']=md_top['crew'].apply(getdirector)
md_top['Director_']=md_top['crew'].apply(getdirector)

In [78]:
#md_top['cast'][0]

In [79]:
def getcast(x):
    names=[] 
    for i in x:
        names.append(i['name'])
    if len(names)>3:
        names=names[:3] 
    return names

In [80]:
getcast(md_top['cast'][0])

['Tom Hanks', 'Tim Allen', 'Don Rickles']

In [159]:
md_top['cast']=md_top['cast'].apply(getcast)


In [82]:
#md_top['keywords'][0]

In [83]:
md_top['keywords'] = md_top['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])
#md_top['keywords']


In [84]:
md_top['cast']=md_top['cast'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x] )


In [85]:
md_top['director']=md_top['director'].apply(lambda x: str.lower(x.replace(" ",""))  if  isinstance(x, str) else [])


In [86]:
#

In [87]:
# therefore rejected stemmer :p
stemmer=SnowballStemmer('english')
stemmer.stem('single mother')

'single moth'

In [88]:
#md_top['keywords']=md_top['keywords'].apply(lambda x: [stemmer.stem(i) for i in x]  )

In [89]:
md_top['keywords']=md_top['keywords'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x] )


In [90]:
#md_top['soup']=md['d']
md_top['soup']=md_top['director'].apply(lambda x: [x,x,x]) + md_top['cast'] +md_top['keywords']

In [91]:
def joinlist(x):
    l=""
    for i in x:
        #print(i)
        try :
            l+= i + " "
        except:
            l=""
    return l
l=joinlist(md_top['soup'][0])


In [92]:
#shit code
#for i in range(md_top.shape[0]):
 #   md_top['soup'].iloc[i]=joinlist(md_top['soup'][i])
  #  print(joinlist(md_top['soup'][i]))


In [93]:
#md_top.to_csv('./data/md_top.csv')
#md_top=pd.read_csv('./data/md_top.csv')
md_top['soup']=md_top['soup'].apply(joinlist)

In [166]:
#COUNT VECTORISER METHOD
countvectoriser=CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

In [188]:
countmat=countvectoriser.fit_transform(md_top['soup'])

In [190]:
countmat.shape

(22286, 200050)

In [189]:
lin_ker_cv=linear_kernel(countmat,countmat)

MemoryError: 

In [178]:
#titles_cv = md_top['title']
#indices_cv = pd.Series(md_top.index, index=md_top['title'])

In [94]:
def movie_rec_cv(title):
    idx=indices[title.lower()]
    if isinstance(idx, pd.Series):
        idx=idx.values[0]
    args=np.argsort(cosinr_sim_cv[idx])
    print(cosinr_sim_cv[idx])
    args=args[-31:]
    args=reversed(args)
    inds=[i for i in args]
    return titles_cv.iloc[inds]
        

In [175]:
ind =indices['inception']
cosine_sim_cv[0]
#mdttt=pd.read_csv('./md_top.csv')

In [119]:
####TFIDF
tf2=TfidfVectorizer(analyzer='word',min_df=0, stop_words='english', ngram_range=(1,1))

In [177]:
tfidfmat2=tf2.fit_transform(md_top['soup'])

In [122]:
tfidfmat2.shape

(22286, 200050)

In [121]:
sim_mat=linear_kernel(tfidfmat2,tfidfmat2)

MemoryError: 

In [112]:
titles=md_top['title']


In [113]:
indices=pd.Series(md_top.index,index=md_top['title'].str.lower())

In [163]:
#MOVIE RECOMMENDER FUNCTION
def movie_rec2(title):
    idx=indices[title.lower()]
    if isinstance(idx, pd.Series):
        idx=idx.values[0]
    args=np.argsort(sim_mat[idx])
    #print(sim_mat[idx])
    args=args[-31:]
    args=reversed(args)
    inds=[i for i in args]
    return titles.iloc[inds]
        

In [164]:
#movie_rec2('the dark knight')
indices['the dark knight']
#md_top.iloc[7475]

the dark knight     7528
the dark knight    14978
dtype: int64

In [165]:
movie_rec2('cinderella')

577                                Cinderella
11673             The Legend of Sleepy Hollow
2225                              Melody Time
1209                          Sleeping Beauty
10184                     Education for Death
1204                                Peter Pan
14966                              Cinderella
12846                       The Glass Slipper
11925                      Sign 'o' the Times
1231           Ever After: A Cinderella Story
20273    A Cinderella Story: If the Shoe Fits
10562                Another Cinderella Story
15052                         Sleeping Beauty
4567                                 The Swan
15688             Three Wishes for Cinderella
6811                          Pan's Labyrinth
338                      Beauty and the Beast
10033                       Midnight in Paris
2125                 The Slipper and the Rose
13723    A Cinderella Story: Once Upon a Song
20450                              Music Land
367                 James and the 

In [145]:
md_top['wr']=md_top.apply(weightedMean,axis=1)

In [166]:
#MOVIE RECOMMENDER FUNCTION FINAL
def movie_recf(title):
    idx=indices[title.lower()]
    if isinstance(idx, pd.Series):
        idx=idx.values[0]
    args=np.argsort(sim_mat[idx])
    #print(sim_mat[idx])
    args=args[-31:]
    args=reversed(args)
    inds=[i for i in args]
    movies= md_top.loc[inds][['title','Director_','cast','vote_average','wr']]
    movies =movies.sort_values('wr',ascending=False)
    return movies

In [168]:
l=movie_recf('x-men')
l

Unnamed: 0,title,Director_,cast,vote_average,wr
38,The Usual Suspects,Bryan Singer,"[stephenbaldwin, gabrielbyrne, chazzpalminteri]",8.1,8.040425
20848,Logan,James Mangold,"[hughjackman, patrickstewart, dafnekeen]",7.6,7.574577
12760,Captain America: The Winter Soldier,Anthony Russo,"[chrisevans, samuell.jackson, scarlettjohansson]",7.6,7.572747
12881,X-Men: Days of Future Past,Bryan Singer,"[hughjackman, jamesmcavoy, michaelfassbender]",7.5,7.475259
10378,The Avengers,Joss Whedon,"[robertdowneyjr., chrisevans, markruffalo]",7.4,7.387907
14142,Deadpool,Tim Miller,"[ryanreynolds, morenabaccarin, edskrein]",7.4,7.387324
7596,Iron Man,Jon Favreau,"[robertdowneyjr., terrencehoward, jeffbridges]",7.4,7.383825
14139,Avengers: Age of Ultron,Joss Whedon,"[robertdowneyjr., chrishemsworth, markruffalo]",7.3,7.280271
14144,Captain America: Civil War,Anthony Russo,"[chrisevans, robertdowneyjr., scarlettjohansson]",7.1,7.083894
14145,Doctor Strange,Scott Derrickson,"[benedictcumberbatch, chiwetelejiofor, rachelmcadams]",7.1,7.07962


In [172]:
md_top.to_csv('./md_top.csv')
titles.to_csv('./title.csv')
indices.to_csv('./indices.csv')

In [1]:
indices

NameError: name 'indices' is not defined