## content-based filtering 
    
- convert the words or text into vector form
- find the closest recommendations for a given input movie title using cosine similarity

In [1]:
from ast import literal_eval

import pandas as pd

In [2]:
# Load just the four metadata columns this notebook works with.
md = pd.read_csv(
    'the-movies-dataset/movies_metadata.csv',
    usecols=['id', 'original_title', 'overview', 'tagline'],
)
# Remove three rows by index label. The EDA notebook explains how these
# labels were found (presumably malformed records -- TODO confirm).
md = md.drop(index=[19730, 29503, 35587])
md.head()

Unnamed: 0,id,original_title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...


In [3]:
# (rows, columns) of the metadata frame after the three rows were dropped.
md.shape

(45463, 4)

In [4]:
# Keep only the TMDB ids from the small links file: discard missing ids, then
# cast the remaining values to integers.
links_small = (
    pd.read_csv('the-movies-dataset/links_small.csv')['tmdbId']
    .dropna()
    .astype('int')
)

In [5]:
# (The row indices dropped when the metadata was loaded are explained in the
# EDA notebook.)
# Cast ids to integers so they can be compared with the TMDB ids in links_small.
md['id'] = md['id'].astype('int')

# Keep only the movies listed in links_small. The explicit .copy() makes the
# result an independent frame, so the column assignments made on md in later
# cells do not trigger SettingWithCopyWarning.
md = md.loc[md['id'].isin(links_small)].copy()
md.shape

(9099, 4)

In [6]:
# Replace missing overview / tagline values with a single space.
for text_col in ('overview', 'tagline'):
    md[text_col] = md[text_col].fillna(' ')
md.head()

Unnamed: 0,id,original_title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...


In [7]:
# Build one text field per movie: the overview followed by the tagline,
# separated by a single space. Column-wise string concatenation gives the same
# text as joining the two values row by row, without a Python-level call per row.
md['description'] = md['overview'] + ' ' + md['tagline']
# If either part is missing the concatenation yields NaN; store a blank instead.
md['description'] = md['description'].fillna(' ')
md.head()

Unnamed: 0,id,original_title,overview,tagline,description
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Just when George Banks has recovered from his ...


Use TF-IDF to turn the words of each description into a vector
- it gives less weight to words that occur frequently across the descriptions (for example, a word that appears in most overviews)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams through trigrams with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary an integer min_df of 0 selects, but unlike 0 it is accepted
# by scikit-learn releases that validate constructor parameters.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(md['description'])

In [9]:
# (number of descriptions, number of distinct 1- to 3-word terms)
tfidf_matrix.shape

(9099, 533074)

In [10]:
# Cosine similarity
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
# Renumber the rows 0..n-1; the previous row labels are kept in a new column.
md = md.reset_index()
titles = md.loc[:, 'original_title']
# Lookup table: movie title -> row number in md.
indices = pd.Series(data=md.index, index=titles)

In [12]:
def get_recommendations(title, top_n=30):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        A movie title; looked up in the module-level ``indices`` Series.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Entries of the module-level ``titles`` Series ordered from most to
        least similar. The row used as the query is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # Titles are not guaranteed to be unique: a duplicated title makes the
    # lookup return a Series of row numbers instead of a scalar. Use the
    # first matching row in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Stable sort, highest score first (tied rows keep their original order).
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Drop the query row by position instead of assuming it sorted first:
    # ties, or an all-zero similarity row, can place another movie at rank 0.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:top_n]
    return titles.iloc[movie_indices]

In [None]:
# Show the ten closest matches for one example title.
get_recommendations('The Dark Knight').head(10)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
7565                 Batman: Under the Red Hood
8227    Batman: The Dark Knight Returns, Part 2
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
Name: original_title, dtype: object

In [None]:
from flask import Flask
import json

app = Flask(__name__)

@app.route('/')
def main():
    """Serve the top ten recommendations for 'The Dark Knight' as JSON.

    ``Series.to_json`` already returns a JSON document as a string, so it is
    sent as the response body unchanged. Encoding that string a second time
    with ``json.dumps`` would wrap the whole document in quotes and hand
    clients a JSON string instead of a JSON object.
    """
    output = get_recommendations('The Dark Knight').head(10).to_json()
    return app.response_class(output, mimetype='application/json')

if __name__ == "__main__":
    app.run()

 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [08/Oct/2019 19:01:26] "[37mGET / HTTP/1.1[0m" 200 -


## Metadata Based Recommender

In [None]:
# Load the credits table and the keywords table.
credits, keywords = map(
    pd.read_csv,
    ('the-movies-dataset/credits.csv', 'the-movies-dataset/keywords.csv'),
)

In [None]:
# Give the join key the same integer dtype in all three frames before merging.
for frame in (keywords, credits, md):
    frame['id'] = frame['id'].astype('int')

In [None]:
# Attach the credits columns and then the keywords columns to the movie
# metadata, matching rows on id (merge's default inner join).
md = md.merge(credits, on='id').merge(keywords, on='id')

In [None]:
# Restrict to the movies listed in links_small. Take an explicit copy so the
# columns assigned on smd in the following cells are written to an independent
# frame rather than to a slice of md (which triggers SettingWithCopyWarning).
smd = md[md['id'].isin(links_small)].copy()
smd.shape

In [None]:
# cast, crew and keywords are stored as strings containing Python literals;
# parse them into real objects with ast.literal_eval (imported in the
# notebook's import cell). assign() returns a new frame, so nothing is written
# into a slice of md.
smd = smd.assign(
    cast=smd['cast'].apply(literal_eval),
    crew=smd['crew'].apply(literal_eval),
    keywords=smd['keywords'].apply(literal_eval),
)
# Number of parsed cast and crew entries per movie.
smd = smd.assign(
    cast_size=smd['cast'].apply(len),
    crew_size=smd['crew'].apply(len),
)

In [None]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Crew entries; each entry is read through its 'job' and 'name' keys.

    Returns
    -------
    str or float
        The 'name' of the first entry with ``job == 'Director'``, or NaN when
        ``x`` is not a list or has no such entry.
    """
    # float('nan') is the same missing-value marker as numpy's nan but needs no
    # numpy import (none is present in the visible cells of this notebook).
    missing = float('nan')
    # Same non-list guard the cast / keywords cells apply to their inputs.
    if not isinstance(x, list):
        return missing
    return next((member['name'] for member in x if member['job'] == 'Director'), missing)

In [None]:
# One director name per movie (NaN when the crew list has no 'Director' entry).
smd['director'] = smd['crew'].apply(get_director)

In [None]:
# Reduce each cast list to the 'name' values of its entries, keeping at most
# the first three; anything that is not a list becomes an empty list.
smd['cast'] = smd['cast'].apply(
    lambda members: [m['name'] for m in members][:3] if isinstance(members, list) else []
)

In [None]:
# Replace each keyword list with the 'name' of every entry; values that are
# not lists become an empty list.
smd['keywords'] = smd['keywords'].apply(
    lambda kws: [kw['name'] for kw in kws] if isinstance(kws, list) else []
)