In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [25]:
data = pd.read_csv('/content/movies_metadata.csv')

In [26]:
data.head(3)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count
0,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033,81.0,Released,,Toy Story,7.7,5415
1,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413
2,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92


In [27]:
keywords = pd.read_csv('/content/keywords.csv')

In [28]:
# Convert IDs to int (Merging Purpose)
keywords['id'] = keywords['id'].astype('int')
data['id'] = data['id'].astype('int')

In [29]:
# Merge keywords into main 'data' dataframe
data = data.merge(keywords, on='id')

In [30]:
# Newly merged Dataframe
data.head(2)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count,keywords
0,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033,81.0,Released,,Toy Story,7.7,5415,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [31]:
# Parsing the string features into their corresponding python objects
from ast import literal_eval # helps in traversing an abstract syntax tree
features = ['keywords', 'genres']
for feature in features:
    data[feature] = data[feature].apply(literal_eval)

In [32]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        # Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names
    # Return empty list in case of missing/malformed data
    return []

In [33]:
# Define genres features that are in a suitable form.

features = ['keywords','genres']
for feature in features:
    data[feature] = data[feature].apply(get_list)

In [34]:
data[['title', 'keywords', 'genres']].head(3)

Unnamed: 0,title,keywords,genres
0,Toy Story,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [35]:
data.head(2)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count,keywords
0,30000000,"[Animation, Comedy, Family]",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033,81.0,Released,,Toy Story,7.7,5415,"[jealousy, toy, boy]"
1,65000000,"[Adventure, Fantasy, Family]",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413,"[board game, disappearance, based on children'..."


In [12]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [13]:
# Apply clean_data function to your features.
features = ['keywords', 'genres']

for feature in features:
    data[feature] = data[feature].apply(clean_data)

In [14]:
def merge(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['genres'])

In [15]:
data['merge'] = data.apply(merge, axis=1)

In [16]:
data[['merge']].head(5)

Unnamed: 0,merge
0,jealousy toy boy animation comedy family
1,boardgame disappearance basedonchildren'sbook ...
2,fishing bestfriend duringcreditsstinger romanc...
3,basedonnovel interracialrelationship singlemot...
4,baby midlifecrisis confidence comedy


In [17]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['merge'])

In [18]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [19]:
# Reset index of your main DataFrame and construct reverse mapping as before
data = data.reset_index()
indices = pd.Series(data.index, index = data['title'])

In [20]:
# Function that takes in movie title as input and outputs most similar movies

def recommend_movies(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return data['title'].iloc[movie_indices]

In [21]:
recommend_movies('The Godfather')

145                                  Feast of July
1988                                         Belly
3723                                 True Believer
5407                          Better Luck Tomorrow
7741                                   Golden Gate
7997    Redemption: The Stan Tookie Williams Story
8300                   Ghosts... of the Civil Dead
8631                              Blue Hill Avenue
9163                             Knock on Any Door
9300                                   Assassin(s)
Name: title, dtype: object

In [22]:
recommend_movies('The Son of No One')

1939          Monument Ave.
4218     The Learning Curve
4231                 Kansas
5396             Eye of God
7633           Street Smart
8220             Chronicles
10862        Shoot on Sight
11032         Union Station
12478        Animal Kingdom
12658              Red Bear
Name: title, dtype: object

In [23]:
recommend_movies('The Shawshank Redemption')

1406                  Witness
5329                Dark Blue
6533       The Valachi Papers
8899            The Big House
14338              You and Me
18506    Castle on the Hudson
145             Feast of July
1988                    Belly
3723            True Believer
5407     Better Luck Tomorrow
Name: title, dtype: object