Content Based Recommendation

In [13]:
import numpy as np
import pandas as pd
import sklearn
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

import difflib

In [14]:
#ratings = pd.read_csv (r'./../data/ratings.csv', low_memory=False)
#ratings_small = pd.read_csv (r'./../data/ratings_small.csv', low_memory=False)
credits = pd.read_csv (r'./../data/credits.csv', low_memory=False)
keywords = pd.read_csv (r'./../data/keywords.csv', low_memory=False)
#links = pd.read_csv (r'./../data/links.csv', low_memory=False)
#links_small = pd.read_csv (r'./../data/links_small.csv', low_memory=False)
metadata = pd.read_csv (r'./../data/movies_metadata.csv', low_memory=False)

In [15]:
#ratings.head()

In [16]:
#credits.head()

In [17]:
#metadata.head()

In [18]:
metadata = metadata[~metadata.id.str.contains("-")]

**Recommendation based on different metadata**  
Actors, director, keywords, etc. are looked at as well

In [19]:
# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

In [20]:
# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

In [21]:
# Parse the stringified features into their corresponding python objects
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [22]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [23]:
# Define new director, cast, genres and keywords features that are in a suitable form.
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

In [24]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

In [25]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

metadata[['soup']].head(2)

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [26]:
# create count matrix with CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])
count_matrix.shape

(46628, 73881)

In [27]:
# Compute the Cosine Similarity matrix based on the count_matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [67]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['id'])

In [68]:
def get_recommendations(id, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[id]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

In [86]:
input_title = input("Your title: ")

Your title: toy story


In [87]:
possible_titles = difflib.get_close_matches(input_title, metadata['original_title'].tolist(), 5)
print(possible_titles)

['Toy Story', 'Toy Story 3', 'Toy Story 2', 'Autohystoria', 'Love Story']


In [88]:
id_array = []
for i in possible_titles:
    id_array.append(metadata.loc[metadata['original_title'] == str(i), 'id'].array[0])

In [89]:
print(id_array)

[862, 10193, 863, 44570, 9062]


In [90]:
input_number = int(input("What's your film? "))
chosen_id = id_array[input_number]

What's your film? 0


In [91]:
print(chosen_id)

862


In [92]:
get_recommendations(chosen_id, cosine_sim)

3024                       Toy Story 2
15519                      Toy Story 3
29198                  Superstar Goofy
26001       Toy Story That Time Forgot
22126             Toy Story of Terror!
3336                 Creature Comforts
25999                  Partysaurus Rex
27606                            Anina
43071    Dexter's Laboratory: Ego Trip
28005                    Radiopiratene
Name: title, dtype: object