## Content based filtering using cosine similarity

In [2]:
# Read the data to a pandas dataframe
import pandas as pd

df1 = pd.read_csv('tmdb_5000_credits.csv')
df2 = pd.read_csv('./tmdb_5000_movies.csv')

# Merge the two dataframes
df1.columns = ['id','title','cast','crew']
df2= df2.merge(df1,on='id')

# In the next section, we will sanitize the data as we are going to get recommendations based on the following features:
    - cast (top 3 actors)
    - keywords
    - genres
    - director

In [3]:
from ast import literal_eval
import numpy as np 

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(literal_eval)


In [4]:
# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []


In [5]:
# Apply get_director function to 'crew' feature
df2['director'] = df2['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(get_list)

# Print the new features of the first 3 films
print(df2[['original_title', 'cast', 'director', 'keywords', 'genres']].head(3))

                             original_title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   

                                               cast        director  \
0  [Sam Worthington, Zoe Saldana, Sigourney Weaver]   James Cameron   
1     [Johnny Depp, Orlando Bloom, Keira Knightley]  Gore Verbinski   
2      [Daniel Craig, Christoph Waltz, Léa Seydoux]      Sam Mendes   

                              keywords                        genres  
0   [culture clash, future, space war]  [Action, Adventure, Fantasy]  
1   [ocean, drug abuse, exotic island]  [Adventure, Fantasy, Action]  
2  [spy, based on novel, secret agent]    [Action, Adventure, Crime]  


In [6]:
# Clean data by removing spaces and converting to lowercase
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [7]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(clean_data)

In [8]:
# Create a soup out of the desired features
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
df2['soup'] = df2.apply(create_soup, axis=1)

In [12]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Reset index of your main DataFrame and construct reverse mapping as before
df2 = df2.reset_index()
indices = pd.Series(df2.index, index=df2['original_title'])


ValueError: cannot insert level_0, already exists

In [20]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df2[['id', 'original_title']].iloc[movie_indices]

In [21]:
get_recommendations('The Dark Knight Rises')


Unnamed: 0,id,original_title
65,155,The Dark Knight
119,272,Batman Begins
4638,378237,Amidst the Devil's Wings
1196,1124,The Prestige
3073,2088,Romeo Is Bleeding
3326,312113,Black November
1503,22907,Takers
1986,41283,Faster
303,314,Catwoman
747,82682,Gangster Squad


In [22]:
import pickle

# Save the CountVectorizer
with open('count_vectorizer.pkl', 'wb') as f:
    pickle.dump(count, f)

# Save the Cosine Similarity matrix
with open('cosine_similarity.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

# Save the DataFrame and indices
df2.to_pickle('movies_df.pkl')
with open('indices.pkl', 'wb') as f:
    pickle.dump(indices, f)