In [3]:
import pandas as pd
import numpy as np

# Folder holding both CSV files. Kept in one place so the notebook can be
# pointed at another machine's copy of the data by editing a single line.
DATA_DIR = '/Users/youssefeldeeb/Documents/Material/Recommendation-Systems/Datasets'

#Import data from the clean file
df = pd.read_csv(DATA_DIR + '/metadata_clean.csv', low_memory=False)

#Import the original file
orig_df = pd.read_csv(DATA_DIR + '/movies_metadata.csv', low_memory=False)

#Add the useful features into the cleaned dataframe.
#Column assignment aligns on index labels, so row i of df receives row i of orig_df.
#NOTE(review): this assumes both files list the same movies in the same row
#order -- confirm the cleaning step did not drop or reorder rows.
df['overview'] = orig_df['overview']
df['id'] = orig_df['id']

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [31]:
#Import TfidfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

#The vectorizer works on strings, so movies without an overview get an empty one
df['overview'] = df['overview'].fillna('')

#TF-IDF vectorizer that ignores English stop words
tfIdf = TfidfVectorizer(stop_words='english')

#Fit on the overviews and build the TF-IDF matrix in one step
tfIdf_matrix = tfIdf.fit_transform(df['overview'])

tfIdf_matrix.shape

<1x75827 sparse matrix of type '<class 'numpy.float64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [8]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products of the TF-IDF rows. With only one argument,
# linear_kernel compares the matrix against itself. The dot product equals the
# cosine similarity provided the rows are unit-length (assumed here: no `norm`
# argument is passed to the vectorizer, so its default normalisation applies).
cosine_sim = linear_kernel(tfIdf_matrix)


In [29]:
# Preview the first six rows of the similarity matrix
cosine_sim[:6]

array([[1.        , 0.01504121, 0.        , ..., 0.        , 0.00595453,
        0.        ],
       [0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
        0.00929411],
       [0.        , 0.04681953, 1.        , ..., 0.        , 0.01402548,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00952214,
        0.01641271],
       [0.        , 0.        , 0.02509444, ..., 0.        , 0.00700553,
        0.        ],
       [0.        , 0.05018805, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [17]:
#Construct a reverse mapping from movie title to row position
indices = pd.Series(df.index, index=df['title'])

#Keep only the first row for every title. Series.drop_duplicates() compares the
#*values* (the row positions, which are already unique), so it would leave
#repeated titles in place; the duplicates have to be detected on the index.
indices = indices[~indices.index.duplicated(keep='first')]

indices[0:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [43]:
# Function that takes in movie title as input and gives recommendations
def contentRecommender(title, cosine_sim=cosine_sim, df=df, indices=indices, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity scores; row ``i`` holds the scores of movie ``i``
        against every movie.
    df : pandas.DataFrame
        Frame with a ``title`` column whose row positions match ``cosine_sim``.
    indices : pandas.Series
        Maps a title to its row position.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first; ties keep row order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.

    Notes
    -----
    The default ``cosine_sim``, ``df`` and ``indices`` are bound when this cell
    is executed; re-run the cell after rebuilding any of them.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Obtain the row position of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions rather
    # than a scalar; use the first occurrence in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity of every movie to the requested one
    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Rank by descending score. A stable sort keeps equal scores in row order,
    # matching the behaviour of sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')

    # Drop the movie itself by position. It is not guaranteed to rank first:
    # an empty overview scores 0 against everything, itself included, and a
    # movie with an identical overview ties with it.
    ranked = ranked[ranked != idx][:top_n]

    # Return the titles of the most similar movies
    return df['title'].iloc[ranked]
    

In [44]:
#Example: movies whose overview is closest to that of The Lion King
contentRecommender(title='The Lion King')

34682    How the Lion Cub and the Turtle Sang a Song
9353                                The Lion King 1½
9115                  The Lion King 2: Simba's Pride
42829                                           Prey
25654                                 Fearless Fagan
17041                                   African Cats
27933              Massaï, les guerriers de la pluie
6094                                       Born Free
37409                                     Sour Grape
3203                                The Waiting Game
Name: title, dtype: object