In [1]:
# Import Modules
import pandas as pd
import joblib
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Load the movies table from disk and preview its first rows
data = pd.read_csv(filepath_or_buffer="movies.csv")
first_rows = data.head()
print(first_rows)

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [10]:
# Count the null entries in each column of the movies table
missing_per_column = data.isnull().sum()
print(missing_per_column)

movieId    0
title      0
genres     0
dtype: int64


In [13]:
# Count how many distinct movie ids the table contains
movie_ids = data['movieId']
num_movies = movie_ids.nunique()
print('Number of movies:', num_movies)

Number of movies: 62423


In [15]:
# Turn each movie's genre string into a TF-IDF vector (one row per movie).
# The frame loaded above is bound to `data`; the previous reference to an
# undefined `movies` name only worked with leftover kernel state and fails
# on Restart & Run All.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(data["genres"])
print(tfidf_matrix.shape)

(62423, 23)


In [16]:
# Score every movie (rows) against only the first 1000 movies (columns).
# NOTE(review): the 1000-column cut-off looks like a memory trade-off; it
# also means recommendations can only come from those first 1000 rows.
candidate_vectors = tfidf_matrix[:1000]
cosine_sim = cosine_similarity(tfidf_matrix, candidate_vectors)
print(cosine_sim.shape)

(62423, 1000)


In [23]:
# Lookup table: movie title -> row position in `data` (the frame loaded above;
# `movies` was never defined in this notebook).
# Duplicates must be removed on the *title labels*: Series.drop_duplicates()
# compares the values (row positions), which are already unique, so repeated
# titles would survive and make `indice[title]` return a Series instead of a
# single position. Keep the first occurrence of each title.
indice = pd.Series(data.index, index=data['title'])
indice = indice[~indice.index.duplicated(keep='first')]
def get_recommendations(title, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title used as a label into the module-level ``indice`` Series.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Entries of ``data['title']`` for the best-scoring candidates,
        ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indice``.
    """
    idx = indice[title]
    # A title that occurs more than once yields a Series of positions;
    # use the first occurrence so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried movie by position instead of assuming it sorts
    # first: ties (identical genres) can place another movie ahead of it, and
    # the movie may not be among the candidate columns at all, in which case
    # dropping the first hit would discard a genuine recommendation.
    scored = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # list.sort is stable, so equally similar movies keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in scored[:top_n]]
    return data['title'].iloc[movie_indices]

# Example query: show the recommendations for a single title
get_recommendations(title='Toy Story (1995)')

Unnamed: 0,title
551,"Pagemaster, The (1994)"
55,Kids of the Round Table (1995)
661,Space Jam (1996)
1,Jumanji (1995)
59,"Indian in the Cupboard, The (1995)"


In [None]:
# Persist the similarity matrix and the title lookup with joblib
artifacts = {
    'cosine_similarity.pkl': cosine_sim,
    'movie_indices.pkl': indice,
}
for filename, payload in artifacts.items():
    with open(filename, 'wb') as handle:
        joblib.dump(payload, handle)