# Phase 4 Project

## Import necessary libraries

In [75]:
# Import necessary libraries
# from pyspark.sql import SparkSession
from surprise import Dataset 
from surprise import Reader
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
import pandas as pd
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import difflib
from re import search

In [2]:
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')


## Learning more about the datasets

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movies.shape

(9742, 3)

In [6]:
ratings.shape

(100836, 4)

In [7]:
df = pd.merge(movies, ratings, on='movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100836 non-null  int64  
 1   title      100836 non-null  object 
 2   genres     100836 non-null  object 
 3   userId     100836 non-null  int64  
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [9]:
df.isna().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

## Creating df's that will be useful later on

In [10]:
df.drop('timestamp', axis=1, inplace= True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   movieId  100836 non-null  int64  
 1   title    100836 non-null  object 
 2   genres   100836 non-null  object 
 3   userId   100836 non-null  int64  
 4   rating   100836 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.6+ MB


In [12]:
# Sorting by first movie
df.sort_values(by='movieId', ascending=True)
df.head(10)

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18,3.5
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,4.0
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,21,3.5
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,27,3.0
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,31,5.0


### Creating main dataset

In [13]:
# Separating Genres and making them into list
genre = df['genres'].map(lambda x: x.split('|'))
df['genre'] = genre
df.head(10)

Unnamed: 0,movieId,title,genres,userId,rating,genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,"[Adventure, Animation, Children, Comedy, Fantasy]"
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,"[Adventure, Animation, Children, Comedy, Fantasy]"
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,"[Adventure, Animation, Children, Comedy, Fantasy]"
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,18,3.5,"[Adventure, Animation, Children, Comedy, Fantasy]"
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,4.0,"[Adventure, Animation, Children, Comedy, Fantasy]"
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,21,3.5,"[Adventure, Animation, Children, Comedy, Fantasy]"
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,27,3.0,"[Adventure, Animation, Children, Comedy, Fantasy]"
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,31,5.0,"[Adventure, Animation, Children, Comedy, Fantasy]"


In [14]:
df.drop('genres', axis=1, inplace= True)
df.head()

Unnamed: 0,movieId,title,userId,rating,genre
0,1,Toy Story (1995),1,4.0,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,Toy Story (1995),5,4.0,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,1,Toy Story (1995),7,4.5,"[Adventure, Animation, Children, Comedy, Fantasy]"
3,1,Toy Story (1995),15,2.5,"[Adventure, Animation, Children, Comedy, Fantasy]"
4,1,Toy Story (1995),17,4.5,"[Adventure, Animation, Children, Comedy, Fantasy]"


### Creating Ratings Columns 

In [15]:
average_ratings = pd.DataFrame(df.groupby('title')['rating'].mean())
average_ratings.sort_values(by='rating', ascending=False)
#average_ratings.head(15)
#df['avg_rating']= df['rating'].mean()
#df.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Gena the Crocodile (1969),5.0
True Stories (1986),5.0
Cosmic Scrat-tastrophe (2015),5.0
Love and Pigeons (1985),5.0
Red Sorghum (Hong gao liang) (1987),5.0
...,...
Don't Look Now (1973),0.5
Journey 2: The Mysterious Island (2012),0.5
Joe Dirt 2: Beautiful Loser (2015),0.5
Jesus Christ Vampire Hunter (2001),0.5


In [16]:
average_ratings['Total Ratings'] = pd.DataFrame(df.groupby('title')['rating'].count())
#average_ratings.head()
average_ratings.shape                                                
#df['Total Ratings'] = df['rating'].count()
#df['Total Ratings'].value_counts()

(9719, 2)

In [17]:
# Mean rating of all movie averages
mean_rating = average_ratings['rating'].mean()
print(mean_rating)

3.2623883953257353


In [18]:
average_ratings.head()

Unnamed: 0_level_0,rating,Total Ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


In [19]:
# Calculate the cutoff value (minimum votes)
# Can help with deciding how many votes a movie needs to be considered
# 90 percent of the data have values less than this number
min_vote = average_ratings['Total Ratings'].quantile(0.90)
print(min_vote)


27.0


## Filter movies that have more votes than min value

In [20]:
#filtered_df = df.copy().loc[df['']]

### Creating dataset where genres are in their own row

In [21]:
# df with with each genre in a new row
df_explode = df.explode('genre')
#df_explode.head(25)

In [22]:
df_explode.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274480 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   movieId  274480 non-null  int64  
 1   title    274480 non-null  object 
 2   userId   274480 non-null  int64  
 3   rating   274480 non-null  float64
 4   genre    274480 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 12.6+ MB


In [23]:
df.rating.value_counts()

4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: rating, dtype: int64

In [24]:
lower_rating = df['rating'].min()
upper_rating = df['rating'].max()
print('User rating range: {0} to {1}'.format(lower_rating, upper_rating))

User rating range: 0.5 to 5.0


In [25]:
# Create Surprise Dataset
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

In [26]:
svd = SVD(verbose= True, n_epochs=10)
cross_validate(svd, data, measures= ['RMSE', 'MAE'], cv=3, verbose= True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8849  0.8879  0.8885  0.8871  0.0016  
MAE (testset)     0.6817  0.6853  0.6870  0.6846  0.0022  
Fit time          1.74    1.73    1.75    1.74    0.01    
Test time         0.17    0.23    0.23    0.21    0.03    


{'test_rmse': array([0.88494329, 0.88792157, 0.88850448]),
 'test_mae': array([0.68165385, 0.68527862, 0.68696945]),
 'fit_time': (1.7416388988494873, 1.7284278869628906, 1.7503528594970703),
 'test_time': (0.17195701599121094, 0.2278881072998047, 0.22919297218322754)}

In [27]:
type(data)

surprise.dataset.DatasetAutoFolds

In [28]:
# Split into train and test set
trainset, testset = train_test_split(data, test_size=0.2)

In [29]:
print('Type trainset : ', type(trainset),'\n')
print('Type testset :',type(testset))

Type trainset :  <class 'surprise.trainset.Trainset'> 

Type testset : <class 'list'>


In [30]:
print(len(testset))
print(testset[0])

20168
(332, 1380, 3.5)


In [31]:
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')


Number of users:  610 

Number of items:  8987 



In [32]:
# Because of fewer users than items, it is more efficient to calculate
# user-user similarity rather than item-item
sim_cos = {'name':'cosine', 'user_based':False}

In [33]:
basic = knns.KNNBasic(sim_options=sim_cos)
# Fit model
basic.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f904b309100>

In [34]:
# Similarity metrics of each of the users to one another
basic.sim

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [36]:
# Test model to determine how well it performed 
predictions = basic.test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.9831
0.9831240434212788


Results mean model is off by about 0.9736 points

In [37]:
# Pearson Correlation
sim_pearson = {'name':'pearson', 'user_based':False}
basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
basic_pearson.fit(trainset)
predictions = basic_pearson.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9743
0.9743062366448675


In [38]:
# KNN with Means. Same as basic KNN model but takes into account the mean rating
# of each user/item
sim_pearson = {'name':'pearson', 'user_based':True}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(trainset)
predictions = knn_means.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9004
0.9004280292887995


In [39]:
# KNNBaseline method - This adds in bias term that is calculated by way of
# minimizing a cost function
sim_pearson = {'name':'pearson', 'user_based':False}
knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
knn_baseline.fit(trainset)
predictions = knn_baseline.test(testset)
print(accuracy.rmse(predictions))

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8844
0.8843530892880284


KNNBaseline method had the best results with 0.8767 RMSE (user_based set to True). And RMSE: 0.8751 when user_based set to False.

In [40]:
# Matrix Factorization 
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.8954
0.8954256832665758


In [41]:
# Make Predictions
user_3_prediction = svd.predict('3', '25')
user_3_prediction

Prediction(uid='3', iid='25', r_ui=None, est=3.5014441910051075, details={'was_impossible': False})

In [42]:
user_3_prediction[3]

3.5014441910051075

In [43]:
# Retrieve movie name
def name_retriever(movie_id, movie_df):
    for movie in df:
       return df.loc[df['movieId']==movie_id]
        
   

In [44]:
# Pandas df of unique movie ID's
unique_iids = df['movieId'].drop_duplicates()

In [45]:
# Find unrated movies by a user function
def find_unrated(user, movie_df):
    movId = df.loc[df['userId'] == user, 'movieId']
    # Remove the iids that user n has rated from the list of all movie ids
    user_unrated = pd.concat([unique_iids,movId]).drop_duplicates(keep=False)
    user_unrated = user_unrated.to_frame()
    return user_unrated.head()
        
    

In [46]:
#df_explode['genre'].value_counts()
#find_unrated(3, df)

## Recommending movies for user 3 using SVD

In [47]:
# Recommending moviews for user 3 using SVD
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8f66600940>

In [48]:
unique_ids = df['movieId'].unique()

#Get list of unique ids for user 3
newdf = df.loc[df['userId'] == 4 , 'movieId']

# Remove rated movies
movies_to_predict = np.setdiff1d(unique_ids, newdf)


In [49]:
# Make prediction of unrated movie
# svd.predict(uid='3',iid='5')

In [50]:
recs = []
for iid in movies_to_predict:
    recs.append((iid, svd.predict(uid='4',iid=iid).est))
pd.DataFrame(recs,columns=['iid','predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions
256,318,4.109162
824,1204,4.086187
837,1221,4.059696
557,750,4.052607
42,50,4.039861
4684,7361,4.02847
827,1208,4.020111
612,858,4.009159
6477,58559,4.009067
819,1193,4.009047


## Recommending movies for user 3 using KNNBaseline model

In [51]:
#benchmark = []

#for algorithm in [SVD(), knns.KNNBaseline(), knns.KNNBasic(), knns.KNNWithMeans()]:
    # Cross validation 
 #   results = cross_validate(algorithm, data, measures=['RMSE'],cv=3, verbose=False)
    # Get results & append algorithm name
#    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],index=['Algoritm']))
#    benchmark.append(tmp)
    
    
#pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

In [52]:
cross_validate(knn_baseline, data, measures= ['RMSE', 'MAE'], cv=3, verbose= True)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8923  0.8800  0.8839  0.8854  0.0051  
MAE (testset)     0.6837  0.6766  0.6805  0.6802  0.0029  
Fit time          13.60   13.51   13.43   13.51   0.07    
Test time         8.26    8.38    8.29    8.31    0.05    


{'test_rmse': array([0.89225943, 0.87995467, 0.88390835]),
 'test_mae': array([0.68369103, 0.6765792 , 0.68047548]),
 'fit_time': (13.600023031234741, 13.514977216720581, 13.425991296768188),
 'test_time': (8.25627088546753, 8.383770227432251, 8.293276309967041)}

In [53]:
#name_retriever(2959, df)

# Using ALS

In [54]:
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = knns.KNNBaseline(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

Using ALS
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.88097338, 0.87908513, 0.88542507]),
 'fit_time': (0.16141080856323242, 0.17261981964111328, 0.1732959747314453),
 'test_time': (2.036731243133545, 1.8985939025878906, 2.031749963760376)}

In [73]:
# User train test split to sample a trainset and testset
algo = knns.KNNBaseline(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8779


0.8779488751640884

RMSE using ALS turned out to be 0.8779 using KNNBaseline.

ALS turned out to have the lowest RMSE out of all the other methods.

In [68]:
# Similarity metrics of each of the users to one another
algo.sim

array([[1.        , 0.32758621, 0.54320988, ..., 0.36923077, 0.2       ,
        0.45283019],
       [0.32758621, 1.        , 0.39755064, ..., 0.35087719, 0.2601626 ,
        0.46666667],
       [0.54320988, 0.39755064, 1.        , ..., 0.53658537, 0.64516129,
        0.76190476],
       ...,
       [0.36923077, 0.35087719, 0.53658537, ..., 1.        , 0.        ,
        0.2       ],
       [0.2       , 0.2601626 , 0.64516129, ..., 0.        , 1.        ,
        0.        ],
       [0.45283019, 0.46666667, 0.76190476, ..., 0.2       , 0.        ,
        1.        ]])

In [69]:
similarity = algo.sim

[[1.         0.32758621 0.54320988 ... 0.36923077 0.2        0.45283019]
 [0.32758621 1.         0.39755064 ... 0.35087719 0.2601626  0.46666667]
 [0.54320988 0.39755064 1.         ... 0.53658537 0.64516129 0.76190476]
 ...
 [0.36923077 0.35087719 0.53658537 ... 1.         0.         0.2       ]
 [0.2        0.2601626  0.64516129 ... 0.         1.         0.        ]
 [0.45283019 0.46666667 0.76190476 ... 0.2        0.         1.        ]]


In [70]:
print(similarity.shape)

(610, 610)


# Recommend top 5 suggestions

In [55]:
def top_5(user, movie_df):
    predictions = svd.test(find_unrated(user, movie_df))
    return predictions 

In [76]:
#top_5(1,df)
#average_ratings.index

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9719)

## Potential fix to cold start problem: User input

In [86]:
#fav_genres = input('What type of movies do you like: Drama, Comedy, Action, Thriller')
#print(fav_genres)
movie_name = input('Enter your favorite movie : ')                   

Enter your favorite movie : Batman


In [99]:
#3.5015681558982497
df.title.index[1]

1

In [91]:
# Create list with all movies in dataset
list_of_titles = df['title'].drop_duplicates().tolist()
#print(list_of_titles)

In [105]:
# Get index of movie user likes
def get_index_from_title(title):
    return df[df.title==title].index

In [106]:
# Finding the closest match to movie name inputed by user
# find_closest_match = difflib.get_close_matches(movie_name, list_of_titles)
# print(find_closest_match)
if search(movie_name,str(list_of_titles)):
    print("Found!")
    movie_index = get_index_from_title(movie_name)
    

Found!


In [102]:
print(movie_index)

Int64Index([], dtype='int64')


In [61]:
closest_match = find_closest_match[0]
print(closest_match)

In [62]:
# Finding index of movie with title
#movie_index = df[df.title == closest_match].index[0]
#print(movie_index)

In [63]:
# Getting a list of similar movies
#sim_score = list(enumerate(similarity[movie_index]))
#print(sim_score)

In [64]:
#length of all movies in dateset
#len(sim_score)

In [65]:
# Sorting movies based on simmularity score
# Sort movies in descending order and get second value of tuple
# sorted_similar_movies = sorted(sim_score, key= lambda x:x[1], reverse = True)
# print(sorted_similar_movies)

In [66]:
# Print the name of similar movies based on index
#print('Movies you may like : ' \n)
#i = 1
#for movie in sorted_similar_movies:
#    index = movie[0]
#    title_from_index = df[df.index]['title'].values[0]
    # Recommnend 5 movies
#    if (i<6):
#        print(i, '.', title_from_index)
#        i+=1