# ML Lab Mini Project – Movie Recommender System (Phase 2)
---
## Performing the recommendations
---
### Similarity between movies

In [1]:
# importing libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from surprise import Reader, Dataset, KNNWithMeans, accuracy
from surprise.model_selection import KFold, train_test_split

In [2]:
# reading the datasets
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags= pd.read_csv('tags.csv')

In [3]:
# general cleanup
tags.drop(['timestamp'], axis=1, inplace=True)

ratings_f = ratings.groupby('userId').filter(lambda x: len(x) > 20)
ratings.drop(['timestamp'], axis=1, inplace=True)

# splitting genres
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x))
movies['genres'] = movies['genres'].apply(lambda x: x.lower())

In [4]:
# getting a list of unique movies
movielist = ratings_f.movieId.unique().tolist()
movies = movies[movies.movieId.isin(movielist)]

In [5]:
# combining movie and tag data
mixed = pd.merge(movies, tags, on='movieId', how='left')
mixed.head()

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),adventure animation children comedy fantasy,336.0,pixar
1,1,Toy Story (1995),adventure animation children comedy fantasy,474.0,pixar
2,1,Toy Story (1995),adventure animation children comedy fantasy,567.0,fun
3,2,Jumanji (1995),adventure children fantasy,62.0,fantasy
4,2,Jumanji (1995),adventure children fantasy,62.0,magic board game


In [6]:
# combining genres and tags to get keyword metadata for each movie
mixed.fillna("", inplace=True)
mixed = pd.DataFrame(mixed.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))
final = pd.merge(movies, mixed, on='movieId', how='left')
final['keywords'] = final[['tag', 'genres']].apply(lambda x: ' '.join(x), axis = 1)
final.head()

Unnamed: 0,movieId,title,genres,tag,keywords
0,1,Toy Story (1995),adventure animation children comedy fantasy,pixar pixar fun,pixar pixar fun adventure animation children c...
1,2,Jumanji (1995),adventure children fantasy,fantasy magic board game Robin Williams game,fantasy magic board game Robin Williams game a...
2,3,Grumpier Old Men (1995),comedy romance,moldy old,moldy old comedy romance
3,4,Waiting to Exhale (1995),comedy drama romance,,comedy drama romance
4,5,Father of the Bride Part II (1995),comedy,pregnancy remake,pregnancy remake comedy


In [7]:
# using tfidf vectorisation to extract important keywords
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(final['keywords'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=final.index.tolist())
print(tfidf_df.shape)

(9724, 1675)


In [8]:
# computing cosine similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix)

In [9]:
# helper functions
def get_idx(title):
    return final[final.title.str.contains(title, case=False)].index[0]

def get_title(idx):
    return final[final.index == idx]['title'].values[0]

---
### Cold Start


#### Content-based only

In [10]:
# function to recommend top 10 movies based on similar keywords
def keyword_recommend(title):
    movie_list = movies[movies['title'].str.contains(title, case=False)]
    if len(movie_list):
        idx = get_idx(title)
        similar_movies = list(enumerate(cosine_sim[idx]))
        sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)
        
        recommend_frame = []
        for movie in sorted_similar_movies[1:11]:
            recommend_frame.append({'Title':get_title(movie[0]), 'Similarity':movie[1]})
            
        df = pd.DataFrame(recommend_frame, index=range(1, 11))
        return df
    else:
        return "No movies found."
        

In [11]:
movie = input("Enter a movie you liked: ")

Enter a movie you liked: interstellar


In [12]:
print("Top 10 movies similar to", get_title(get_idx(movie)))
keyword_recommend(movie)

Top 10 movies similar to Interstellar (2014)


Unnamed: 0,Title,Similarity
1,Primer (2004),0.651071
2,Back to the Future (1985),0.628502
3,Back to the Future Part II (1989),0.628502
4,Bill & Ted's Excellent Adventure (1989),0.628502
5,Stargate (1994),0.62136
6,Time Bandits (1981),0.602974
7,Bill & Ted's Bogus Journey (1991),0.602974
8,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),0.4238
9,The Butterfly Effect (2004),0.408501
10,Doctor Strange (2016),0.376537


---
#### Users who liked this movie also liked (popularity based)

In [13]:
# creating a sparse matrix of ratings
ratings_pivot = ratings.pivot(index='movieId',columns='userId',values='rating')
ratings_pivot.fillna(0, inplace=True)
ratings_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# counting rated movies and users who have rated movies
users_rated = ratings.groupby('movieId')['rating'].agg('count')
movies_rated = ratings.groupby('userId')['rating'].agg('count')

In [15]:
# only considering users who have rated more than 50 movies
ratings_pivot = ratings_pivot.loc[:, movies_rated[movies_rated > 50].index]

In [16]:
# Compressed Sparse Row matrix
csr_data = csr_matrix(ratings_pivot.values)
ratings_pivot.reset_index(inplace=True)

In [17]:
# k-nearest neighbors model based on cosine distance
knn = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=10, n_jobs=-1)
knn.fit(csr_data)

NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=10)

In [18]:
# function to recommend top 10 nearest neighbour movies based on similar ratings
def rating_recommend(title):
    n = 10
    movie_list = movies[movies['title'].str.contains(title, case=False)]  
    if len(movie_list):        
        movie_idx = movie_list.iloc[0]['movieId']
        movie_idx = ratings_pivot[ratings_pivot['movieId'] == movie_idx].index[0]
        distances, indices = knn.kneighbors(csr_data[movie_idx], n_neighbors=n+1)    
        rec_movies = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
        recommend_frame = []
        for val in rec_movies:
            movie_idx = ratings_pivot.iloc[val[0]]['movieId']
            idx = movies[movies['movieId'] == movie_idx].index
            recommend_frame.append({'Title':movies.iloc[idx]['title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame, index=range(1, n+1))
        df.sort_values(by=['Distance'], inplace=True)
        return df.reset_index(drop=True)
    else:
        return "No movies found."

In [19]:
movie = input("Enter a movie you liked: ")

Enter a movie you liked: interstellar


In [20]:
print("Top 10 movies liked by users who liked", get_title(get_idx(movie)))
rating_recommend(movie)

Top 10 movies liked by users who liked Interstellar (2014)


Unnamed: 0,Title,Distance
0,Everything or Nothing: The Untold Story of 007...,0.326071
1,Joyful Noise (2012),0.338903
2,Bad Words (2013),0.35678
3,Ramona and Beezus (2010),0.376187
4,Footloose (2011),0.38682
5,Nick Fury: Agent of S.H.I.E.L.D. (1998),0.390902
6,Song of the Sea (2014),0.391242
7,No Way Jose (2015),0.398055
8,Untitled Spider-Man Reboot (2017),0.409325
9,Delirium (2014),0.416786


---
### Returning Users


#### User and Item Based

In [21]:
# KNN based collaborative filtering algorithm
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# cross-validation split
kf = KFold(n_splits=5, random_state=42)
kf.split(data)

# train-test-split (70/30)
trainset, testset = train_test_split(data, test_size=0.3, random_state=42)

# building the model with training data
algo = KNNWithMeans(k=10, sim_options={'name':'cosine', 'user_based':True})
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f9cae92dd30>

In [22]:
# checking the RMSE of the predicted test values
preds = algo.test(testset)
accuracy.rmse(preds, verbose=True)

RMSE: 0.9192


0.9191793752525242

In [23]:
# function for hybrid recommendation for returning users (taking rating history and tag similarity into account)
def hybrid(userId, title):
    
    idx = get_idx(title)
    movieId = final.loc[final['title'].str.contains(title, case=False), 'movieId'].iloc[0]
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    
    rec = final.iloc[movie_indices][['title', 'movieId']]
    rec['est'] = rec['movieId'].apply(lambda x: algo.predict(userId, final.loc[final['movieId'] == x, 'movieId'].iloc[0]).est)
    rec = rec.sort_values('est', ascending=False)
    rec.columns = ['Title', 'Movie Id', 'Estimated Rating']
    rec.drop('Movie Id', axis=1, inplace=True)
    return rec.reset_index(drop=True).head(10)

In [24]:
user = input("Enter your user ID: ")

Enter your user ID: 25


In [25]:
movie = input("Enter a movie you liked: ")

Enter a movie you liked: interstellar


In [26]:
print("Top 10 personalised recommendations for User", user)
hybrid(int(user), movie)

Top 10 personalised recommendations for User 25


Unnamed: 0,Title,Estimated Rating
0,Doctor Strange (2016),5.0
1,"Boy and His Dog, A (1975)",5.0
2,Bill & Ted's Excellent Adventure (1989),5.0
3,Stargate (1994),5.0
4,Time Bandits (1981),5.0
5,Back to the Future (1985),5.0
6,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),5.0
7,Battle Beyond the Stars (1980),4.983151
8,20 Million Miles to Earth (1957),4.84375
9,Android (1982),4.84375


---
## Inference
<br>
The overall accuracy of the predictions cannot be tested until the users themselves give feedback about them. <br>
Personally experimenting with different movies, the hybrid model performs most satisfactorily. The solely content-based recommendations are also quite accurate, while the recommendations based on only popularity seem the least accurate.