<a href="https://colab.research.google.com/github/avyaktawrat/Evaluat-inator/blob/master/KNN_based_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
import math
import os
import time

# **Loading Dataset**

In [0]:
movies = pd.read_csv('https://raw.githubusercontent.com/avyaktawrat/Evaluat-inator/master/data/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('https://raw.githubusercontent.com/avyaktawrat/Evaluat-inator/master/data/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('https://raw.githubusercontent.com/avyaktawrat/Evaluat-inator/master/data/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [0]:
movies.columns = ['movieID', 'title', 'genres']
users.columns = ['userId', 'gender', 'zipcode', 'age_desc', 'occ_desc']

ratings.columns = ['userId', 'movieId', 'rating', 'timestamp']

# **Cleaning data-set**

In [0]:
#counting number of raitngs in a given movieID
df_movies_cnt = pd.DataFrame(ratings.groupby('movieId').size(), columns=['count'])
df_movies_cnt.head()

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,2077
2,701
3,478
4,170
5,296


In [0]:
df_movies_cnt['count'].quantile(np.arange(1, 0.2, -0.05))

1.00    3428.00
0.95    1051.50
0.90     729.50
0.85     553.25
0.80     429.00
0.75     350.00
0.70     280.00
0.65     228.00
0.60     188.00
0.55     154.00
0.50     123.50
0.45      97.00
0.40      74.00
0.35      58.00
0.30      44.00
0.25      33.00
Name: count, dtype: float64

In [0]:
#removing movies which have number of ratings less than 50
#and creating list of popular movies and subsequently creating dataframe
#with ratings of movies which are in popular movies list
popularity_thres = 50
popular_movies = list(set(df_movies_cnt.query('count >= @popularity_thres').index))
df_ratings_drop_movies = ratings[ratings.movieId.isin(popular_movies)]
print('shape of original ratings data: ', ratings.shape)
print('shape of ratings data after dropping unpopular movies: ', df_ratings_drop_movies.shape)

shape of original ratings data:  (1000209, 4)
shape of ratings data after dropping unpopular movies:  (977839, 4)


We removed those movies which had less than 50 ratings. No major change in shape occur and it will increase KNN performance.

In [0]:
# get number of ratings given by every user
df_users_cnt = pd.DataFrame(df_ratings_drop_movies.groupby('userId').size(), columns=['count'])
df_users_cnt.head()

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
1,53
2,128
3,51
4,21
5,192


In [0]:
# filter data
#drop ratings given by non-active users i.e. those who have given less than 50 ratings
#further creating data-frame with the users that are active and have given ratings to popular movies
ratings_thres = 50
active_users = list(set(df_users_cnt.query('count >= @ratings_thres').index))
df_ratings_drop_users = df_ratings_drop_movies[df_ratings_drop_movies.userId.isin(active_users)]
print('shape of original ratings data: ', ratings.shape)
print('shape of ratings data after dropping both unpopular movies and inactive users: ', df_ratings_drop_users.shape)

shape of original ratings data:  (1000209, 4)
shape of ratings data after dropping both unpopular movies and inactive users:  (920334, 4)


Preparing input for KNN, i.e. user-movies matrix with ratings as values

In [0]:
# pivot and create movie-user matrix
#creating user movies matrix with ratings as value
movie_user_mat = df_ratings_drop_users.pivot(index='movieId', columns='userId', values='rating').fillna(0)

In [0]:
# create mapper from movie title to index
movie_to_idx = {
    movie: i for i, movie in 
    enumerate(list(movies.set_index('movieID').loc[movie_user_mat.index].title))}

# transform matrix to scipy sparse matrix
# it a format where operation as fairly faster than nomal dataframe 
from scipy.sparse import csr_matrix

movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [0]:
movie_user_mat.head()

userId,1,2,3,5,6,8,9,10,11,13,15,17,18,19,22,23,24,25,26,27,28,29,31,33,34,35,36,37,38,39,40,42,44,45,48,49,52,53,56,57,...,5989,5990,5991,5992,5994,5995,5996,5997,5998,6000,6001,6002,6003,6004,6005,6006,6007,6009,6010,6011,6013,6014,6015,6016,6018,6019,6021,6023,6024,6025,6026,6030,6031,6032,6033,6035,6036,6037,6039,6040
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,0.0,0.0,0.0,4.0,4.0,5.0,5.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,4.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,5.0,4.0,4.0,5.0,0.0,0.0,5.0,0.0,...,5.0,0.0,4.0,0.0,0.0,5.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,5.0,5.0,5.0,0.0,5.0,4.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,3.0,0.0,0.0,5.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# **Training the model**

In [0]:
%env JOBLIB_TEMP_FOLDER=/tmp
# define model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
# fit
model_knn.fit(movie_user_mat_sparse)

env: JOBLIB_TEMP_FOLDER=/tmp


NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [0]:
def fuzzy_matching(mapper, fav_movie, verbose=True):
    '''
    mapper is a dictionary that has movies mapped to index 
    this function finds index of the fav movie using fuzz ratio
    and returns the idx of movie which is most similar
    
    retuns 
    -------
    index
    '''
    match_tuple = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]


In [0]:
def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    creates list of distances movies and fav movie using KNN model 
    using cosine as evaluation method for distance.
    mapper = maps movie to index 
    
    Return
    ------
    list of top n similar movie recommendations
    """
    # fit
    model_knn.fit(data)
    
    # get input movie index
    print('You have input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    
    # inference
    print('Recommendation system start to make inference')
    print('......\n')
    
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    
    # get list of raw idx of recommendations
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [0]:
my_favorite = 'Toy Story'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    fav_movie=my_favorite,
    mapper=movie_to_idx,
    n_recommendations=10)

You have input movie: Toy Story
Found possible matches in our database: ['Toy Story (1995)', 'Toy Story 2 (1999)']

Recommendation system start to make inference
......

Recommendations for Toy Story:
1: Matrix, The (1999), with distance of 0.4123492619057725
2: Forrest Gump (1994), with distance of 0.4108584380956405
3: Star Wars: Episode IV - A New Hope (1977), with distance of 0.4084118525153517
4: Star Wars: Episode V - The Empire Strikes Back (1980), with distance of 0.4028148093017353
5: Men in Black (1997), with distance of 0.40242537183697824
6: Back to the Future (1985), with distance of 0.39037759481459866
7: Bug's Life, A (1998), with distance of 0.3800330458730503
8: Aladdin (1992), with distance of 0.37333047302006495
9: Groundhog Day (1993), with distance of 0.365947628722605
10: Toy Story 2 (1999), with distance of 0.3447502102097404
