## Recommendation Engine using KNN

In [1]:
#adding necessary imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from plotly.offline import plot
from plotly.graph_objs import *
import datetime
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import stats, linalg

In [2]:
#reading user data, data about ratings and movie info data
user = pd.read_csv('data/u.user', delimiter='|', header=None, names=['userid', 'age', 'gender', 'occupation', 'zipcode'])
rating_data = pd.read_csv('data/u.data', delimiter='\t', header=None, names=['userid', 'itemid', 'rating', 'timestamp'])
movies = pd.read_csv('data/u.item', delimiter='|', encoding='ISO-8859-1', header=None, usecols=[0, 1, 5, 6, 
                                                                                                7, 8, 9, 10, 
                                                                                                11, 12, 13, 
                                                                                                14, 15, 16, 
                                                                                                17, 18, 19, 20, 
                                                                                                21, 22, 23])


## adding a genre column containing genres for each movie

In [3]:
#changing the column names of movie dataframe
movies.columns=['itemid', 'movie', 'unknown', 'Action' , 'Adventure' , 'Animation', "Children's",
                'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
m = movies['movie']
genre_list=[]
genre_dict={}
genres = movies.columns[2:]      #creating a list of all genres
#for each movie creating a list of all it's genres(genre_list)
#genre_dict contains movie id as key and it's list of genres as value
for i in range(len(movies)):
    for g in genres:
        if movies.ix[i, :][g] != 0:
            genre_list.append(g)
                 
    genre_dict[i+1] = genre_list
    genre_list = []
g = pd.DataFrame({'genre': genre_dict})
#adding the genre column to the movies dataframe
movies = movies.join(g, on='itemid')

In [4]:
#table conatining rating by each user for each movie
rating_table = pd.pivot_table(rating_data, columns = 'itemid', index='userid', values='rating')

In [5]:
rating_table=rating_table.fillna(0)
mean_rating_users = rating_table.mean(axis = 1)    #series containing avg rating given by each user

In [6]:
n_users = rating_data.userid.unique().size
n_items = rating_data.itemid.unique().size
#storing the rating data into an array
data_matrix = np.zeros((n_users, n_items))
for i in rating_data.itertuples():
    data_matrix[i[1]-1, i[2]-1] = i[3]

## user-user and item-item similarity matrix

In [7]:
user_similarity = 1-pairwise_distances(data_matrix, metric='cosine') #user-user similarity
item_similarity = 1-pairwise_distances(data_matrix.T, metric='cosine')  #item-item similarity

## finding k-nearest neighbors 

In [8]:
def knn(similarity, k):
    top_ten_similarity = []
    for index, row in enumerate(similarity):
        similarity_grouped = list(zip(range(len(row)), row))#list containing similarity with all users for a particular user
        top_ten = sorted(similarity_grouped, key=lambda l: l[1], reverse=True)
        ids, similarity_score = zip(*top_ten)
        top_ten_similarity.append(list(ids[:k+1]))
    return np.array(top_ten_similarity)


In [9]:
user_k_similar = knn(user_similarity, k=10)
item_k_similar = knn(item_similarity, k=10)

In [14]:
a = [i[1:]for i in item_k_similar]#first element will be the user itself so slicing from 1 to end
b = [i[1:]for i in user_k_similar]#first element will be the item itself so slicing from 1 to end
#creating a dataframe to store similar items/users for each item/user
items_similar_df = pd.DataFrame({'itemid':range(len(item_k_similar)),
                                 'similar_itemid':a})
user_similar_df = pd.DataFrame({'userid':range(len(user_k_similar)),
                                 'similar_userid':b})


## predicting ratings for all the movies user hasn't watched

In [11]:
def predictions(u, user_data, type):
    user_data = user_data.reshape((1, len(user_data)))
    u = u-1 #since we are considering indexes
    sum1 = 0
    sim = 0
    pred = np.zeros(user_data.shape)
    if type=='user':
        for i in range(user_data.shape[1]):
            sum1 = 0
            sim = 0
            if user_data[u, i] == 0:
                similar_users = np.array(user_similar_df[user_similar_df.userid == u]['similar_userid'])[0]#array containing similar users
                
                for v in range(len(similar_users)):
                      neigh = similar_users[v]
                      similar = user_similarity[u, neigh]#similarity between users and it's neighbors
                      sum1 +=  similar * (rating_table.ix[neigh+1, i+1]-mean_rating_users[neigh+1])
                      sim += similar
                      pred[u, i]=mean_rating_users[u+1]+(sum1/sim)
        return pred


## generating movie predictions for a user

In [12]:
def movie_predictions(userid):
    user_data = data_matrix[userid-1, :]  #user_data has rating data for one particular user
    prediction_matrix = predictions(userid, user_data, 'user')
    movie_predict_user1 = prediction_matrix.nonzero()
    movie_genre = []
    movieid = movie_predict_user1[1]#movie id's of movies for which we've predicted
    rating_predicted = prediction_matrix[0][movieid]
    movienames = [movies['itemid'][i] for i in movieid]
    moviegenre = [movies['genre'][i] for i in movieid]
    #dataframe containing info about predicted movies
    prediction_df = pd.DataFrame({'movie':movienames, 'rating_predicted':rating_predicted,
                                  'genre': moviegenre}, index = movieid)
    #sorting the movies by their ratings
    prediction_df.sort_values(by='rating_predicted', ascending=False, inplace=True)
    #returning only the top 10 movie recommendations
    return prediction_df[:10]


In [13]:
movie_predictions(1)

Unnamed: 0,genre,movie,rating_predicted
317,"[Drama, War]",318,3.951406
473,"[Sci-Fi, War]",474,3.767339
654,"[Adventure, Comedy, Drama]",655,3.558491
422,"[Children's, Drama, Fantasy, Sci-Fi]",423,3.553778
402,"[Action, Adventure, Crime, Drama]",403,3.540861
356,[Drama],357,3.532281
432,[Comedy],433,3.46595
384,"[Action, Adventure, Comedy, Romance]",385,3.349106
567,"[Action, Romance, Thriller]",568,3.265209
469,[Western],470,3.154088
