### Load lib and import data

In [55]:
import numpy as np
import pandas as pd
import sklearn

# read the rating files
r_cols = ['user_id','movie_id','rating','unix_timestamp'] # col names 
ratings = pd.read_csv('ml-100k/u.data',sep ='\t',names =r_cols,encoding='latin-1')
ratings # loading data to csv file and encoding it using latin-1

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [56]:
# to find the unique users and items(movie)
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

In [57]:
print("The number of user:", n_users)
print("The number of items:",n_items)

The number of user: 943
The number of items: 1682


### Creating pivot table for user and item  

In [58]:
pivot_tab = ratings.pivot_table(index ='user_id',columns='movie_id',values='rating')
pivot_tab

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [59]:
pivot_mat = pivot_tab.replace(np.nan,0)
pivot_mat

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Find Cosine similarity for user id and movie id


In [60]:
import sklearn
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(pivot_mat,metric='cosine')
item_similarity = pairwise_distances(pivot_mat.T,metric='cosine')

In [61]:
item_similarity

array([[2.22044605e-16, 5.97617822e-01, 6.69755213e-01, ...,
        1.00000000e+00, 9.52816933e-01, 9.52816933e-01],
       [5.97617822e-01, 2.22044605e-16, 7.26930825e-01, ...,
        1.00000000e+00, 9.21700637e-01, 9.21700637e-01],
       [6.69755213e-01, 7.26930825e-01, 0.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 9.03124947e-01],
       ...,
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [9.52816933e-01, 9.21700637e-01, 1.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [9.52816933e-01, 9.21700637e-01, 9.03124947e-01, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

### Using formula for user and item we are calcuating the score value 

In [62]:
# creating func for score
def predict(ratings,similarity,type='user'):
    if type == 'user':
        
        mean_user_rating= ratings.mean(axis =1) # avg ratings
        to_get_same_format_mean_rating = mean_user_rating.values[:,np.newaxis] # to get the average ratings output in the same format
        ratings_diff = (ratings - to_get_same_format_mean_rating) # to find the difference for rating and same format mean rating
        # formula to findd the similarity score
        # for user T the output value of formula is important
        pred = to_get_same_format_mean_rating + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis=1)]).T 
        
    elif type == 'item': # item based no transpose
        pred = ratings.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
        
    return pred

In [63]:
item_prediction = predict(pivot_mat, item_similarity, type='item')
item_prediction

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.446278,0.475473,0.505938,0.443633,0.512667,0.547939,0.446243,0.463059,0.474916,0.515821,...,0.580579,0.576202,0.582478,0.582478,0.575717,0.588155,0.588155,0.588155,0.573107,0.566696
2,0.108544,0.132957,0.125589,0.124932,0.131178,0.129005,0.110883,0.122223,0.109599,0.121525,...,0.135490,0.136546,0.134829,0.134829,0.134108,0.134458,0.134458,0.134458,0.136576,0.137111
3,0.085685,0.091690,0.087643,0.089966,0.089658,0.089985,0.083492,0.089725,0.085188,0.088331,...,0.089770,0.090506,0.086261,0.086261,0.089201,0.084659,0.084659,0.084659,0.089768,0.090845
4,0.053693,0.059604,0.058114,0.058364,0.059356,0.061472,0.053374,0.058615,0.055905,0.060601,...,0.061349,0.061686,0.061195,0.061195,0.060693,0.057937,0.057937,0.057937,0.061673,0.062281
5,0.224739,0.229171,0.263280,0.226387,0.259973,0.296529,0.232710,0.237109,0.258581,0.275076,...,0.297628,0.295990,0.299922,0.299922,0.298188,0.302051,0.302051,0.302051,0.293373,0.294309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.092574,0.113870,0.110211,0.112040,0.112768,0.123140,0.098578,0.110839,0.098858,0.118579,...,0.123829,0.124430,0.120776,0.120776,0.121360,0.125056,0.125056,0.125056,0.123470,0.124327
940,0.164358,0.184894,0.196502,0.164884,0.195860,0.209652,0.162840,0.165606,0.171761,0.194536,...,0.217536,0.215515,0.219136,0.219136,0.216173,0.218583,0.218583,0.218583,0.216582,0.216819
941,0.032300,0.045024,0.042924,0.043223,0.047493,0.051077,0.032761,0.042646,0.039399,0.047421,...,0.052762,0.053042,0.052692,0.052692,0.051514,0.053028,0.053028,0.053028,0.051910,0.052280
942,0.157779,0.174095,0.189000,0.163514,0.186140,0.194151,0.164910,0.156970,0.167038,0.181295,...,0.197537,0.194479,0.198479,0.198479,0.197969,0.199793,0.199793,0.199793,0.197394,0.200031


### As per item based filtering ,first have to find similarity between the input user and others

In [101]:
# input user detail
input_item = 90

In [102]:
# converting item sim table into df
# to change the item sim pivot table into item_sim_table
item_similarity_table = pd.DataFrame(item_similarity)
item_similarity_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,2.220446e-16,5.976178e-01,0.669755,5.450621e-01,0.713286,0.883656,0.379021,0.518886,0.503712,0.726065,...,0.964613,1.0,1.000000,1.000000,0.964613,1.0,1.0,1.0,0.952817,0.952817
1,5.976178e-01,2.220446e-16,0.726931,4.974292e-01,0.681164,0.916437,0.616597,0.662998,0.744748,0.828918,...,1.000000,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.0,0.921701,0.921701
2,6.697552e-01,7.269308e-01,0.000000,6.751336e-01,0.787043,0.893278,0.627079,0.799206,0.726331,0.841896,...,1.000000,1.0,1.000000,1.000000,0.967708,1.0,1.0,1.0,1.000000,0.903125
3,5.450621e-01,4.974292e-01,0.675134,1.110223e-16,0.665761,0.909692,0.510717,0.509764,0.580956,0.747439,...,1.000000,1.0,0.905978,0.905978,0.962391,1.0,1.0,1.0,0.943587,0.924782
4,7.132865e-01,6.811638e-01,0.787043,6.657605e-01,0.000000,0.962701,0.665231,0.740839,0.727552,0.944547,...,1.000000,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,0.905789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1.000000e+00,1.000000e+00,1.000000,1.000000e+00,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.0,1.000000,1.000000,1.000000,0.0,0.0,0.0,1.000000,1.000000
1678,1.000000e+00,1.000000e+00,1.000000,1.000000e+00,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.0,1.000000,1.000000,1.000000,0.0,0.0,0.0,1.000000,1.000000
1679,1.000000e+00,1.000000e+00,1.000000,1.000000e+00,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.0,1.000000,1.000000,1.000000,0.0,0.0,0.0,1.000000,1.000000
1680,9.528169e-01,9.217006e-01,1.000000,9.435867e-01,1.000000,1.000000,0.948502,0.917967,0.942640,1.000000,...,1.000000,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.0,0.000000,1.000000


In [103]:
# to find the similar user taste based on the similarity table
similar_item_input = item_similarity_table[input_item].sort_values(ascending=True).head(5).index 
# here the input user is 10, for the input user 10 need to find any other similar user are there. 

In [104]:
# similar_item_input=item_similarity_table.iloc[[10,55,21]]
# similar_item_input
# this code will convert the output of similar user input to list
similar_item_input = list(similar_item_input)
similar_item_input

[90, 587, 431, 70, 167]

In [105]:
# to retrive the similar user watched movie id
similar_item_userid_list=[]
for similar_item in similar_item_input: # from similar user input
    similar = list(ratings[ratings['movie_id'] ==similar_item]['user_id']) #ratings contains all the data (ie)df, in df user id == movie_id
    similar_item_userid_list.append(similar)

In [106]:
# to get the lenght of similar user movie_id list
len(similar_item_userid_list)

5

In [107]:
# contains the similar users watched movie list
similar_item_userid_list

[[1,
  314,
  109,
  23,
  13,
  130,
  11,
  7,
  327,
  244,
  5,
  254,
  279,
  95,
  379,
  320,
  405,
  56,
  87,
  303,
  194,
  436,
  361,
  280,
  435,
  286,
  270,
  389,
  94,
  393,
  178,
  336,
  373,
  49,
  521,
  537,
  343,
  44,
  472,
  437,
  291,
  363,
  429,
  495,
  551,
  504,
  450,
  262,
  477,
  690,
  766,
  297,
  622,
  642,
  805,
  715,
  846,
  844,
  506,
  416,
  125,
  417,
  660,
  497,
  823,
  870,
  880,
  476,
  798,
  618,
  659,
  868,
  727,
  326,
  648,
  881,
  222,
  773,
  301,
  892,
  59,
  542,
  394,
  541,
  453,
  751,
  790,
  883,
  712,
  887,
  904,
  916,
  442,
  399,
  806],
 [92, 201, 312, 7, 380, 435, 399, 758, 514, 559, 429, 551, 94, 896],
 [70,
  135,
  180,
  307,
  315,
  197,
  194,
  92,
  327,
  200,
  328,
  77,
  311,
  339,
  249,
  320,
  7,
  59,
  393,
  234,
  178,
  387,
  435,
  429,
  115,
  280,
  13,
  145,
  94,
  417,
  363,
  346,
  62,
  405,
  493,
  97,
  406,
  456,
  22,
  399,
  64,
  472,

In [108]:
#6. Convert all the list as single
import itertools
similar_item_userid_single_list=list(itertools.chain.from_iterable(similar_item_userid_list)) # this converts the movieid into single list

In [109]:
len(similar_item_userid_single_list)
# similar_user_movieid_single_list

580

In [110]:
unique_userid_similar_item = set(similar_item_userid_single_list) # to remove the similar movieid's 
len(unique_userid_similar_item)

345

In [111]:
# to get the input user watched movie list
input_item_watched_userid = list(ratings[ratings['movie_id']==input_item]['user_id'])
len(input_item_watched_userid)
input_item_watched_userid

[1,
 314,
 109,
 23,
 13,
 130,
 11,
 7,
 327,
 244,
 5,
 254,
 279,
 95,
 379,
 320,
 405,
 56,
 87,
 303,
 194,
 436,
 361,
 280,
 435,
 286,
 270,
 389,
 94,
 393,
 178,
 336,
 373,
 49,
 521,
 537,
 343,
 44,
 472,
 437,
 291,
 363,
 429,
 495,
 551,
 504,
 450,
 262,
 477,
 690,
 766,
 297,
 622,
 642,
 805,
 715,
 846,
 844,
 506,
 416,
 125,
 417,
 660,
 497,
 823,
 870,
 880,
 476,
 798,
 618,
 659,
 868,
 727,
 326,
 648,
 881,
 222,
 773,
 301,
 892,
 59,
 542,
 394,
 541,
 453,
 751,
 790,
 883,
 712,
 887,
 904,
 916,
 442,
 399,
 806]

In [112]:
# to create recommendation
recommend =[]
for per_id in unique_userid_similar_item: # unique movie similar user, this remove the movie id which is watched by user and similar user
    if(per_id in input_item_watched_userid): 
        pass
    else:
        recommend.append(per_id) # if the unique movie similar user watched input user watched movie pass else append in recommend

In [113]:
item_prediction

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.446278,0.475473,0.505938,0.443633,0.512667,0.547939,0.446243,0.463059,0.474916,0.515821,...,0.580579,0.576202,0.582478,0.582478,0.575717,0.588155,0.588155,0.588155,0.573107,0.566696
2,0.108544,0.132957,0.125589,0.124932,0.131178,0.129005,0.110883,0.122223,0.109599,0.121525,...,0.135490,0.136546,0.134829,0.134829,0.134108,0.134458,0.134458,0.134458,0.136576,0.137111
3,0.085685,0.091690,0.087643,0.089966,0.089658,0.089985,0.083492,0.089725,0.085188,0.088331,...,0.089770,0.090506,0.086261,0.086261,0.089201,0.084659,0.084659,0.084659,0.089768,0.090845
4,0.053693,0.059604,0.058114,0.058364,0.059356,0.061472,0.053374,0.058615,0.055905,0.060601,...,0.061349,0.061686,0.061195,0.061195,0.060693,0.057937,0.057937,0.057937,0.061673,0.062281
5,0.224739,0.229171,0.263280,0.226387,0.259973,0.296529,0.232710,0.237109,0.258581,0.275076,...,0.297628,0.295990,0.299922,0.299922,0.298188,0.302051,0.302051,0.302051,0.293373,0.294309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.092574,0.113870,0.110211,0.112040,0.112768,0.123140,0.098578,0.110839,0.098858,0.118579,...,0.123829,0.124430,0.120776,0.120776,0.121360,0.125056,0.125056,0.125056,0.123470,0.124327
940,0.164358,0.184894,0.196502,0.164884,0.195860,0.209652,0.162840,0.165606,0.171761,0.194536,...,0.217536,0.215515,0.219136,0.219136,0.216173,0.218583,0.218583,0.218583,0.216582,0.216819
941,0.032300,0.045024,0.042924,0.043223,0.047493,0.051077,0.032761,0.042646,0.039399,0.047421,...,0.052762,0.053042,0.052692,0.052692,0.051514,0.053028,0.053028,0.053028,0.051910,0.052280
942,0.157779,0.174095,0.189000,0.163514,0.186140,0.194151,0.164910,0.156970,0.167038,0.181295,...,0.197537,0.194479,0.198479,0.198479,0.197969,0.199793,0.199793,0.199793,0.197394,0.200031


In [114]:
# to convert the user_prediction value to user_prediction table
item_pred = pd.DataFrame(item_prediction)
item_pred_Trans =  item_pred.T


In [115]:
# From recomd list select hightest rated film which would like by the user. Based on User prediction
highest_Rated=[] 
input_item_pre=pd.DataFrame(item_pred[input_item]) # no transpose, here input user pre is getting create
input_item_pred=input_item_pre.T # need to transpose the input_user_pre to get an transpose for respective input
for re in recommend:  
    value=input_item_pred[re].values # here if the recommend value is greater than or equal to 1 than append highest rated.
    if(value>=1):
        highest_Rated.append(re)

In [116]:
highest_Rated

[655]

In [117]:
def itembased(input_user,item_similarity,item_predictions,similar_item_count,thres):
    
    #Convert the user_sim table into DataFrame
    item_sim_table=pd.DataFrame(item_similarity)
    
    # to find the similar user taste based on the similarity table
    similar_item_input = item_similarity_table[input_item].sort_values(ascending=True).head(5).index 
    # here the input user is 10, for the input user 10 need to find any other similar user are there. 
    
    # this code will convert the output of similar user input to list
    similar_item_input = list(similar_item_input)
#     similar_item_input 
    
        # to retrive the similar user watched movie id
    similar_item_userid_list=[]
    for similar_item in similar_item_input: # from similar user input
        similar = list(ratings[ratings['movie_id'] ==similar_item]['user_id']) #ratings contains all the data (ie)df, in df user id == movie_id
        similar_item_userid_list.append(similar)
        
    #Converting as a whole list
    #6. Convert all the list as single
    import itertools
    similar_item_userid_single_list=list(itertools.chain.from_iterable(similar_item_userid_list)) # this converts the movieid into single list

    unique_userid_similar_item = set(similar_item_userid_single_list) # to remove the similar movieid's 
    
    # to get the input user watched movie list
    input_item_watched_userid = list(ratings[ratings['movie_id']==input_item]['user_id'])
    
    # to create recommendation
    recommend =[]
    for per_id in unique_userid_similar_item: # unique movie similar user, this remove the movie id which is watched by user and similar user
        if(per_id in input_item_watched_userid): 
            pass
        else:
            recommend.append(per_id) # if the unique movie similar user watched input user watched movie pass else append in recommend
            
    # From recomd list select hightest rated film which would like by the user. Based on User prediction
    highest_Rated=[] 
    input_item_pre=pd.DataFrame(item_pred[input_item]) # no transpose, here input user pre is getting create
    input_item_pred=input_item_pre.T # need to transpose the input_user_pre to get an transpose for respective input
    for re in recommend:  
        value=input_item_pred[re].values # here if the recommend value is greater than or equal to 1 than append highest rated.
        if(value>=1):
            highest_Rated.append(re)
    
    highest_Rated
    return highest_Rated

In [119]:
Recommended_user=itembased(90,item_similarity,item_pred,4,0.5)

In [120]:
len(Recommended_user)

1

In [121]:
Recommended_user

[655]