### Load Libraries

In [1]:
# import libs
import numpy as np
import pandas as pd

### Load dataset

In [2]:
# read the rating files
r_cols = ['user_id','movie_id','rating','unix_timestamp'] # col names 
ratings = pd.read_csv('ml-100k/u.data',sep ='\t',names =r_cols,encoding='latin-1')
ratings # loading data to csv file and encoding it using latin-1

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [3]:
# to find the unique user and movie id
n_users = ratings.user_id.unique().shape[0] # to get a count of row .shape[0]
n_movitems = ratings.movie_id.unique().shape[0]

In [4]:
print("The number unique users:",n_users)
print("The number of unique Movie_items:",n_movitems)

The number unique users: 943
The number of unique Movie_items: 1682


### Creating pivot table for user and movie based on ratings

In [5]:
datamat = ratings.pivot_table(index='user_id',columns='movie_id',values='rating')
datamat

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# replacing NAN with 0
data_matrix = datamat.replace(np.nan,0)
data_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Find Cosine similarity for user id and movie id

In [7]:
# !pip install scikit-learn

In [8]:
import sklearn
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(data_matrix,metric='cosine')
item_similarity = pairwise_distances(data_matrix.T,metric='cosine')

In [9]:
user_similarity

array([[1.33226763e-15, 8.33069016e-01, 9.52540457e-01, ...,
        8.51383057e-01, 8.20492117e-01, 6.01825261e-01],
       [8.33069016e-01, 0.00000000e+00, 8.89408675e-01, ...,
        8.38515222e-01, 8.27732187e-01, 8.94202122e-01],
       [9.52540457e-01, 8.89408675e-01, 0.00000000e+00, ...,
        8.98757435e-01, 8.66583851e-01, 9.73444131e-01],
       ...,
       [8.51383057e-01, 8.38515222e-01, 8.98757435e-01, ...,
        0.00000000e+00, 8.98358201e-01, 9.04880419e-01],
       [8.20492117e-01, 8.27732187e-01, 8.66583851e-01, ...,
        8.98358201e-01, 0.00000000e+00, 8.17535338e-01],
       [6.01825261e-01, 8.94202122e-01, 9.73444131e-01, ...,
        9.04880419e-01, 8.17535338e-01, 0.00000000e+00]])

In [10]:
item_similarity

array([[2.22044605e-16, 5.97617822e-01, 6.69755213e-01, ...,
        1.00000000e+00, 9.52816933e-01, 9.52816933e-01],
       [5.97617822e-01, 2.22044605e-16, 7.26930825e-01, ...,
        1.00000000e+00, 9.21700637e-01, 9.21700637e-01],
       [6.69755213e-01, 7.26930825e-01, 0.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 9.03124947e-01],
       ...,
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [9.52816933e-01, 9.21700637e-01, 1.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [9.52816933e-01, 9.21700637e-01, 9.03124947e-01, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

### Using formula for user and item we are calcuating the score value 

In [11]:
# creating func for score
def predict(ratings,similarity,type='user'):
    if type == 'user':
        
        mean_user_rating= ratings.mean(axis =1) # avg ratings
        to_get_same_format_mean_rating = mean_user_rating.values[:,np.newaxis] # to get the average ratings output in the same format
        ratings_diff = (ratings - to_get_same_format_mean_rating) # to find the difference for rating and same format mean rating
        # formula to findd the similarity score
        # for user T the output value of formula is important
        pred = to_get_same_format_mean_rating + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis=1)]).T 
        
    elif type == 'item': # item based no transpose
        pred = ratings.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
        
    return pred

In [12]:
# rating_avg = ratings.mean(axis =1)
# rating_avg

In [13]:
# end = rating_avg[:np.newaxis]
# # end
# diff = ratings-end
# diff

In [14]:
user_prediction = predict(data_matrix, user_similarity, type='user')
print("User Prediction:",user_prediction)
item_prediction = predict(data_matrix, item_similarity, type='item')
# print("Item Prediction:",item_prediction)


User Prediction: [[ 2.06532606  0.73430275  0.62992381 ...  0.39359041  0.39304874
   0.3927712 ]
 [ 1.76308836  0.38404019  0.19617889 ... -0.08837789 -0.0869183
  -0.08671183]
 [ 1.79590398  0.32904733  0.15882885 ... -0.13699223 -0.13496852
  -0.13476488]
 ...
 [ 1.59151513  0.27526889  0.10219534 ... -0.16735162 -0.16657451
  -0.16641377]
 [ 1.81036267  0.40479877  0.27545013 ... -0.00907358 -0.00846587
  -0.00804858]
 [ 1.8384313   0.47964837  0.38496292 ...  0.14686675  0.14629808
   0.14641455]]


### As per User based filtering ,first have to find similarity between the input user and others

In [15]:
# input user detail
input_user = 10

In [16]:
# to change the user sim pivot table into user_sim_table
user_similarity_table = pd.DataFrame(user_similarity)
user_similarity_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,1.332268e-15,0.833069,0.952540,0.935642,0.621525,0.569761,0.559633,0.680928,0.921862,0.623456,...,0.630473,0.880518,0.725124,0.810295,0.802674,0.881905,0.685928,0.851383,0.820492,0.601825
1,8.330690e-01,0.000000,0.889409,0.821879,0.927021,0.754157,0.892672,0.896656,0.838952,0.840138,...,0.843014,0.692058,0.641211,0.575954,0.680111,0.771417,0.773210,0.838515,0.827732,0.894202
2,9.525405e-01,0.889409,0.000000,0.655849,0.978755,0.927585,0.933863,0.916940,0.938960,0.934849,...,0.968125,0.957247,0.836171,0.930962,0.875755,0.973729,0.838110,0.898757,0.866584,0.973444
3,9.356422e-01,0.821879,0.655849,0.000000,0.968196,0.931956,0.908770,0.811940,0.898716,0.939141,...,0.947893,0.963216,0.866885,0.806529,0.853942,0.969862,0.803142,0.847959,0.829914,0.941248
4,6.215248e-01,0.927021,0.978755,0.968196,0.000000,0.762714,0.626400,0.751070,0.943153,0.798573,...,0.661206,0.919420,0.905076,0.920221,0.851393,0.928541,0.760045,0.860405,0.847503,0.686059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,8.819047e-01,0.771417,0.973729,0.969862,0.928541,0.888148,0.892973,0.904102,0.960148,0.928540,...,0.933961,0.568846,0.741979,0.773551,0.567334,0.000000,0.912313,0.819971,0.956736,0.855750
939,6.859280e-01,0.773210,0.838110,0.803142,0.760045,0.647551,0.670075,0.753117,0.879505,0.657039,...,0.672847,0.892976,0.812464,0.818683,0.824842,0.912313,0.000000,0.854848,0.738624,0.758972
940,8.513831e-01,0.838515,0.898757,0.847959,0.860405,0.855554,0.940007,0.853855,0.856755,0.909695,...,0.953048,0.796699,0.711682,0.765789,0.686600,0.819971,0.854848,0.000000,0.898358,0.904880
941,8.204921e-01,0.827732,0.866584,0.829914,0.847503,0.682672,0.717997,0.824678,0.907503,0.787670,...,0.773560,0.926487,0.910412,0.870446,0.900615,0.956736,0.738624,0.898358,0.000000,0.817535


In [17]:
# to find the similar user taste based on the similarity table
similar_user_input = user_similarity_table[input_user].sort_values(ascending=True).head(10).index 
# here the input user is 10, for the input user 10 need to find any other similar user are there. 

In [18]:
# user_similarity_table.iloc[[10,270,93,1]] # to check col values

In [19]:
# this code will convert the output of similar user input to list
similar_user_input = list(similar_user_input)
similar_user_input

[10, 270, 93, 377, 58, 449, 513, 296, 532, 536]

In [20]:
# to retrive the similar user watched movie id
similar_user_movieid_list=[]
for similar_user in similar_user_input: # from similar user input
    similar = list(ratings[ratings['user_id'] ==similar_user]['movie_id']) #ratings contains all the data (ie)df, in df user id == movie_id
    similar_user_movieid_list.append(similar)

In [21]:
# to get the lenght of similar user movie_id list
len(similar_user_movieid_list)

10

In [22]:
# contains the similar users watched movie list
similar_user_movieid_list

[[16,
  486,
  175,
  611,
  7,
  100,
  461,
  488,
  285,
  504,
  289,
  340,
  505,
  489,
  657,
  463,
  655,
  321,
  48,
  203,
  218,
  124,
  283,
  4,
  704,
  22,
  155,
  294,
  656,
  64,
  702,
  615,
  200,
  223,
  651,
  711,
  705,
  696,
  367,
  509,
  135,
  498,
  164,
  191,
  496,
  385,
  519,
  156,
  703,
  99,
  194,
  588,
  695,
  447,
  418,
  160,
  176,
  435,
  706,
  275,
  371,
  234,
  168,
  497,
  475,
  269,
  493,
  495,
  195,
  521,
  13,
  183,
  132,
  420,
  712,
  186,
  199,
  710,
  558,
  483,
  56,
  603,
  127,
  133,
  98,
  1,
  652,
  144,
  698,
  474,
  82,
  510,
  216,
  211,
  531,
  357,
  162,
  161,
  697,
  50,
  157,
  302,
  694,
  606,
  221,
  432,
  116,
  334,
  692,
  480,
  527,
  274,
  333,
  33,
  245,
  404,
  174,
  129,
  178,
  11,
  182,
  134,
  137,
  462,
  205,
  663,
  467,
  511,
  32,
  513,
  529,
  664,
  530,
  319,
  273,
  693,
  197,
  69,
  60,
  708,
  170,
  602,
  582,
  701,
  185,
  499,

In [23]:
#6. Convert all the list as single
import itertools
similar_user_movieid_single_list=list(itertools.chain.from_iterable(similar_user_movieid_list)) # this converts the movieid into single list

In [24]:
len(similar_user_movieid_single_list)
# similar_user_movieid_single_list

1210

In [25]:
unique_movie_similar_user = set(similar_user_movieid_single_list) # to remove the similar movieid's 
len(unique_movie_similar_user)

646

In [26]:
# to get the input user watched movie list
input_user_watched_movieid = list(ratings[ratings['user_id']==input_user]['movie_id'])
len(input_user_watched_movieid)
# input_user_watched_movieid

184

In [27]:
# to create recommendation
recommend =[]
for per_id in unique_movie_similar_user: # unique movie similar user, this remove the movie id which is watched by user and similar user
    if(per_id in input_user_watched_movieid): 
        pass
    else:
        recommend.append(per_id) # if the unique movie similar user watched input user watched movie pass else append in recommend

In [29]:
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ...,
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

In [58]:
# to convert the user_prediction value to user_prediction table
user_pred = pd.DataFrame(user_prediction)
user_pred_Trans =  user_pred.T


In [63]:
# From recomd list select hightest rated film which would like by the user. Based on User prediction
highest_Rated=[] 
input_user_pre=pd.DataFrame(user_pred_Trans[input_user]) # by using the transpose, here input user pre is getting create
input_user_pred=input_user_pre.T # need to transpose the input_user_pre to get an transpose for respective input
for re in recommend:  
    value=input_user_pred[re].values # here if the recommend value is greater than or equal to 1 than append highest rated.
    if(value>=1):
        highest_Rated.append(re)

In [67]:
# in u.item we have the list of movie names with respective details 
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
    'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,encoding='latin-1')

In [76]:
#Creating Movie List based on recom movieid
movie_title=[] # to create a movie name list
for movieid in highest_Rated: # from highest rate movie id
    mov=items[items['movie id']==movieid]['movie title'].values # from item table get the highest rating movie
    movie_title.append(mov)

In [77]:
# movie_title

In [79]:
#Converting into pure list
movie_title_list=[] # to create separate list for movie without any extra info like dtype and array
for m in movie_title: 
    print(m)
    mv=list(m)
    movie_title_list.append(mv)


['Babe (1995)']
['Richard III (1995)']
['Postino, Il (1994)']
['Muppet Treasure Island (1996)']
['Rumble in the Bronx (1995)']
['I.Q. (1994)']
['Professional, The (1994)']
['Santa Clause, The (1994)']
['Sleepless in Seattle (1993)']
['Aladdin (1992)']
['Terminator 2: Judgment Day (1991)']
['Dances with Wolves (1990)']
['Rock, The (1996)']
['Striptease (1996)']
['Sound of Music, The (1965)']
['Swingers (1996)']
['Private Benjamin (1980)']
['Delicatessen (1991)']
['Empire Strikes Back, The (1980)']
['Princess Bride, The (1987)']
['Return of the Jedi (1983)']
['Henry V (1989)']
['Right Stuff, The (1983)']
['This Is Spinal Tap (1984)']
['Field of Dreams (1989)']
['Star Trek VI: The Undiscovered Country (1991)']
['Jerry Maguire (1996)']
['When the Cats Away (Chacun cherche son chat) (1996)']
['Men in Black (1997)']
['unknown']
['Chasing Amy (1997)']
['Starship Troopers (1997)']
['River Wild, The (1994)']
["Marvin's Room (1996)"]
['Donnie Brasco (1997)']
['In & Out (1997)']
['Midnight in the

In [82]:
#Converting into whole list
import itertools
Final_Recommend_movie=list(itertools.chain.from_iterable(movie_title_list))
print("The common Movie in Recom & User:",list(set(recommend)&set(input_user_watched_movieid)))
# return Final_Recommend_movie
    

The common Movie in Recom & User: []


In [95]:
def userbased(input_user,user_similarity,user_predictions,similar_user_count,thres):
    
    #Convert the user_sim table into DataFrame
    user_sim_table=pd.DataFrame(user_similarity)
    
    #Find similarity user for 78 using cosine table
    similar_input_user= user_sim_table[input_user].sort_values(ascending=True).head(similar_user_count).index
    
    #Convert in to list
    similar_user_input=list(similar_input_user) 
    
    #Using similar_user_input,can select movie id from ratings table
    similar_user_movieid_list=[]
    for sim_user in similar_user_input:
        sim=list(ratings[ratings['user_id']==sim_user]['movie_id'])
        similar_user_movieid_list.append(sim)
        
    #Converting as a whole list
    import itertools
    similar_user_movieid_single_list=list(itertools.chain.from_iterable(similar_user_movieid_list))
    
    #Unique movieid from the list
    Unique_movieid_similar_user=set(similar_user_movieid_single_list)
    
    #Input user watched movie_list
    input_user_watched_movieid=list(ratings[ratings['user_id']==input_user]['movie_id'].values)
    
    #Create a list which should have recom movieid to the input user
    recom=[]
    for per_id in Unique_movieid_similar_user:
        if(per_id in input_user_watched_movieid):
            pass
        else:
            recom.append(per_id)
            
    #From recommendation list selecting only hightest rated(predicted) value
    highest_Rated=[]
    user_pred=pd.DataFrame(user_prediction)
    user_pred_Trans=user_pred.T
    input_user_pre=pd.DataFrame(user_pred_Trans[input_user])
    input_user_pred=input_user_pre.T
    
    for re in recom:
        value=input_user_pred[re].values
        if(value>=thres):
            highest_Rated.append(re)
            
    i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
    'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,encoding='latin-1')
    
    #Creating Movie List based on recom movieid
    movie_title=[]
    for movieid in highest_Rated:
        mov=items[items['movie id']==movieid]['movie title'].values
        movie_title.append(mov)
        
    #Converting into pure list
    movie_title_list=[]
    for m in movie_title:
        print(m)
        mv=list(m)
        movie_title_list.append(mv)
        
    #Converting into whole list
    import itertools
    Final_Recommend_movie=list(itertools.chain.from_iterable(movie_title_list))
    print("The common Movie in Recom & User:",list(set(recom)&set(input_user_watched_movieid)))
    return Final_Recommend_movie
    

In [96]:
Recommended_movie=userbased(34,user_similarity,user_pred,5,2)

['I.Q. (1994)']
['Snow White and the Seven Dwarfs (1937)']
['Apocalypse Now (1979)']
The common Movie in Recom & User: []
