In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

In [2]:
columns = ["userId","gender"]
gender = pd.read_csv("data/libimseti/gender.dat",sep = ",",names = columns)

In [3]:
columns_names = ["userId","profileId","rating"]
rating = pd.read_csv("data/libimseti/ratings.dat",sep = ",",names = columns_names)

In [4]:
rating.head()

Unnamed: 0,userId,profileId,rating
0,1,133,8
1,1,720,6
2,1,971,10
3,1,1095,7
4,1,1616,10


In [5]:
data = pd.merge(rating,gender,on = "userId")

In [6]:
data.head()

Unnamed: 0,userId,profileId,rating,gender
0,1,133,8,F
1,1,720,6,F
2,1,971,10,F
3,1,1095,7,F
4,1,1616,10,F


In [7]:
rating_table_on_userID = data.groupby("userId",as_index=False)["rating"].mean()

In [8]:
data = pd.merge(data, rating_table_on_userID, on = "userId")

In [9]:
data.head()

Unnamed: 0,userId,profileId,rating_x,gender,rating_y
0,1,133,8,F,6.510145
1,1,720,6,F,6.510145
2,1,971,10,F,6.510145
3,1,1095,7,F,6.510145
4,1,1616,10,F,6.510145


In [10]:
data['deviation'] = data["rating_x"] - data["rating_y"]

Taking only those profile who have got more number of ratings

In [11]:
ratings_profileid = pd.DataFrame(data.groupby("profileId")["rating_x"].mean())
ratings_profileid["Number_of_ratings"] = data.groupby("profileId")["rating_x"].count()

for now the threshold is taken as 2500 just to reduce the number of columns that will be created in the pivot table, but the threshold can be taken as less if the computation power of the system is high

In [12]:
threshold = 500
filter_data_profile  = ratings_profileid.query('Number_of_ratings >=@threshold')
filter_data_profile.head()

Unnamed: 0_level_0,rating_x,Number_of_ratings
profileId,Unnamed: 1_level_1,Unnamed: 2_level_1
55,5.780652,889
77,9.200611,982
90,4.439437,1065
132,2.787524,513
133,6.22125,6974


In [13]:
data.shape

(17359346, 6)

In [14]:
ratings_userid = pd.DataFrame(data.groupby("userId")["rating_x"].mean())
ratings_userid["Number_of_ratings_user_based"] = data.groupby("userId")["rating_x"].count()

In [15]:
ratings_userid.describe()

Unnamed: 0,rating_x,Number_of_ratings_user_based
count,135359.0,135359.0
mean,6.011169,128.246707
std,1.255619,378.413414
min,1.3,20.0
25%,5.153846,29.0
50%,5.986706,73.0
75%,6.86135,124.0
max,9.973046,25042.0


In [16]:
threshold = 500
filter_data_user = ratings_userid.query('Number_of_ratings_user_based >=@threshold')
filter_data_user.describe()

Unnamed: 0,rating_x,Number_of_ratings_user_based
count,3910.0,3910.0
mean,5.654863,1384.235038
std,1.093351,1765.289956
min,2.59646,500.0
25%,4.910895,613.0
50%,5.680345,797.0
75%,6.410025,1374.25
max,9.785714,25042.0


In [17]:
filter_data_user = filter_data_user.drop(columns=['rating_x'])

In [18]:
filter_data_profile = filter_data_profile.drop(columns=['rating_x'])

In [19]:
data = pd.merge(data,filter_data_profile, on = "profileId")

In [20]:
data = pd.merge(data,filter_data_user, on = "userId")

In [21]:
data.shape

(1695600, 8)

In [22]:
data.shape

(1695600, 8)

In [23]:
profile_matrix = pd.pivot_table(data,values='deviation',index='userId',columns='profileId')
profile_matrix.head()

profileId,55,77,90,132,133,208,215,243,261,276,...,220715,220717,220718,220752,220754,220760,220782,220840,220861,220953
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,-0.856007,,,,,,,,,,...,,,,,,,,,,
73,,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
128,,,,,,,5.181488,,,,...,,,,,,,,,,
134,,,,,,,,,,,...,,,-3.427885,,,,,,,-5.427885


In [24]:
final_profile_matrix = profile_matrix.apply(lambda row: row.fillna(row.mean()), axis=1)

In [49]:
final_profile_matrix

profileId,55,77,90,132,133,208,215,243,261,276,...,220715,220717,220718,220752,220754,220760,220782,220840,220861,220953
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,-0.856007,0.383257,0.383257,0.383257,0.383257,0.383257,0.383257,0.383257,0.383257,0.383257,...,0.383257,0.383257,0.383257,0.383257,0.383257,0.383257,0.383257,0.383257,0.383257,0.383257
73,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251,...,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251,0.620251
99,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090,...,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090,0.723090
128,0.806488,0.806488,0.806488,0.806488,0.806488,0.806488,5.181488,0.806488,0.806488,0.806488,...,0.806488,0.806488,0.806488,0.806488,0.806488,0.806488,0.806488,0.806488,0.806488,0.806488
134,0.401145,0.401145,0.401145,0.401145,0.401145,0.401145,0.401145,0.401145,0.401145,0.401145,...,0.401145,0.401145,-3.427885,0.401145,0.401145,0.401145,0.401145,0.401145,0.401145,-5.427885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135240,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813,...,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813,0.663813
135273,0.634006,0.558575,0.558575,0.558575,0.558575,0.558575,0.558575,0.558575,0.558575,0.558575,...,0.558575,0.558575,0.558575,0.558575,0.558575,0.558575,0.558575,0.558575,0.558575,0.558575
135281,0.752160,0.291474,0.291474,0.291474,0.291474,0.291474,0.291474,0.291474,0.291474,0.291474,...,0.291474,0.291474,0.291474,0.291474,-3.247840,0.291474,0.291474,0.291474,0.291474,0.291474
135285,0.324593,0.324593,0.324593,0.324593,0.324593,0.324593,0.324593,0.324593,1.715596,0.324593,...,0.324593,0.324593,0.324593,0.324593,0.324593,0.324593,0.324593,0.324593,0.324593,0.324593


In [26]:
cosine = cosine_similarity(final_profile_matrix)


In [27]:
np.fill_diagonal(cosine, 0 )
similarity_with_profile = pd.DataFrame(cosine,index=final_profile_matrix.index)

In [28]:
similarity_with_profile.columns = final_profile_matrix.index

In [29]:
similarity_with_profile.head()

userId,9,73,99,128,134,147,155,245,251,299,...,135036,135066,135158,135216,135234,135240,135273,135281,135285,135298
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.0,0.219461,0.329288,0.325482,0.112167,0.249814,0.151569,0.344976,0.322972,0.220652,...,0.231191,0.235683,0.26863,0.268901,0.243671,0.239375,0.351643,0.293893,0.19732,0.284677
73,0.219461,0.0,0.541188,0.559878,0.276397,0.472133,0.300351,0.508174,0.37966,0.453018,...,0.431213,0.395241,0.489603,0.367853,0.448288,0.496369,0.377587,0.170179,0.314685,0.590231
99,0.329288,0.541188,0.0,0.785191,0.266659,0.628168,0.359158,0.739263,0.554422,0.539665,...,0.586076,0.518502,0.657078,0.516782,0.584182,0.592498,0.525282,0.232268,0.388382,0.711534
128,0.325482,0.559878,0.785191,0.0,0.278457,0.647201,0.37087,0.727758,0.580035,0.558577,...,0.606109,0.537182,0.680752,0.526027,0.600947,0.603,0.516266,0.232507,0.402916,0.739803
134,0.112167,0.276397,0.266659,0.278457,0.0,0.312911,0.415371,0.257473,0.196617,0.261836,...,0.264193,0.279788,0.280639,0.194904,0.250376,0.23057,0.224869,0.140025,0.1966,0.295105


In [30]:

def find_n_neighbours(df,n):
    order = np.argsort(df.values, axis=1)[:, :n]
    df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False)
           .iloc[:n].index, 
          index=['top{}'.format(i) for i in range(1, n+1)]), axis=1)
    return df

In [31]:
# top 30 neighbours for each user
sim_user_30_p = find_n_neighbours(similarity_with_profile,30)
sim_user_30_p.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,...,top21,top22,top23,top24,top25,top26,top27,top28,top29,top30
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,35605,115105,1017,106373,83817,89746,11858,31620,119759,31007,...,114810,7747,114670,115533,100299,122930,68332,43562,65384,72409
73,37972,33737,70299,119030,48997,5294,28220,64187,125520,30195,...,3398,107754,134696,44497,41155,128398,57331,132034,128775,99329
99,12087,18628,62538,93286,128982,74937,52056,76802,90763,26530,...,31847,28697,35899,81790,64961,119038,47981,28298,129403,58386
128,52498,93516,62538,90763,10838,74937,50294,133879,37830,87943,...,28697,105133,129403,58386,18628,76802,35151,48978,81790,64961
134,78392,46161,77473,15056,44717,81552,63060,37654,54499,76099,...,107092,103232,10036,46716,58801,69075,85983,71873,86691,17678


In [32]:
def get_user_similar_movies( user1, user2 ):
    common_profile = data[data.userId == user1].merge(
    data[data.userId == user2],
    on = "profileId",
    how = "inner" )
    return common_profile

In [33]:
a = get_user_similar_movies(74937,215)
a = a.loc[ : , ['rating_x_x','rating_x_y','profileId']]
a.head()

Unnamed: 0,rating_x_x,rating_x_y,profileId


In [34]:
item = 55
final_profile_matrix.loc[:,item]

userId
9        -0.856007
73        0.620251
99        0.723090
128       0.806488
134       0.401145
            ...   
135240    0.663813
135273    0.634006
135281    0.752160
135285    0.324593
135298    0.791745
Name: 55, Length: 3910, dtype: float64

In [35]:
rating_table_on_userID.loc[rating_table_on_userID['userId'] == 128]

Unnamed: 0,userId,rating
127,128,4.818512


In [36]:
def User_item_score(user,item):
    a = sim_user_30_p[sim_user_30_p.index==user].values
    b = a.squeeze().tolist()
    c = final_profile_matrix.loc[:,item] #avg rating for that particular item
    d = c[c.index.isin(b)]
    f = d[d.notnull()]
    avg_user = rating_table_on_userID.loc[rating_table_on_userID['userId'] == user,'rating'].values[0]
    #print(avg_user) #4.81
    index = f.index.values.squeeze().tolist() #list of all user that is close to 128
    corr = similarity_with_profile.loc[user,index] #what is the correlation of all users wrt to 128
    fin = pd.concat([f, corr], axis=1)
    fin.columns = ['adg_score','correlation']
    fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
    nume = fin['score'].sum()
    deno = fin['correlation'].sum()
    final_score = avg_user + (nume/deno)
    return final_score


In [37]:
score = User_item_score(74937,215)
print("score (u,i) is",score)


score (u,i) is 5.591795661418129


In [41]:
data = data.astype({"profileId": str})#Convert profile is as str to use in join below
profile_user = data.groupby(by = 'userId')['profileId'].apply(lambda x:','.join(x))

#Commma Seperated list of all profiles each user has rated

In [43]:
print(data)

         userId profileId  rating_x gender  rating_y  deviation  \
0           675       133        10      U  7.019042   2.980958   
1           675      1978        10      U  7.019042   2.980958   
2           675      8923         9      U  7.019042   1.980958   
3           675     10148        10      U  7.019042   2.980958   
4           675     11747         9      U  7.019042   1.980958   
...         ...       ...       ...    ...       ...        ...   
1695595   89911    110058        10      M  4.091429   5.908571   
1695596   89911    143280         1      M  4.091429  -3.091429   
1695597   89911     55749         6      M  4.091429   1.908571   
1695598   89911    123056         8      M  4.091429   3.908571   
1695599   89911      2867         6      M  4.091429   1.908571   

         Number_of_ratings  Number_of_ratings_user_based  
0                     6974                          1838  
1                     2132                          1838  
2                 

In [48]:
filter_data_profile

Unnamed: 0_level_0,Number_of_ratings
profileId,Unnamed: 1_level_1
55,889
77,982
90,1065
132,513
133,6974
...,...
220760,826
220782,555
220840,702
220861,533


In [52]:
def User_item_score1(user):
    Movie_seen_by_user = profile_matrix.columns[profile_matrix[profile_matrix.index==user].notna().any()].tolist()
    a = sim_user_30_p[sim_user_30_p.index==user].values
    b = a.squeeze().tolist()
    d = profile_user[profile_user.index.isin(b)]
    l = ','.join(d.values)
    Movie_seen_by_similar_users = l.split(',')
    Movies_under_consideration = list(set(Movie_seen_by_similar_users)-set(list(map(str, Movie_seen_by_user))))
    Movies_under_consideration = list(map(int, Movies_under_consideration))
    #print(Movies_under_consideration)
    score = []
    for item in Movies_under_consideration:
        c = final_profile_matrix.loc[:,item]
        d = c[c.index.isin(b)]
        f = d[d.notnull()]
        avg_user = rating_table_on_userID.loc[rating_table_on_userID['userId'] == user,'rating'].values[0]
        index = f.index.values.squeeze().tolist()
        corr = similarity_with_profile.loc[user,index]
        fin = pd.concat([f, corr], axis=1)
        fin.columns = ['adg_score','correlation']
        fin['score']=fin.apply(lambda x:x['adg_score'] * x['correlation'],axis=1)
        nume = fin['score'].sum()
        deno = fin['correlation'].sum()
        final_score = avg_user + (nume/deno)
        score.append(final_score)
    data_temp = pd.DataFrame({'profileId':Movies_under_consideration,'score':score})
    top_5_recommendation = data_temp.sort_values(by='score',ascending=False).head(5)
    Movie_Name = top_5_recommendation.merge(data_temp, how='inner', on='profileId')
    Movie_Names = Movie_Name.profileId.values.tolist()
    return Movie_Names
   

In [53]:
predicted_movies = User_item_score1(128)
print(" ")
print("The Recommendations for User Id : 128")
print("   ")
for i in predicted_movies:
    print(i)

 
The Recommendations for User Id : 370
   
71636
93681
130120
9855
32792
