In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the ratings export into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='magazines_rating.csv')
df.head(n=5)

Unnamed: 0,user_id,magazine_id,ratings
0,AH2IFH762VY5U,B00005N7P0,5
1,AOSFI0JEYU4XM,B00005N7P0,5
2,A3JPFWKS83R49V,B00005N7OJ,3
3,A19FKU6JZQ2ECJ,B00005N7OJ,5
4,A25MDGOMZ2GALN,B00005N7P0,5


## Popularity-based recommender system

In [3]:
# Number of ratings each magazine received, one row per magazine_id.
num_rating_df = (
    df.groupby('magazine_id')['ratings']
    .count()
    .reset_index()
    .rename(columns={'ratings': 'number_of_ratings'})
)
num_rating_df

Unnamed: 0,magazine_id,number_of_ratings
0,B00005N7NQ,116
1,B00005N7O3,5
2,B00005N7O4,6
3,B00005N7O6,64
4,B00005N7O9,23
...,...,...
2423,B01HI8V1AE,4
2424,B01HI8V1C2,10
2425,B01HI8V1I6,24
2426,B01HI8V1MC,14


In [14]:
# Mean rating of each magazine, one row per magazine_id.
avg_rating_df = (
    df.groupby('magazine_id')['ratings']
    .mean()
    .reset_index()
    .rename(columns={'ratings': 'avg_ratings'})
)
avg_rating_df


Unnamed: 0,magazine_id,avg_ratings
0,B00005N7NQ,3.793103
1,B00005N7O3,3.600000
2,B00005N7O4,4.333333
3,B00005N7O6,4.531250
4,B00005N7O9,4.434783
...,...,...
2423,B01HI8V1AE,4.750000
2424,B01HI8V1C2,4.000000
2425,B01HI8V1I6,4.500000
2426,B01HI8V1MC,4.071429


In [15]:
# Put the rating count and the mean rating of every magazine side by side.
popular_df = num_rating_df.merge(
    right=avg_rating_df,
    how='inner',
    on='magazine_id',
)
popular_df.head()

Unnamed: 0,magazine_id,number_of_ratings,avg_ratings
0,B00005N7NQ,116,3.793103
1,B00005N7O3,5,3.6
2,B00005N7O4,6,4.333333
3,B00005N7O6,64,4.53125
4,B00005N7O9,23,4.434783


In [16]:
# (rows, columns) of the combined count/average table.
popular_df.shape

(2428, 3)

In [17]:
# Keep magazines with at least 100 ratings, then take the 100 with the
# highest mean rating (best first).
popular_df = (
    popular_df
    .loc[lambda frame: frame['number_of_ratings'] >= 100]
    .sort_values(by='avg_ratings', ascending=False)
    .head(n=100)
)
popular_df.head()


Unnamed: 0,magazine_id,number_of_ratings,avg_ratings
1582,B0065MEDRI,343,4.781341
1314,B000NY15YI,109,4.761468
649,B00006LL1D,382,4.65445
1276,B000IOEJBO,144,4.645833
1344,B000UEI4JU,235,4.621277


In [18]:
# (rows, columns) after restricting to the top-rated, frequently rated magazines.
popular_df.shape

(100, 3)

In [19]:
# Attach the individual rating rows of `df` to each popular magazine; the
# result has one row per (popular magazine, rating) pair.
# NOTE: this overwrites popular_df, so re-running only this cell merges `df`
# in a second time.
popular_df = popular_df.merge(
    right=df,
    how='inner',
    on='magazine_id',
)
popular_df.shape

(33894, 5)

In [20]:
# Collapse back to one row per magazine. Only the first row of each magazine
# survives, so the user_id / ratings values kept are simply those of whichever
# rating row came first in the merge.
popular_df = popular_df.drop_duplicates(subset='magazine_id', keep='first')
popular_df.shape

(100, 5)

In [21]:
# Inspect the de-duplicated table (one row per popular magazine).
popular_df

Unnamed: 0,magazine_id,number_of_ratings,avg_ratings,user_id,ratings
0,B0065MEDRI,343,4.781341,A2TUZYARMUEPY2,5
343,B000NY15YI,109,4.761468,A2AX34ALJ2APN2,5
452,B00006LL1D,382,4.654450,A3NQI54DKBTTZ6,5
834,B000IOEJBO,144,4.645833,A24A2HVAN6784Z,5
978,B000UEI4JU,235,4.621277,A1YRODJA6SA0NS,5
...,...,...,...,...,...
30731,B000W3MB5M,441,4.031746,A3NWN21HH5CEWZ,5
31172,B000XXDJ70,109,4.027523,A5IBTO5P48ZNU,5
31281,B00005N7SM,362,4.022099,AC3OYGYW30MJB,5
31643,B000063XJL,772,3.996114,A1EMDSTJDUE6B0,5


In [22]:
# Drop the per-row `ratings` column and put the remaining columns in display order.
popular_df = popular_df.loc[
    :, ['user_id', 'magazine_id', 'number_of_ratings', 'avg_ratings']
]
popular_df.head()

Unnamed: 0,user_id,magazine_id,number_of_ratings,avg_ratings
0,A2TUZYARMUEPY2,B0065MEDRI,343,4.781341
343,A2AX34ALJ2APN2,B000NY15YI,109,4.761468
452,A3NQI54DKBTTZ6,B00006LL1D,382,4.65445
834,A24A2HVAN6784Z,B000IOEJBO,144,4.645833
978,A1YRODJA6SA0NS,B000UEI4JU,235,4.621277


# Collaborative filtering approach

In [23]:
# Filter on users: flag every user who has rated more than 10 magazines
# (strictly greater than 10), then collect the ids of the flagged users.
df_ind = df.groupby('user_id')['ratings'].count().gt(10)
user_that_rated_more_than_10 = df_ind.loc[df_ind].index

In [24]:
# Keep only the rating rows written by the users selected in
# user_that_rated_more_than_10.
df_user = df.loc[df['user_id'].isin(user_that_rated_more_than_10)]
df_user

Unnamed: 0,user_id,magazine_id,ratings
2,A3JPFWKS83R49V,B00005N7OJ,3
73,A1RPTVW5VEOSI,B00005N7PS,1
246,AG6TX1ZJHLMO7,B00005N7PS,4
286,A1RPTVW5VEOSI,B00005N7OU,4
296,A3JPFWKS83R49V,B00005N7PS,3
...,...,...,...
88181,A3H8DSJE2TY9AU,B01EGFDTZW,4
88183,AC98CDA0GZDKJ,B01EGFDTZW,5
88276,A1J5ZHE8SG700J,B01H6WOLWC,5
88283,A1J5ZHE8SG700J,B01H6WOLSG,5


In [25]:
# Filter on magazines: flag every magazine that has at least 4 ratings among
# the rows of df_user, then collect the ids of the flagged magazines.
b_ind = df_user.groupby('magazine_id')['ratings'].count().ge(4)
famous_mag = b_ind.loc[b_ind].index

In [26]:
# Keep only the rating rows whose magazine is listed in famous_mag.
df_final = df_user.loc[df_user['magazine_id'].isin(famous_mag)]
df_final

Unnamed: 0,user_id,magazine_id,ratings
734,A2JJGIJI8X6KRS,B00005N7OV,5
737,A281NPSIMI1C2R,B00005N7OV,5
749,A281NPSIMI1C2R,B00005N7RD,5
1147,A1CKHSVHDVTKFT,B00005N7OV,5
1411,A1JZFGZEZVWQPY,B00005N7RD,4
...,...,...,...
75231,A254QZWPSXZG52,B000060MJC,5
79869,A3KCW7GFEL6V46,B00079RO7G,1
80946,A2OTUWUSH49XIN,B000ILY9LW,2
81069,A2LI10MAXV0815,B000IOE9Y6,5


In [27]:
# Remove rows that are exact duplicates (same user_id, magazine_id and ratings).
# drop_duplicates() returns a new frame, so the result has to be assigned back;
# calling it without assignment only displays the de-duplicated frame and
# leaves df_final untouched for the cells that follow.
df_final = df_final.drop_duplicates()
df_final

Unnamed: 0,user_id,magazine_id,ratings
734,A2JJGIJI8X6KRS,B00005N7OV,5
737,A281NPSIMI1C2R,B00005N7OV,5
749,A281NPSIMI1C2R,B00005N7RD,5
1147,A1CKHSVHDVTKFT,B00005N7OV,5
1411,A1JZFGZEZVWQPY,B00005N7RD,4
...,...,...,...
75231,A254QZWPSXZG52,B000060MJC,5
79869,A3KCW7GFEL6V46,B00079RO7G,1
80946,A2OTUWUSH49XIN,B000ILY9LW,2
81069,A2LI10MAXV0815,B000IOE9Y6,5


In [28]:
# Magazine x user rating matrix: one row per magazine_id, one column per
# user_id. Repeated (magazine, user) pairs are averaged, which is the
# pivot_table default made explicit here.
pt = df_final.pivot_table(
    values='ratings',
    index='magazine_id',
    columns='user_id',
    aggfunc='mean',
)

In [29]:
# Inspect the rating matrix; NaN marks a (magazine, user) pair with no rating.
pt

user_id,A10VAZLEGTEZU0,A1CKHSVHDVTKFT,A1EMDSTJDUE6B0,A1EWCCX0LSRO7J,A1FEDK55GMLSV,A1GQAKL9CGQLP1,A1HV7PFWWGX2K3,A1J5ZHE8SG700J,A1J6UA1MD96SKE,A1JZFGZEZVWQPY,...,AG6TX1ZJHLMO7,AGT43FXXJM1AP,AHD101501WCN1,AKMEY1BSHSDG7,AMKQP0R290TLS,AOR590VF8YI5J,AVF9FV7AMRP5C,AW3VZ5O895LRK,AXV3OJG9BZ2JW,AYFVLMPVVDC4L
magazine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00005N7OV,,5.0,,,5.0,,5.0,,,,...,,,,,,,,,,
B00005N7PG,,,,,,,,,,,...,,,,,,,,,,
B00005N7PN,,,,,4.0,5.0,5.0,5.0,,,...,,,,5.0,5.0,,,,,
B00005N7Q1,,,,4.0,,,5.0,,,,...,,,,,5.0,,,,,
B00005N7QA,,,5.0,,,,,,,,...,,,,,,,3.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B001LF4EVO,5.0,,,,,,,,,,...,,,,5.0,,,,,,5.0
B0037STB02,,,,,,,,,5.0,,...,,5.0,,,,,5.0,,,
B00EVV77A0,,5.0,,,,,,,,,...,,,,,,,,,,5.0
B00EZB37X2,,,,,,,,,,,...,,,,,,,,,,


In [30]:
# Treat "no rating" as 0 so that every row is a complete numeric vector.
pt = pt.fillna(0)

In [51]:
# Inspect the rating matrix again now that missing ratings are filled with 0.
pt

user_id,A10VAZLEGTEZU0,A1CKHSVHDVTKFT,A1EMDSTJDUE6B0,A1EWCCX0LSRO7J,A1FEDK55GMLSV,A1GQAKL9CGQLP1,A1HV7PFWWGX2K3,A1J5ZHE8SG700J,A1J6UA1MD96SKE,A1JZFGZEZVWQPY,...,AG6TX1ZJHLMO7,AGT43FXXJM1AP,AHD101501WCN1,AKMEY1BSHSDG7,AMKQP0R290TLS,AOR590VF8YI5J,AVF9FV7AMRP5C,AW3VZ5O895LRK,AXV3OJG9BZ2JW,AYFVLMPVVDC4L
magazine_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00005N7OV,0.0,5.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00005N7PG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00005N7PN,0.0,0.0,0.0,0.0,4.0,5.0,5.0,5.0,0.0,0.0,...,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0
B00005N7Q1,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
B00005N7QA,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B001LF4EVO,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0
B0037STB02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
B00EVV77A0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
B00EZB37X2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# !pip install -U scikit-learn

In [38]:
# Pairwise cosine similarity between the rows of pt (i.e. between magazines);
# row/column i of the result corresponds to pt.index[i].
similarity_score = cosine_similarity(pt.to_numpy())

In [41]:
# Inspect the magazine-to-magazine similarity matrix.
similarity_score

array([[1.        , 0.15362886, 0.29013059, ..., 0.40481612, 0.16922436,
        0.28429693],
       [0.15362886, 1.        , 0.11926968, ..., 0.        , 0.        ,
        0.        ],
       [0.29013059, 0.11926968, 1.        , ..., 0.25391425, 0.        ,
        0.16750598],
       ...,
       [0.40481612, 0.        , 0.25391425, ..., 1.        , 0.24310832,
        0.42300848],
       [0.16922436, 0.        , 0.        , ..., 0.24310832, 1.        ,
        0.16666667],
       [0.28429693, 0.        , 0.16750598, ..., 0.42300848, 0.16666667,
        1.        ]])

In [56]:
# Positional index of a given magazine_id within pt.index (first match).
np.nonzero(pt.index == 'B01CF3ECNK')[0][0]


81

In [45]:
def recommend(magazine_name, top_n=5):
    """Print the magazines most similar to ``magazine_name``.

    Uses the notebook-level ``pt`` (pivot table indexed by magazine_id) and
    ``similarity_score`` (pairwise similarity matrix whose rows and columns
    follow the order of ``pt.index``).

    Parameters
    ----------
    magazine_name : str
        A magazine_id that appears in ``pt.index``.
    top_n : int, default 5
        Number of similar magazines to print.

    Raises
    ------
    IndexError
        If ``magazine_name`` is not present in ``pt.index``.
    """
    # index fetch
    matches = np.where(pt.index == magazine_name)[0]
    if len(matches) == 0:
        raise IndexError(
            "magazine_id {!r} is not in the pivot table".format(magazine_name)
        )
    index = matches[0]
    # sorting on the basis of score, most similar first
    ranked = sorted(
        enumerate(similarity_score[index]), key=lambda x: x[1], reverse=True
    )
    # Exclude the queried magazine by position instead of assuming it sorts
    # first: when another magazine ties with it (or its own row is all zeros)
    # slicing off element 0 would drop a real neighbour and print the query.
    similar_item = [pos for pos, _ in ranked if pos != index][:top_n]
    for pos in similar_item:
        print(pt.index[pos])

In [47]:
# Example: print the magazines most similar to B00005N7QA.
recommend('B00005N7QA')


B00005N7PG
B00005NIP7
B00005R8BA
B00005NIRE
B000085A6U


In [49]:
import pickle
# pickle.dump(popular_df, open('popular.pkl', 'wb'))

In [50]:
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open('similarity_score.pkl', 'wb') as similarity_file:
    pickle.dump(similarity_score, similarity_file)


In [52]:
# Persist the pivot table. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open('pivot_table.pkl', 'wb') as pivot_file:
    pickle.dump(pt, pivot_file)
