# Collaborative Filtering 

In [1]:
import pandas as pd
from surprise import Reader, Dataset, SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from surprise.accuracy import rmse

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_posts = pd.read_csv('raw/posts.csv')
df_users = pd.read_csv('raw/users.csv')
df_views = pd.read_csv('raw/views.csv')

df_posts.rename(columns={'_id': 'post_id', ' post_type': 'post_type'}, inplace=True)
df_users.rename(columns={'_id': 'user_id'}, inplace=True)
df_posts.category = df_posts.category.fillna('')

df_merged = pd.merge(df_views, df_users, on='user_id')
df_merged = pd.merge(df_merged, df_posts, on='post_id')

df_merged.drop(columns='timestamp', inplace=True)
df_merged.head()

Unnamed: 0,user_id,post_id,name,gender,academics,title,category,post_type
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,Niriksha Sharma,female,undergraduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
1,5d7c994d5720533e15c3b1e9,5ec821ddec493f4a2655889e,Varun Chowhan,male,undergraduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
2,5e5af599d701ab08af792b63,5ec821ddec493f4a2655889e,Ilupeju Ayokunnumi,female,graduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
3,5de50d768eab6401affbb135,5ec821ddec493f4a2655889e,thesocialcomment,male,graduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork
4,5deeef6142a8854bf6eabab9,5ec821ddec493f4a2655889e,siddharth saxena,male,undergraduate,Save Earth.,Visual Arts|Graphic Design|Artistic design|Gra...,artwork


In [3]:
print(df_posts.post_type.unique())
print(df_users.gender.unique())
print(df_users.academics.unique())

['blog' 'artwork' 'project' 'skill']
['male' 'female' 'undefined']
['undergraduate' 'graduate' 'undefined']


In [4]:
# assigning weights/ranks to different dependencies
w1 = {'skill': 4.1, 'project': 3, 'artwork': 2.1, 'blog': 0.9}
w2 = {'male': 3.1, 'female': 2.5, 'undefined': 1.5}
w3 = {'graduate': 4, 'undergraduate': 3, 'undefined': 1.5}

df_merged['strength'] = ((df_merged['post_type'].apply(lambda x: w1[x]))/4.1 + (df_merged['gender'].apply(lambda x: w2[x]))/3.1 + (df_merged['academics'].apply(lambda x: w3[x]))/4)/3
df_merged['strength'] = 5*(df_merged['strength'].values/max(df_merged['strength'].values))

df_merged = df_merged[['user_id', 'post_id', 'strength']]
df_merged.head()

Unnamed: 0,user_id,post_id,strength
0,5df49b32cc709107827fb3c7,5ec821ddec493f4a2655889e,3.447745
1,5d7c994d5720533e15c3b1e9,5ec821ddec493f4a2655889e,3.770325
2,5e5af599d701ab08af792b63,5ec821ddec493f4a2655889e,3.864411
3,5de50d768eab6401affbb135,5ec821ddec493f4a2655889e,4.186992
4,5deeef6142a8854bf6eabab9,5ec821ddec493f4a2655889e,3.770325


In [5]:
reader = Reader()
data = Dataset.load_from_df(df_merged, reader)
bestAlgo = []

for algo in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    result = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=1)
    temp = pd.DataFrame.from_dict(result).mean(axis=0)
    temp = temp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    bestAlgo.append(temp)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3161  0.3504  0.3267  0.2995  0.3300  0.3245  0.0168  
MAE (testset)     0.2400  0.2515  0.2462  0.2279  0.2541  0.2439  0.0094  
Fit time          0.06    0.05    0.05    0.05    0.05    0.05    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2608  0.2796  0.2923  0.2717  0.2947  0.2798  0.0127  
MAE (testset)     0.1983  0.2073  0.2076  0.2030  0.2120  0.2056  0.0046  
Fit time          0.74    0.75    0.79    0.81    0.79    0.78    0.03    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (

In [6]:
final = pd.DataFrame(bestAlgo).sort_values('test_rmse').set_index('Algorithm')
final

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNNBaseline,0.214588,0.144383,0.002236,0.002814
KNNWithMeans,0.234028,0.154506,0.001599,0.002472
KNNWithZScore,0.259283,0.171409,0.004164,0.002927
SVDpp,0.279822,0.205645,0.775496,0.010565
SlopeOne,0.286599,0.207422,0.005305,0.006514
SVD,0.324534,0.243925,0.053574,0.001397
KNNBasic,0.326612,0.241706,0.000649,0.002097
BaselineOnly,0.327636,0.243514,0.001526,0.000821
NMF,0.342142,0.256405,0.072567,0.001205
CoClustering,0.619922,0.530296,0.071726,0.00092


In [7]:
# although all these parameters have some default values. Refer documentation
# other names are pearson_baseline, msd
sim_options = {'name': 'cosine', 'user_based': True, 'shrinkage': 0}    # 'user_based': True means perform user based recommendation, false means do item based recommendation

# Using Alternating Least Squares (ALS)
bsl_optionsA = {'method': 'als', 'reg_u': 15, 'reg_i': 5, 'n_epochs': 20}  # reg_u, reg_i = regularization parameter for users and items
# Using Stochastic Gradient Descent (SGD)
bsl_optionsS = {'method': 'sgd', 'reg': 0.02, 'learning_rate': .00005, 'n_epochs': 20}

algoA = KNNWithMeans(sim_options=sim_options, bsl_options=bsl_optionsA)
algoS = KNNWithMeans(sim_options=sim_options, bsl_options=bsl_optionsS)

print('ALS-------------------------------------------------------------------------------------------------------------')
cross_validate(algoA, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('SGD-------------------------------------------------------------------------------------------------------------')
cross_validate(algoS, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

ALS-------------------------------------------------------------------------------------------------------------
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2096  0.2171  0.2448  0.2642  0.2480  0.2367  0.0203  
MAE (testset)     0.1458  0.1397  0.1646  0.1639  0.1598  0.1547  0.0101  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    
SGD------------------------------------------------------------------

{'test_rmse': array([0.24591666, 0.238358  , 0.25476606, 0.24212474, 0.23254704]),
 'test_mae': array([0.16329809, 0.1514827 , 0.15568497, 0.15393645, 0.1493136 ]),
 'fit_time': (0.002235889434814453,
  0.0019309520721435547,
  0.0016949176788330078,
  0.0020639896392822266,
  0.001986980438232422),
 'test_time': (0.002772808074951172,
  0.002376079559326172,
  0.0024781227111816406,
  0.0022950172424316406,
  0.0023660659790039062)}

In [8]:
train, test = train_test_split(data, test_size=0.2, random_state=200)
algo = KNNWithMeans(algo=KNNWithMeans(sim_options=sim_options, bsl_options=bsl_optionsA))
prediction = algo.fit(train).test(test)
accuracy.rmse(prediction)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.2443


0.24433621848935497

In [9]:
def getU(ruid):
    try:
        return len(train.ur[train.to_inner_uid(ruid)])
    except ValueError: # User id is not a part of trainset
        return 0

def getI(riid):
    try:
        return len(train.ir[train.to_inner_iid(riid)])
    except ValueError: # Item id is not a part of trainset
        return 0

df_new = pd.DataFrame(prediction, columns=['user_id', 'post_id', 'rui', 'est', 'details'])
df_new['no_item_rated_by_user'] = df_new.user_id.apply(getU)
df_new['no_user_rated_item'] = df_new.post_id.apply(getI)
df_new['errors'] = abs(df_new.est - df_new.rui)
df_new.head()

Unnamed: 0,user_id,post_id,rui,est,details,no_item_rated_by_user,no_user_rated_item,errors
0,5e5855ced701ab08af792b51,5e7bd922cfc8b713f5ac7da9,2.95994,3.711433,"{'was_impossible': True, 'reason': 'User and/o...",0,2,0.751493
1,5ecb979eeaff6b0c3a58a4f0,5eca8fceeaff6b0c3a58a3c0,3.813598,3.835909,"{'actual_k': 6, 'was_impossible': False}",39,7,0.022311
2,5d60098a653a331687083238,5ec278b574f7660d73aa10d5,3.770325,3.654884,"{'actual_k': 5, 'was_impossible': False}",178,5,0.115441
3,5e35a5ed8d344822fed4d13e,5ed0e31a76027d35905cc302,4.230265,4.28696,"{'actual_k': 7, 'was_impossible': False}",16,7,0.056695
4,5e1ef04c2a37d20505da2b8b,5eb1551e10426255a7aaa003,3.447745,3.484953,"{'actual_k': 3, 'was_impossible': False}",46,3,0.037208


In [10]:
bestPred = df_new.sort_values(by='errors')
worstPred = df_new.sort_values(by='errors', ascending=False)

In [11]:
bestPred.head()

Unnamed: 0,user_id,post_id,rui,est,details,no_item_rated_by_user,no_user_rated_item,errors
278,5d7c994d5720533e15c3b1e9,5eb2cbde10426255a7aaa074,3.770325,3.770325,"{'actual_k': 1, 'was_impossible': False}",75,1,0.0
183,5d610ae1653a331687083239,5eaf8b9310426255a7aa9f7e,5.0,5.0,"{'actual_k': 2, 'was_impossible': False}",105,3,0.0
232,5e99b0d4a3258347b42f2bf0,5e9a7e73a3258347b42f2c24,3.770325,3.770325,"{'actual_k': 1, 'was_impossible': False}",3,1,0.0
148,5e7f4ad1a3258347b42f2155,5e897f6ca3258347b42f25cd,3.28252,3.28252,"{'actual_k': 0, 'was_impossible': False}",5,1,0.0
91,5e1ef04c2a37d20505da2b8b,5e2d4d63c85ab714a7da66db,3.447745,3.447745,"{'actual_k': 1, 'was_impossible': False}",46,1,0.0


In [12]:
worstPred.head()

Unnamed: 0,user_id,post_id,rui,est,details,no_item_rated_by_user,no_user_rated_item,errors
289,5e4ce251f5561b1994c8e40d,5ea7cd9610426255a7aa9bd2,5.0,3.711433,"{'was_impossible': True, 'reason': 'User and/o...",0,4,1.288567
195,5e5af599d701ab08af792b63,5de8d73249e8203ff9219a74,4.677419,3.711433,"{'was_impossible': True, 'reason': 'User and/o...",50,0,0.965986
45,5d60098a653a331687083238,5ec2d29074f7660d73aa113b,4.583333,3.711433,"{'was_impossible': True, 'reason': 'User and/o...",178,0,0.8719
282,5ea5bf5110426255a7aa9b88,5ea5aacd10426255a7aa9b71,4.230265,3.376606,"{'actual_k': 0, 'was_impossible': False}",1,4,0.853659
96,5e840a75a3258347b42f2437,5e4ed85af5561b1994c8e470,4.552846,3.711433,"{'was_impossible': True, 'reason': 'User and/o...",0,1,0.841412


In [13]:
df_new = pd.merge(df_new, df_posts, on='post_id')
df_new = df_new[['user_id', 'post_id', 'title', 'category', 'post_type', 'rui', 'est', 'errors']]
df_new.head()

Unnamed: 0,user_id,post_id,title,category,post_type,rui,est,errors
0,5e5855ced701ab08af792b51,5e7bd922cfc8b713f5ac7da9,What sports will look like in the future,Computer Technology|Robotics|Data Science|Info...,blog,2.95994,3.711433,0.751493
1,5ecb979eeaff6b0c3a58a4f0,5eca8fceeaff6b0c3a58a3c0,Zero-Waste Lifestyle,,project,3.813598,3.835909,0.022311
2,5d60098a653a331687083238,5ec278b574f7660d73aa10d5,Rides,Drawings,artwork,3.770325,3.654884,0.115441
3,5e35a5ed8d344822fed4d13e,5ec278b574f7660d73aa10d5,Rides,Drawings,artwork,3.864411,3.881885,0.017473
4,5e35a5ed8d344822fed4d13e,5ed0e31a76027d35905cc302,Designing Cmos circuit from Boolean expression...,,project,4.230265,4.28696,0.056695


In [14]:
df_new[df_new['user_id']==df_new.user_id.value_counts().index[0]].sort_values(by='errors').head()

Unnamed: 0,user_id,post_id,title,category,post_type,rui,est,errors
117,5d60098a653a331687083238,5eb4fab110426255a7aaa0ed,God Drawing,Drawings,artwork,3.770325,3.781211,0.010886
136,5d60098a653a331687083238,5e52fd0ed701ab08af792a1f,Network Security Threats,Computer Technology|Computer Application,blog,3.28252,3.254238,0.028283
274,5d60098a653a331687083238,5e7df283a3258347b42f2128,screw2,Photography,artwork,3.770325,3.730093,0.040232
230,5d60098a653a331687083238,5e7df068a3258347b42f2125,screw town,Photography,artwork,3.770325,3.730093,0.040232
206,5d60098a653a331687083238,5ecf818376027d35905cbf03,GAN's INTRODUCTION,Computer Technology|Machine Learning,blog,3.28252,3.24129,0.04123


In [15]:
df_test = pd.DataFrame(test, columns=['user_id', 'post_id', 'merged'])
df_test

Unnamed: 0,user_id,post_id,merged
0,5e5855ced701ab08af792b51,5e7bd922cfc8b713f5ac7da9,2.959940
1,5ecb979eeaff6b0c3a58a4f0,5eca8fceeaff6b0c3a58a3c0,3.813598
2,5d60098a653a331687083238,5ec278b574f7660d73aa10d5,3.770325
3,5e35a5ed8d344822fed4d13e,5ed0e31a76027d35905cc302,4.230265
4,5e1ef04c2a37d20505da2b8b,5eb1551e10426255a7aaa003,3.447745
...,...,...,...
285,5ed237ee76027d35905cc6c5,5e5e3b35fbc8805f69e02c9e,3.699187
286,5df20f1fee4bb5252b4f5351,5e8c2d01a3258347b42f2627,4.136179
287,5d60098a653a331687083238,5e9489e7a3258347b42f2896,3.770325
288,5e5af599d701ab08af792b63,5e9415d2a3258347b42f27f8,3.376606


In [16]:
def recommend(user_id, n=10):
    res = pd.DataFrame(columns=['user_id', 'post_id', 'estimate'])
    for i in df_test.post_id.unique():
        temp = pd.DataFrame([[user_id, i, algo.predict(user_id, i)[3]]], columns=['user_id', 'post_id', 'estimate'])
        res = res.append(temp, ignore_index=True)
    res = pd.merge(res, df_posts, on='post_id')
    return res.sort_values(by='estimate', ascending=False).reset_index(drop=True)[:n]

### Final Recommendation

In [17]:
recommend('5e4ce251f5561b1994c8e40d')

Unnamed: 0,user_id,post_id,estimate,title,category,post_type
0,5e4ce251f5561b1994c8e40d,5e7bd922cfc8b713f5ac7da9,3.711433,What sports will look like in the future,Computer Technology|Robotics|Data Science|Info...,blog
1,5e4ce251f5561b1994c8e40d,5e948db6a3258347b42f28b2,3.711433,peace,Photography,artwork
2,5e4ce251f5561b1994c8e40d,5e90208ca3258347b42f2730,3.711433,Benefits of Buying Grocery Online in Twin city...,E Commerce|Shopping Platform|Other Online Plat...,blog
3,5e4ce251f5561b1994c8e40d,5ed3476576027d35905cca1d,3.711433,AWS CLI Setup in Mac,Technology,skill
4,5e4ce251f5561b1994c8e40d,5ecf818376027d35905cbf03,3.711433,GAN's INTRODUCTION,Computer Technology|Machine Learning,blog
5,5e4ce251f5561b1994c8e40d,5e830a6ca3258347b42f23f6,3.711433,Dog❤️,Photography,artwork
6,5e4ce251f5561b1994c8e40d,5e8cb88ea3258347b42f267e,3.711433,Palindrome (C# .Net),,project
7,5e4ce251f5561b1994c8e40d,5eb2c11210426255a7aaa052,3.711433,Gangster Style,Drawings,artwork
8,5e4ce251f5561b1994c8e40d,5dada695610ba040fbfdf585,3.711433,Trident,Drawings,artwork
9,5e4ce251f5561b1994c8e40d,5e8d539ba3258347b42f26d8,3.711433,BEING CREATIVE☺☺,Drawings,artwork
