<a href="https://colab.research.google.com/github/abdurahman02/AcademicContent/blob/master/FL(implementation)_Matrix_Factorization_RecSys_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import pandas as pd
import numpy as np
import os

In [2]:
!git clone "https://github.com/abdurahman02/ml-latest-small.git"
os.chdir("ml-latest-small")
os.listdir()

Cloning into 'ml-latest-small'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 32 (delta 13), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (32/32), done.


['links.csv',
 'tags.csv',
 'results_02.txt',
 'results.md',
 'ratings.csv',
 'README.md',
 'movies.csv',
 '.git']

In [3]:
data = pd.read_csv("ratings.csv")
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# split train and validation before encoding
np.random.seed(3)
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
val = data[~msk].copy()

In [5]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [6]:
# encoding the train and validation data
df_train = encode_data(train)
df_val = encode_data(val, train)

In [19]:
# for user_id in df_train.userId.unique():
sel_rows_of_user_i = df_train[(df_train.userId == 1)]
print(sel_rows_of_user_i)
print(sel_rows_of_user_i.movieId.values)
print(sel_rows_of_user_i.rating.values)

     userId  movieId  rating   timestamp
232       1      193     3.0  1445714835
234       1      194     4.5  1445715228
235       1      180     4.0  1445714885
237       1      195     3.5  1445714960
238       1      196     4.0  1445715013
239       1      197     4.0  1445715064
240       1      198     4.5  1445715141
241       1      199     5.0  1445714980
242       1      200     4.5  1445715154
243       1      201     3.0  1445714974
244       1      202     4.0  1445714926
245       1      203     3.0  1445714941
246       1      204     4.0  1445714841
247       1      205     4.5  1445715340
248       1      206     5.0  1445715172
251       1      207     3.5  1445714891
252       1      208     2.5  1445714938
253       1      209     3.5  1445714874
254       1      210     5.0  1445714966
256       1      211     4.0  1445714882
258       1      212     3.5  1445714854
260       1      213     5.0  1445714851
[193 194 180 195 196 197 198 199 200 201 202 203 204 205 

In [7]:
num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print(num_users, num_items)

610 8998


In [8]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)

In [9]:
if torch.cuda.is_available():
  model = MF(num_users, num_items, emb_size=100).cuda()
else:
  model = MF(num_users, num_items, emb_size=100)

In [10]:
def test_loss(model, unsqueeze=False):
    model.eval()
    if torch.cuda.is_available():
      users = torch.LongTensor(df_val.userId.values).cuda()
      items = torch.LongTensor(df_val.movieId.values).cuda()
      ratings = torch.FloatTensor(df_val.rating.values).cuda()
    else:
      users = torch.LongTensor(df_val.userId.values)
      items = torch.LongTensor(df_val.movieId.values)
      ratings = torch.FloatTensor(df_val.rating.values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [29]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
      for user_id in df_train.userId.unique():
        sel_rows_of_user_i = df_train[(df_train.userId == 1)]

        if torch.cuda.is_available():
            users = torch.LongTensor(sel_rows_of_user_i.userId.values).cuda()
            items = torch.LongTensor(sel_rows_of_user_i.movieId.values).cuda()
            ratings = torch.FloatTensor(sel_rows_of_user_i.rating.values).cuda()
        else:
            users = torch.LongTensor(sel_rows_of_user_i.userId.values) # .cuda()
            items = torch.LongTensor(sel_rows_of_user_i.movieId.values) #.cuda()
            ratings = torch.FloatTensor(sel_rows_of_user_i.rating.values) #.cuda()
        if unsqueeze:
            ratings = ratings.unsqueeze(1)

        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if user_id%100 == 0:
          print(user_id,loss.item()) 
    test_loss(model, unsqueeze)

In [30]:
# Here is what unsqueeze does
ratings = torch.FloatTensor(df_train.rating.values)
print(ratings.shape)
ratings = ratings.unsqueeze(1) # .cuda()
print(ratings.shape)

torch.Size([80450])
torch.Size([80450, 1])


In [31]:
train_epocs(model, epochs=10, lr=0.1)

0 3.979039320256561e-13
100 0.0010035716695711017
200 0.058999087661504745
300 2.961804170809046e-07
400 3.361939393542457e-09
500 1.041718064698216e-06
600 1.4228940942162538e-11
0 7.5446723521444e-12
100 3.2943344233093974e-12
200 1.3404711368680822e-11
300 0.00035325504723004997
400 4.9855244377283725e-09
500 4.994469681579572e-12
600 0.0072554354555904865
0 0.002578369341790676
100 3.2459460896916426e-08
200 1.8835842174186324e-12
300 3.4726160749037893e-12
400 5.211507874552934e-12
500 1.1332510450878797e-11
600 2.618207872728817e-10
0 6.193299650902873e-09
100 1.7896709891829232e-07
200 4.265323511054575e-11
300 1.6624117013530615e-11
400 0.00010674203076632693
500 1.5148771126405336e-09
600 6.699771987195291e-11
0 1.9962632935577318e-10
100 1.1914439710380975e-05
200 8.525970485173673e-10
300 1.98435213573378e-12
400 2.046905700492374e-10
500 5.755516667704796e-06
600 2.772020946117948e-10
0 4.247236937149346e-11
100 6.541902436429936e-11
200 2.6798556064022705e-05
300 1.4698287

In [None]:
train_epocs(model, epochs=15, lr=0.01)

0 2.5837918434033595e-15
100 5.693642037840618e-07
200 8.686707603233756e-12
300 3.3330914525793454e-13
400 4.90920455328836e-13
500 9.301650161913644e-13
600 0.0007885022205300629
0 0.00019902644271496683
100 1.6905569166780765e-09
200 8.30066801427165e-06
300 8.740319401567831e-08
400 1.294738014717689e-11
500 3.527635635691695e-05
600 4.694956090567359e-10
0 6.387701478871577e-10
100 5.2612758736358956e-06
200 6.404350187949603e-06
300 3.2273161920670645e-09
400 0.0013321711448952556
500 1.3780270364804892e-06
600 3.962474792729154e-09
0 3.7530920593553674e-08
100 1.5008729814525168e-09
200 2.802619007979956e-07
300 5.605516761875151e-08
400 1.1929625065576488e-09
500 1.006148863780254e-06
600 5.660980306743113e-08
0 3.0181254828676174e-08
100 0.00023827505356166512
200 0.00010964965622406453
300 7.186801376235508e-09
400 3.381131318747066e-05
500 8.878656080923975e-05
600 7.247378253083525e-09
0 4.315260593301673e-09
100 2.467786828219687e-07
200 0.0005630832747556269
300 1.5832284

In [15]:
train_epocs(model, epochs=15, lr=0.01)

0.668438196182251
0.6314186453819275
0.6382811069488525
0.6136514544487
0.6053114533424377
0.613705575466156
0.6110225915908813
0.5961105227470398
0.58441162109375
0.5828033685684204
0.5837610363960266
0.5786066651344299
0.5678828358650208
0.5577182173728943
0.5515027046203613
test loss 0.759 


In [None]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return (U*V).sum(1) +  b_u  + b_v

In [None]:
model = MF_bias(num_users, num_items, emb_size=100) #.cuda()

In [None]:
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

12.913201332092285
9.158486366271973
4.393214225769043
1.1586315631866455
2.4618592262268066
3.745398998260498
2.4525110721588135
1.079404354095459
0.8143762350082397
1.3159016370773315
test loss 2.068 


In [None]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

1.8920012712478638
1.3238822221755981
0.9342262148857117
0.7443917989730835
0.722421407699585
0.7777937054634094
0.8232869505882263
0.8219900727272034
0.7811893224716187
0.7271417379379272
test loss 0.798 


In [None]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

0.6848835349082947
0.6707419157028198
0.6589115262031555
0.6492202877998352
0.6414564251899719
0.6353794932365417
0.6307392716407776
0.6272894144058228
0.6247975826263428
0.6230531334877014
test loss 0.751 
