<a href="https://colab.research.google.com/github/abdurahman02/AcademicContent/blob/master/Matrix_Factorization_RecSys_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import pandas as pd
import numpy as np
import os

In [None]:
# One-time setup on a fresh runtime: uncomment the two lines below to fetch
# the dataset repository and make it the working directory.
# !git clone "https://github.com/abdurahman02/ml-latest-small.git"
# os.chdir("ml-latest-small")

# Show the contents of the current working directory.
os.listdir(os.curdir)

['links.csv',
 'tags.csv',
 'results_02.txt',
 'results.md',
 'ratings.csv',
 'README.md',
 'movies.csv',
 '.git']

In [None]:
# Read the ratings table from the working directory and preview its first rows.
data = pd.read_csv(Path("ratings.csv"))
data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Random row-wise split on the raw ids, done before any re-encoding: rows whose
# uniform draw is below 0.8 go to train, the rest to validation. The fixed seed
# makes the assignment repeatable.
np.random.seed(3)
msk = np.random.rand(len(data)) < 0.8
train, val = data.loc[msk].copy(), data.loc[~msk].copy()

In [None]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Encode a pandas column with contiguous integer ids.

    The id vocabulary is built from ``train_col`` when it is given and from
    ``col`` itself otherwise. Values of ``col`` that are not part of the
    vocabulary are encoded as -1.

    Returns a tuple ``(name2idx, encoded, n_unique)``: the value -> id dict,
    a numpy array holding one id per element of ``col``, and the number of
    distinct values in the vocabulary.
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    name2idx = dict(zip(uniq, range(len(uniq))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(uniq)

def encode_data(df, train=None):
    """Return a copy of ``df`` whose ``userId`` and ``movieId`` columns hold
    contiguous integer ids.

    Without ``train`` the ids are derived from ``df`` itself. With ``train``
    the ids are derived from the matching column of ``train``, and rows of
    ``df`` whose user or movie does not occur in ``train`` are dropped.
    The input frame is left untouched.
    """
    encoded = df.copy()
    for id_col in ("userId", "movieId"):
        reference = train[id_col] if train is not None else None
        codes = proc_col(encoded[id_col], reference)[1]
        encoded[id_col] = codes
        # proc_col marks values missing from the reference with -1; drop them.
        encoded = encoded[encoded[id_col] >= 0]
    return encoded

In [None]:
# Derive the id encoding from the training split and apply the same encoding
# to validation; validation rows with users or movies absent from training
# are dropped by encode_data.
df_train = encode_data(train)
df_val = encode_data(val, train=train)

In [None]:
# Number of distinct encoded users / movies in the training split; these are
# the row counts of the embedding tables built below.
num_users = df_train["userId"].nunique()
num_items = df_train["movieId"].nunique()
print(num_users, num_items)

610 8998


In [None]:
class MF(nn.Module):
    """Plain matrix factorization: the score of a (user, item) pair is the
    dot product of the user's embedding and the item's embedding.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        """Create one embedding table per side, each ``emb_size`` wide, and
        fill both with values drawn uniformly between 0 and 0.05.
        """
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        for table in (self.user_emb, self.item_emb):
            nn.init.uniform_(table.weight, 0, 0.05)

    def forward(self, u, v):
        """Return one score per (user id, item id) pair given in ``u`` / ``v``."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        return torch.sum(user_vecs * item_vecs, dim=1)

In [None]:
# Build the plain MF model and move it to the GPU only when CUDA is available.
model = MF(num_users, num_items, emb_size=100)
if torch.cuda.is_available():
    model = model.cuda()

In [None]:
def test_loss(model, unsqueeze=False):
    model.eval()
    if torch.cuda.is_available():
      users = torch.LongTensor(df_val.userId.values).cuda()
      items = torch.LongTensor(df_val.movieId.values).cuda()
      ratings = torch.FloatTensor(df_val.rating.values).cuda()
    else:
      users = torch.LongTensor(df_val.userId.values)
      items = torch.LongTensor(df_val.movieId.values)
      ratings = torch.FloatTensor(df_val.rating.values)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [None]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train ``model`` on ``df_train`` with full-batch Adam, then report the
    validation loss through ``test_loss``.

    Args:
        model: module called as ``model(users, items)``.
        epochs: number of optimizer steps; every step uses all of ``df_train``.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: if True, the target ratings get an extra dimension of
            size 1 at position 1 before the loss is computed.

    A new Adam optimizer is created on every call, so optimizer state does
    not carry over between successive calls on the same model. The training
    loss is printed after each step.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # Use the device that holds the model's weights rather than guessing from
    # CUDA availability, so a CPU model also trains on a CUDA machine.
    device = next(model.parameters()).device

    # The training tensors are the same for every epoch: build them once
    # instead of re-creating (and re-copying) them inside the loop.
    users = torch.LongTensor(df_train.userId.values).to(device)
    items = torch.LongTensor(df_train.movieId.values).to(device)
    ratings = torch.FloatTensor(df_train.rating.values).to(device)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    model.train()
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)

In [None]:
# Demo of unsqueeze: a flat vector of N ratings becomes an N x 1 column.
ratings = torch.FloatTensor(df_train["rating"].values)
print(ratings.shape)
ratings = torch.unsqueeze(ratings, dim=1)
print(ratings.shape)

torch.Size([80450])
torch.Size([80450, 1])


In [None]:
# First pass on the plain MF model: 10 full-batch steps at learning rate 0.1.
train_epocs(model, lr=0.1, epochs=10)

12.913736343383789
4.8573527336120605
2.5867645740509033
3.1074562072753906
0.8492456674575806
1.8201146125793457
2.658536434173584
2.1395044326782227
1.093145489692688
0.9747868180274963
test loss 1.850 


In [None]:
# Keep training the same model for 15 more steps at a lower learning rate (0.01).
train_epocs(model, lr=0.01, epochs=15)

1.641574501991272
1.0041735172271729
0.7119297981262207
0.6609827280044556
0.7256274819374084
0.8036681413650513
0.8432365655899048
0.8351707458496094
0.793044924736023
0.7376187443733215
0.6877866983413696
0.6556909084320068
0.6446264386177063
0.6496798396110535
0.6610802412033081
test loss 0.821 


In [None]:
# Another 15 steps at learning rate 0.01 on the same model.
train_epocs(model, lr=0.01, epochs=15)

0.6690704822540283
0.6315687894821167
0.6389480233192444
0.614279568195343
0.6055468916893005
0.6139962673187256
0.6116964817047119
0.5969144105911255
0.585014820098877
0.5832411646842957
0.58433997631073
0.5794669389724731
0.568882167339325
0.5586512088775635
0.5523501038551331
test loss 0.759 


In [None]:
class MF_bias(nn.Module):
    """Matrix factorization with additive bias terms: the score of a
    (user, item) pair is the dot product of their embeddings plus a
    per-user bias and a per-item bias.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        """Create the embedding tables (``emb_size`` wide) and the bias
        tables (one column each).

        Embeddings are filled uniformly between 0 and 0.05, biases uniformly
        between -0.01 and 0.01.
        """
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # Fill order: both factor tables first, then both bias tables.
        for factors in (self.user_emb, self.item_emb):
            nn.init.uniform_(factors.weight, 0, 0.05)
        for bias in (self.user_bias, self.item_bias):
            nn.init.uniform_(bias.weight, -0.01, 0.01)

    def forward(self, u, v):
        """Return one score per (user id, item id) pair given in ``u`` / ``v``."""
        dot = torch.sum(self.user_emb(u) * self.item_emb(v), dim=1)
        # The bias lookups carry a trailing size-1 dimension; squeeze it away.
        b_u = torch.squeeze(self.user_bias(u))
        b_v = torch.squeeze(self.item_bias(v))
        return dot + b_u + b_v

In [None]:
# Build the bias model and, as is done for the plain MF model, move it to the
# GPU when CUDA is available so that the model and its input tensors share a
# device.
model = MF_bias(num_users, num_items, emb_size=100)
if torch.cuda.is_available():
    model = model.cuda()

In [None]:
# First pass on the bias model: 10 full-batch steps at learning rate 0.05
# with weight decay 1e-5.
train_epocs(model, lr=0.05, wd=1e-5, epochs=10)

12.913201332092285
9.158486366271973
4.393214225769043
1.1586315631866455
2.4618592262268066
3.745398998260498
2.4525110721588135
1.079404354095459
0.8143762350082397
1.3159016370773315
test loss 2.068 


In [None]:
# Keep training the same model: 10 more steps at learning rate 0.01,
# weight decay unchanged.
train_epocs(model, lr=0.01, wd=1e-5, epochs=10)

1.8920012712478638
1.3238822221755981
0.9342262148857117
0.7443917989730835
0.722421407699585
0.7777937054634094
0.8232869505882263
0.8219900727272034
0.7811893224716187
0.7271417379379272
test loss 0.798 


In [None]:
# Final pass: 10 more steps at learning rate 0.001, weight decay unchanged.
train_epocs(model, lr=0.001, wd=1e-5, epochs=10)

0.6848835349082947
0.6707419157028198
0.6589115262031555
0.6492202877998352
0.6414564251899719
0.6353794932365417
0.6307392716407776
0.6272894144058228
0.6247975826263428
0.6230531334877014
test loss 0.751 
