In [35]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from sklearn import model_selection

# Matrix Factorization

In [36]:
df = pd.read_csv('datasets/df_modcloth.csv')

In [37]:
df.head(3)

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0


# Preprocessing

In [38]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

# Reference: https://github.com/yanneta/pytorch-tutorials/blob/master/collaborative-filtering-nn.ipynb

In [39]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["user_id", "item_id"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

# Reference https://github.com/yanneta/pytorch-tutorials/blob/master/collaborative-filtering-nn.ipynb

In [40]:
# number of values that are null in each column
df.isnull().sum(axis = 0)

item_id           0
user_id           1
rating            0
timestamp         0
size          21760
fit           18506
user_attr      8367
model_attr        0
category          0
brand         73980
year              0
split             0
dtype: int64

In [41]:
# specifially looking at item_id and user_id because those are the two columns that will be used in the matrix factorization
df = df.dropna(subset=['user_id'])

In [71]:
training_data, validation_data = model_selection.train_test_split(df, test_size=0.2, random_state=12, stratify=df.rating.values)

In [72]:
train_data = encode_data(training_data)
val_data = encode_data(training_data, validation_data)

In [43]:
# After encode_data the values will be continous
training_data[0:3]

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
72588,0,0,3,2017-02-10 08:00:00+00:00,,Slightly small,,Small,Bottoms,,2018,0
10355,1,1,5,2013-11-17 08:00:00+00:00,5.0,Just right,Large,Small&Large,Dresses,,2012,2
90708,2,2,3,2018-06-16 19:13:07.131000+00:00,1.0,,Small,Small,Bottoms,ModCloth,2018,0


In [44]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=150):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, 150)
        self.item_emb = nn.Embedding(num_items, 150)
        self.user_emb.weight.data.uniform_(0, 0.1)
        self.item_emb.weight.data.uniform_(0, 0.1)
    
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)

In [45]:
num_unique_users = len(set(df_train.user_id.values))
num_unique_movies = len(set(df_train.item_id.values))

In [46]:
net = MF(num_unique_users, num_unique_movies)

In [47]:
loss_function = nn.MSELoss(reduction='none')

In [75]:
def test_loss(model):
    model.eval()
    users = torch.LongTensor(val_data.user_id.values)
    items = torch.LongTensor(val_data.item_id.values)
    ratings = torch.FloatTensor(val_data.rating.values)

    y_hat = model(users, items)
    
    print(y_hat)
    
    count = 0
    for i in range(len(y_hat)):
        
        diff = np.abs(y_hat[i].detach().numpy() - ratings[i].detach().numpy())
        if round(diff) == 0 or round(diff) == 1:
            count += 1
    print("Accuracy: ", 100*count / len(y_hat), "%")

    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [76]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
        users = torch.LongTensor(train_data.user_id.values)
        items = torch.LongTensor(train_data.item_id.values)
        ratings = torch.FloatTensor(train_data.rating.values)
        
        y_hat = model(users, items)
        #print("y_hat:", y_hat, y_hat.shape)
        loss = F.mse_loss(y_hat, ratings)
        opt.zero_grad()
        loss.backward()
        opt.step()
        print(loss.item())
    test_loss(model)

In [77]:
train_epocs(net, epochs=15, lr=0.01)

0.32015058398246765
0.3441174328327179
0.2792914807796478
0.26902344822883606
0.25819703936576843
0.23449549078941345
0.21149159967899323
0.1982901245355606
0.19043636322021484
0.1786573827266693
0.16411304473876953
0.15251196920871735
0.14416883885860443
0.13625268638134003
0.12750934064388275
tensor([0.7724, 6.0033, 3.8978,  ..., 3.2061, 5.3983, 4.8739],
       grad_fn=<SumBackward1>)
Accuracy:  0.7177813576269303
test loss 2.245 
