In [35]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from sklearn import model_selection

# Matrix Factorization

In [36]:
df = pd.read_csv('datasets/df_modcloth.csv')

In [37]:
df.head(3)

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0


In [189]:
class DataSet:
    def __init__(self, users, items, ratings):
        self.users = users
        self.items = items
        self.ratings = ratings
        
    
    def __getitem__(self, index):
        user = self.users[index]
        item = self.items[index]
        rating = self.ratings[index]
        
        return {
            "user": torch.tensor(user, dtype=torch.long),
            "item": torch.tensor(item, dtype=torch.long),
            "rating": torch.tensor(rating, dtype=torch.long),
        }

In [190]:
df_ = DataSet(df["user_id"].values, df["item_id"].values, df["rating"].values)

# Preprocessing

In [191]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

# Reference: https://github.com/yanneta/pytorch-tutorials/blob/master/collaborative-filtering-nn.ipynb

In [192]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["user_id", "item_id"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

# Reference https://github.com/yanneta/pytorch-tutorials/blob/master/collaborative-filtering-nn.ipynb

In [193]:
# number of values that are null in each column
df.isnull().sum(axis = 0)

item_id           0
user_id           0
rating            0
timestamp         0
size          21760
fit           18505
user_attr      8367
model_attr        0
category          0
brand         73980
year              0
split             0
dtype: int64

In [194]:
# specifially looking at item_id and user_id because those are the two columns that will be used in the matrix factorization
df = df.dropna(subset=['user_id'])

In [195]:
training_data, validation_data = model_selection.train_test_split(df, test_size=0.2, random_state=12, stratify=df.rating.values)

In [196]:
# Encoding needed for the embedding layers
train_data_ = encode_data(training_data)
val_data_ = encode_data(training_data, validation_data)

In [197]:
train_data = DataSet(train_data_["user_id"].values, train_data_["item_id"].values, train_data_["rating"].values)
val_data = DataSet(val_data_["user_id"].values, val_data_["item_id"].values, val_data_["rating"].values)

In [198]:
# After encode_data the values will be continous
#train_data[0:3]
val_data[:]


{'user': tensor([6868, 6506, 8039,  ...,   72,  728,  777]),
 'item': tensor([ 71, 208, 105,  ...,   4,  82,  56]),
 'rating': tensor([5, 2, 4,  ..., 5, 5, 5])}

In [199]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=150):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, 150)
        self.item_emb = nn.Embedding(num_items, 150)
        self.user_emb.weight.data.uniform_(0, 0.12)
        self.item_emb.weight.data.uniform_(0, 0.12)
    
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)
    
# Reference https://github.com/yanneta/pytorch-tutorials/blob/master/collaborative-filtering-nn.ipynb

In [200]:
num_unique_users = len(set(df_train.user_id.values))
num_unique_movies = len(set(df_train.item_id.values))

In [201]:
net = MF(num_unique_users, num_unique_movies)

In [202]:
loss_function = nn.MSELoss(reduction='none')

In [203]:
def test_loss(model):
    model.eval()
    
    users = torch.LongTensor(val_data.user_id.values)
    items = torch.LongTensor(val_data.item_id.values)
    ratings = torch.FloatTensor(val_data.rating.values)

    y_hat = model(users, items)
    
    
    count = 0
    for i in range(len(y_hat)):
        
        diff = np.abs(y_hat[i].detach().numpy() - ratings[i].detach().numpy())
        if round(diff) == 0 or round(diff) == 1:
            count += 1
    print("Accuracy: ", 100*count / len(y_hat), "%")

    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [208]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
        
        users = torch.LongTensor(train_data.user_id.values)
        items = torch.LongTensor(train_data.item_id.values)
        ratings = torch.FloatTensor(train_data.rating.values)
        
        y_hat = model(users, items)
        #print("y_hat:", y_hat, y_hat.shape)
        loss = F.mse_loss(y_hat, ratings)
        opt.zero_grad()
        loss.backward()
        opt.step()
        print(loss.item())
    test_loss(model)

In [209]:
train_epocs(net, epochs=15, lr=0.01)

  loss = torch.tensor(F.mse_loss(y_hat, ratings), dtype=float)


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

# Content Based

In [206]:
# Finding out what variables that potentially could be into a specific content
df.head(2)

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0


In [207]:
df['brand'].nunique()

31