In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from sklearn import model_selection

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cuda device


In [3]:
df = pd.read_csv('/content/drive/MyDrive/Recommendation System/Recommendation-System/datasets/df_modcloth.csv')

In [4]:
df.head()

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
3,7443,De,4,2010-02-13 08:00:00+00:00,,,,Small,Dresses,,2012,0
4,7443,tasha,4,2010-02-18 08:00:00+00:00,,,Small,Small,Dresses,,2012,0


# Preprocessing

In [5]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

# Reference: https://github.com/yanneta/pytorch-tutorials/blob/master/collaborative-filtering-nn.ipynb

In [6]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["user_id", "item_id"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

# Reference https://github.com/yanneta/pytorch-tutorials/blob/master/collaborative-filtering-nn.ipynb

In [7]:
# number of values that are null in each column
df.isnull().sum(axis = 0)

item_id           0
user_id           1
rating            0
timestamp         0
size          21760
fit           18506
user_attr      8367
model_attr        0
category          0
brand         73980
year              0
split             0
dtype: int64

In [8]:
df = df.dropna(subset=['user_id'])

In [9]:
df.isnull().sum(axis = 0)

item_id           0
user_id           0
rating            0
timestamp         0
size          21760
fit           18505
user_attr      8367
model_attr        0
category          0
brand         73980
year              0
split             0
dtype: int64

In [10]:
training_data, validation_data = model_selection.train_test_split(df, test_size=0.2, random_state=12, stratify=df.rating.values)

In [11]:
# Encoding needed for the embedding layers
train_data = encode_data(training_data)
val_data = encode_data(training_data, validation_data)

In [12]:
# After encode_data the values will be continous
train_data.head()

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
72588,0,0,3,2017-02-10 08:00:00+00:00,,Slightly small,,Small,Bottoms,,2018,0
10355,1,1,5,2013-11-17 08:00:00+00:00,5.0,Just right,Large,Small&Large,Dresses,,2012,2
90708,2,2,3,2018-06-16 19:13:07.131000+00:00,1.0,,Small,Small,Bottoms,ModCloth,2018,0
78809,3,3,4,2017-10-31 07:00:00+00:00,3.0,,Small,Small&Large,Bottoms,,2018,0
59122,4,4,2,2016-06-29 07:00:00+00:00,2.0,Slightly small,Small,Small,Tops,,2019,0


#Model

In [134]:
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.lin1 = nn.Linear(emb_size*2, emb_size)
        #self.lin2 = nn.Linear(emb_size*4, emb_size)
        self.lin3 = nn.Linear(emb_size, n_hidden)
        self.lin4 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        #x = F.relu(self.lin2(x))
        x = F.relu(self.lin3(x))
        x = self.lin4(x)
        return x

#Train and Test Model

In [130]:
def test_loss(model, unsqueeze=False):
    model.eval()
    users = torch.LongTensor(val_data.user_id.values)
    items = torch.LongTensor(val_data.item_id.values)
    ratings = torch.FloatTensor(val_data.rating.values)

    y_hat = model(users, items)

    count = 0
    for i in range(len(y_hat)):
        
        diff = np.abs(y_hat[i].detach().numpy() - ratings[i].detach().numpy())
        if np.rint(diff) == 0 or np.rint(diff) == 1:
            count += 1
    print("Accuracy: ", 100*count / len(y_hat), "%")


    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [81]:
def train_epochs(model, epochs=10, lr=0.01):
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()
    for epoch in range(epochs):
        
        users = torch.LongTensor(train_data.user_id.values)
        items = torch.LongTensor(train_data.item_id.values)
        ratings = torch.FloatTensor(train_data.rating.values)
        
        y_hat = model(users, items).reshape(-1)

        loss = F.mse_loss(y_hat, ratings)
        
        mae = np.mean([ abs(ratings[i] - y_hat[i]) for i in range(len(y_hat)) ], dtype=np.float)

        opt.zero_grad()
        loss.backward()
        opt.step()
        print('Train Epoch: {}/{} mae: {} Loss: {} '.format(
                epoch, epochs, mae ,loss.item()))
    test_loss(model)
    
# Reference https://github.com/yanneta/pytorch-tutorials/blob/master/collaborative-filtering-nn.ipynb

Training and Testing

In [16]:
num_unique_users = len(set(training_data.user_id.values))
num_unique_movies = len(set(training_data.item_id.values))

In [126]:
model = CollabFNet(num_unique_users, num_unique_movies, emb_size=200) #.cuda()

In [127]:
model

CollabFNet(
  (user_emb): Embedding(38417, 200)
  (item_emb): Embedding(1014, 200)
  (lin1): Linear(in_features=400, out_features=10, bias=True)
  (lin4): Linear(in_features=10, out_features=1, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
)

In [128]:
train_epochs(model, epochs=20, lr=0.05)#default

Train Epoch: 0/20 mae: 4.241038069663965 Loss: 19.17345428466797 
Train Epoch: 1/20 mae: 1.3984957268006162 Loss: 3.2551674842834473 
Train Epoch: 2/20 mae: 1.1409006470735168 Loss: 2.476318597793579 
Train Epoch: 3/20 mae: 1.0345723354428435 Loss: 1.412343144416809 
Train Epoch: 4/20 mae: 1.452508607126769 Loss: 2.5383832454681396 
Train Epoch: 5/20 mae: 1.3442027296395875 Loss: 2.182101011276245 
Train Epoch: 6/20 mae: 0.9196942222431195 Loss: 1.1497161388397217 
Train Epoch: 7/20 mae: 0.7250494771815646 Loss: 1.1624573469161987 
Train Epoch: 8/20 mae: 1.0030430004608735 Loss: 1.7748606204986572 
Train Epoch: 9/20 mae: 0.8021877599022373 Loss: 1.2843645811080933 
Train Epoch: 10/20 mae: 0.6516144483315266 Loss: 0.7944533228874207 
Train Epoch: 11/20 mae: 0.8836134273617172 Loss: 1.0320453643798828 
Train Epoch: 12/20 mae: 1.0102854654195348 Loss: 1.2746163606643677 
Train Epoch: 13/20 mae: 0.9038602591393303 Loss: 1.063537836074829 
Train Epoch: 14/20 mae: 0.6547380310261136 Loss: 0.



test loss 1.902 


In [None]:
model2 = CollabFNet(num_unique_users, num_unique_movies, emb_size=200) #.cuda()

In [138]:
model2

CollabFNet(
  (user_emb): Embedding(38417, 200)
  (item_emb): Embedding(1014, 200)
  (lin1): Linear(in_features=400, out_features=200, bias=True)
  (lin3): Linear(in_features=200, out_features=10, bias=True)
  (lin4): Linear(in_features=10, out_features=1, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
)

In [144]:
train_epochs(model2, epochs=20, lr=0.05)

Train Epoch: 0/20 mae: 3.8526107173313697 Loss: 16.010324478149414 
Train Epoch: 1/20 mae: 3.7780264384002504 Loss: 15.441200256347656 
Train Epoch: 2/20 mae: 3.6985099172928697 Loss: 14.84669303894043 
Train Epoch: 3/20 mae: 3.6140391683915025 Loss: 14.228996276855469 
Train Epoch: 4/20 mae: 3.524556375035576 Loss: 13.590213775634766 
Train Epoch: 5/20 mae: 3.4299833652870633 Loss: 12.932503700256348 
Train Epoch: 6/20 mae: 3.33023295256426 Loss: 12.258170127868652 
Train Epoch: 7/20 mae: 3.2252155530991695 Loss: 11.569731712341309 
Train Epoch: 8/20 mae: 3.1215275412510546 Loss: 10.869969367980957 
Train Epoch: 9/20 mae: 3.0138906496728994 Loss: 10.161953926086426 
Train Epoch: 10/20 mae: 2.9011455384441422 Loss: 9.449100494384766 
Train Epoch: 11/20 mae: 2.7832503898028635 Loss: 8.735159873962402 
Train Epoch: 12/20 mae: 2.6601871721763604 Loss: 8.024246215820312 
Train Epoch: 13/20 mae: 2.5319679501718615 Loss: 7.3208417892456055 
Train Epoch: 14/20 mae: 2.3986420760772242 Loss: 6.



test loss 3.058 


In [146]:
train_epocs(model2, epochs=10, lr=0.03)

Train Epoch: 0/10 mae: 1.353190140766965 Loss: 2.1951639652252197 
Train Epoch: 1/10 mae: 1.2875232302833608 Loss: 1.9801666736602783 
Train Epoch: 2/10 mae: 1.221067307808777 Loss: 1.7882517576217651 
Train Epoch: 3/10 mae: 1.1540890030580873 Loss: 1.6209524869918823 
Train Epoch: 4/10 mae: 1.0869493538617339 Loss: 1.4795712232589722 
Train Epoch: 5/10 mae: 1.0201308595991427 Loss: 1.36503005027771 
Train Epoch: 6/10 mae: 0.9542648135335096 Loss: 1.2776676416397095 
Train Epoch: 7/10 mae: 0.8901582167945268 Loss: 1.2169944047927856 
Train Epoch: 8/10 mae: 0.8743613370927106 Loss: 1.1814285516738892 
Train Epoch: 9/10 mae: 0.8652776366662217 Loss: 1.1680748462677002 
Accuracy:  90.6538162426397 %




test loss 1.176 
