In [1]:
import pandas as pd
import numpy as np
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [2]:
item = pd.read_csv('item_feature.csv')
train = pd.read_csv('training.csv')
df = train.merge(item, on = 'item_id', how = 'left')
df['label'] = 1

In [3]:
item_ids = train['item_id'].unique()

users, items, labels, cntxt_ftres = [], [], [], []

user_item_set = set(zip(train['user_id'], train['item_id'], train['context_feature_id']))

track = set(zip(train['user_id'], train['item_id']))


for (u, i, c) in user_item_set:
    users.append(u)
    items.append(i)
    cntxt_ftres.append(c)
    labels.append(1)
    negative_item = np.random.choice(item_ids) 
    while (u, negative_item) in track:
        negative_item = np.random.choice(item_ids)
    users.append(u)
    items.append(negative_item)
    cntxt_ftres.append(c)
    labels.append(0)

In [4]:
df = pd.concat([pd.Series(users),pd.Series(items),
               pd.Series(cntxt_ftres), pd.Series(labels)], axis =1).\
rename(columns={0:'user_id', 1:'item_id', 2:'context_feature_id', 3:'label'})

In [5]:
df = df.merge(item, on = 'item_id', how = 'left')

In [6]:
def data_sample(data):

    train=data.sample(frac=0.8)
    val=data.drop(train.index)
    return train, val

In [7]:
train , val = data_sample(df)

### Old Model: More parameters fine-tuning

In [9]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=20):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # init 
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        self.classifier = nn.Sigmoid()
        self.nonlin = nn.LeakyReLU()
        self.drop = nn.Dropout(p = 0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        U = self.nonlin(U)
        V = self.item_emb(v)
        V = self.drop(V)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        self.U, self.V = U, V
        return self.classifier((U*V).sum(1) +  b_u  + b_v)
    
    def __getitem__(self,idx):
        return self.U[idx], self.V[idx]

In [16]:
def train_one_epoch(model, train_df, optimizer):
    """ Trains the model for one epoch"""
    model.train()
    ### BEGIN SOLUTION
    y = torch.FloatTensor(train_df.label.values)
    u = torch.LongTensor(train_df.user_id.values)
    v = torch.LongTensor(train_df.item_id.values)
    y_hat = model(u,v)
    output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
    train_acc = accuracy_score(output,y)
    train_loss = F.binary_cross_entropy(y_hat, y)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    ### END SOLUTION
    return train_loss.item(), train_acc

def valid_metrics(model, valid_df):
    """Computes validation loss and accuracy"""
    model.eval()
    ### BEGIN SOLUTION
    u = torch.LongTensor(valid_df.user_id.values)
    v = torch.LongTensor(valid_df.item_id.values)
    y = torch.FloatTensor(valid_df.label.values)
    y_hat = model(u,v)
    valid_loss = F.binary_cross_entropy(y_hat, y)
    output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
    auc = roc_auc_score( y.detach().numpy(), y_hat.detach().numpy())
    valid_acc = accuracy_score(output,y)
    ### END SOLUTION
    return valid_loss.item(), valid_acc, auc

def training(model, df, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    train, val = data_sample(df)
    track_val_loss = 1000
    for i in range(epochs):
        train_loss, train_acc = train_one_epoch(model, train, optimizer)
        valid_loss, valid_acc, auc = valid_metrics(model, val) 
        if i%10== 0:
            print("train loss %.3f train acc %.3f valid loss %.3f valid acc %.3f roc auc acc %.3f" % (train_loss,train_acc,valid_loss, valid_acc, auc))
        if i%2 == 0: 
            train, val = data_sample(df)

In [17]:
model_1 = MF(df.user_id.max()+1, df.item_id.max()+1, emb_size= 75)
training(model_1, df, epochs=126, lr=.12, wd=1e-6)

train loss 0.693 train acc 0.500 valid loss 0.650 valid acc 0.805 roc auc acc 0.819
train loss 0.235 train acc 0.938 valid loss 0.264 valid acc 0.926 roc auc acc 0.977
train loss 0.185 train acc 0.957 valid loss 0.215 valid acc 0.942 roc auc acc 0.984
train loss 0.178 train acc 0.964 valid loss 0.202 valid acc 0.952 roc auc acc 0.988
train loss 0.174 train acc 0.967 valid loss 0.198 valid acc 0.955 roc auc acc 0.990
train loss 0.172 train acc 0.968 valid loss 0.196 valid acc 0.956 roc auc acc 0.991
train loss 0.171 train acc 0.969 valid loss 0.195 valid acc 0.957 roc auc acc 0.991
train loss 0.171 train acc 0.969 valid loss 0.194 valid acc 0.957 roc auc acc 0.991
train loss 0.170 train acc 0.969 valid loss 0.194 valid acc 0.957 roc auc acc 0.991
train loss 0.171 train acc 0.969 valid loss 0.193 valid acc 0.958 roc auc acc 0.992
train loss 0.170 train acc 0.969 valid loss 0.193 valid acc 0.958 roc auc acc 0.992
train loss 0.170 train acc 0.969 valid loss 0.193 valid acc 0.958 roc auc ac

### New Model: MF3, concat context feature and item feature

In [119]:
class MF3(nn.Module):
    def __init__(self, num_users, num_items,num_context_feature ,num_item_feature, 
                 user_emb_size=20, item_emb_size=20, item_context_emb_size=8, item_feature_emb_size=8):
        super(MF3, self).__init__()
#         torch.manual_seed(seed)
        self.user_emb = nn.Embedding(num_users, user_emb_size)
        self.item_emb = nn.Embedding(num_items, item_emb_size)
        self.context_feature_emb = nn.Embedding(num_context_feature, item_context_emb_size)
        self.item_feature_emb = nn.Embedding(num_item_feature, item_feature_emb_size)
        # init
        self.item_feature_emb.weight.data.uniform_(0,0.05)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.classifier = nn.Sigmoid()
        self.linear1 = nn.Linear(user_emb_size+item_emb_size+item_context_emb_size+item_feature_emb_size, 5)
        self.dropout = nn.Dropout(p = 0.05)
        self.nonlin = nn.LeakyReLU()
        self.linear2 = nn.Linear(5, 1)
        
    def forward(self, u, v, c, i):
        U = self.user_emb(u)
        V = self.item_emb(v)
        C = self.context_feature_emb(c)
        I = self.item_feature_emb(i)
        ensemble = torch.cat((U,V,C,I),dim=1)
        pred = self.linear1(ensemble)
        pred = self.dropout(pred)
        pred = self.nonlin(pred)
        pred = self.linear2(pred)
        return self.classifier(pred.squeeze())

In [8]:
def train_one_epoch(model, train_df, optimizer):
    """ Trains the model for one epoch"""
    model.train()
    y = torch.FloatTensor(train_df.label.values)
    u = torch.LongTensor(train_df.user_id.values)
    v = torch.LongTensor(train_df.item_id.values)
    c = torch.LongTensor(train_df.context_feature_id.values)
    i = torch.LongTensor(train_df.item_feature_id.values)
    y_hat = model(u,v,c,i)
    output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
    train_acc = accuracy_score(output,y)
    train_loss = F.binary_cross_entropy(y_hat, y)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    return train_loss.item(), train_acc

def valid_metrics(model, valid_df):
    """Computes validation loss and accuracy"""
    model.eval()
    u = torch.LongTensor(valid_df.user_id.values)
    v = torch.LongTensor(valid_df.item_id.values)
    y = torch.FloatTensor(valid_df.label.values)
    c = torch.LongTensor(valid_df.context_feature_id.values)
    i = torch.LongTensor(valid_df.item_feature_id.values)
    y_hat = model(u,v,c,i)
    valid_loss = F.binary_cross_entropy(y_hat, y)
    output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
    auc = roc_auc_score(y.detach().numpy(), y_hat.detach().numpy())
    valid_acc = accuracy_score(output,y)
    return valid_loss.item(), valid_acc, auc

def training(model, df, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    train, val = data_sample(df)
    track_val_loss = 1000
    for i in range(epochs):
        train_loss, train_acc = train_one_epoch(model, train, optimizer)
        valid_loss, valid_acc, auc = valid_metrics(model, val) 
        if i%5== 0:
            print("train loss %.3f train acc %.3f valid loss %.3f valid acc %.3f roc auc acc %.3f" % (train_loss,train_acc,valid_loss, valid_acc, auc))
        
#         if track_val_loss <= valid_loss:
#             break
#         track_val_loss = valid_loss
#         if i%2 == 0: 
#             train, val = data_sample(df)
        

In [328]:
model_2 = MF(df.user_id.max()+1, df.item_id.max()+1, 
              df.context_feature_id.max()+1, df.item_feature_id.max()+1, emb_size=50)
training(model_2, df, epochs=126, lr=.05, wd=1e-6)

train loss 0.668 train acc 0.667 valid loss 0.646 valid acc 0.667 roc auc acc 0.577
train loss 0.386 train acc 0.668 valid loss 0.390 valid acc 0.879 roc auc acc 0.933
train loss 0.323 train acc 0.887 valid loss 0.362 valid acc 0.871 roc auc acc 0.933
train loss 0.293 train acc 0.887 valid loss 0.336 valid acc 0.871 roc auc acc 0.937
train loss 0.278 train acc 0.892 valid loss 0.312 valid acc 0.880 roc auc acc 0.940
train loss 0.264 train acc 0.897 valid loss 0.294 valid acc 0.885 roc auc acc 0.943
train loss 0.253 train acc 0.901 valid loss 0.291 valid acc 0.886 roc auc acc 0.944
train loss 0.249 train acc 0.902 valid loss 0.293 valid acc 0.885 roc auc acc 0.944
train loss 0.248 train acc 0.903 valid loss 0.292 valid acc 0.885 roc auc acc 0.945
train loss 0.246 train acc 0.903 valid loss 0.287 valid acc 0.887 roc auc acc 0.946
train loss 0.244 train acc 0.904 valid loss 0.280 valid acc 0.889 roc auc acc 0.948
train loss 0.241 train acc 0.904 valid loss 0.277 valid acc 0.889 roc auc ac

In [133]:
model_5 = MF3(df.user_id.max()+1, df.item_id.max()+1, 
              df.context_feature_id.max()+1, df.item_feature_id.max()+1, emb_size= 75)
training(model_5, df, epochs=100, lr=.01, wd=1e-3)

train loss 0.693 train acc 0.500 valid loss 0.693 valid acc 0.528 roc auc acc 0.559
train loss 0.688 train acc 0.613 valid loss 0.686 valid acc 0.637 roc auc acc 0.714
train loss 0.667 train acc 0.649 valid loss 0.661 valid acc 0.646 roc auc acc 0.744
train loss 0.624 train acc 0.672 valid loss 0.613 valid acc 0.677 roc auc acc 0.774
train loss 0.566 train acc 0.738 valid loss 0.555 valid acc 0.749 roc auc acc 0.827
train loss 0.506 train acc 0.798 valid loss 0.494 valid acc 0.807 roc auc acc 0.870
train loss 0.450 train acc 0.826 valid loss 0.442 valid acc 0.831 roc auc acc 0.897
train loss 0.412 train acc 0.842 valid loss 0.408 valid acc 0.844 roc auc acc 0.910
train loss 0.385 train acc 0.850 valid loss 0.385 valid acc 0.851 roc auc acc 0.916
train loss 0.370 train acc 0.854 valid loss 0.372 valid acc 0.854 roc auc acc 0.919
train loss 0.361 train acc 0.857 valid loss 0.366 valid acc 0.856 roc auc acc 0.920
train loss 0.357 train acc 0.858 valid loss 0.362 valid acc 0.856 roc auc ac

In [134]:
model_6 = MF3(df.user_id.max()+1, df.item_id.max()+1, 
              df.context_feature_id.max()+1, df.item_feature_id.max()+1, 
              user_emb_size= 100, item_emb_size=130,
              item_context_emb_size=20,item_feature_emb_size=30)
training(model_6, df, epochs=120, lr=.01, wd=1e-3)

train loss 0.712 train acc 0.500 valid loss 0.701 valid acc 0.501 roc auc acc 0.554
train loss 0.689 train acc 0.576 valid loss 0.684 valid acc 0.581 roc auc acc 0.622
train loss 0.660 train acc 0.661 valid loss 0.652 valid acc 0.678 roc auc acc 0.737
train loss 0.618 train acc 0.694 valid loss 0.608 valid acc 0.700 roc auc acc 0.772
train loss 0.560 train acc 0.743 valid loss 0.545 valid acc 0.750 roc auc acc 0.812
train loss 0.488 train acc 0.809 valid loss 0.476 valid acc 0.816 roc auc acc 0.870
train loss 0.438 train acc 0.823 valid loss 0.430 valid acc 0.831 roc auc acc 0.905
train loss 0.403 train acc 0.840 valid loss 0.400 valid acc 0.846 roc auc acc 0.913
train loss 0.384 train acc 0.847 valid loss 0.382 valid acc 0.851 roc auc acc 0.916
train loss 0.371 train acc 0.851 valid loss 0.372 valid acc 0.854 roc auc acc 0.919
train loss 0.363 train acc 0.854 valid loss 0.365 valid acc 0.856 roc auc acc 0.920
train loss 0.357 train acc 0.855 valid loss 0.360 valid acc 0.857 roc auc ac

In [308]:
model_8 = MF3(df.user_id.max()+1, df.item_id.max()+1, 
              df.context_feature_id.max()+1, df.item_feature_id.max()+1, 
              user_emb_size= 100, item_emb_size=130,
              item_context_emb_size=20,item_feature_emb_size=30)
training(model_8, df, epochs=41, lr=.01, wd=1e-5)

train loss 0.693 train acc 0.500 valid loss 0.690 valid acc 0.581 roc auc acc 0.664
train loss 0.651 train acc 0.797 valid loss 0.635 valid acc 0.819 roc auc acc 0.886
train loss 0.545 train acc 0.846 valid loss 0.521 valid acc 0.847 roc auc acc 0.902
train loss 0.460 train acc 0.863 valid loss 0.451 valid acc 0.859 roc auc acc 0.902
train loss 0.415 train acc 0.864 valid loss 0.409 valid acc 0.859 roc auc acc 0.909
train loss 0.357 train acc 0.864 valid loss 0.363 valid acc 0.858 roc auc acc 0.917
train loss 0.325 train acc 0.870 valid loss 0.349 valid acc 0.859 roc auc acc 0.922
train loss 0.304 train acc 0.876 valid loss 0.348 valid acc 0.859 roc auc acc 0.923
train loss 0.286 train acc 0.883 valid loss 0.352 valid acc 0.857 roc auc acc 0.922


In [309]:
model_9 = MF3(df.user_id.max()+1, df.item_id.max()+1, 
              df.context_feature_id.max()+1, df.item_feature_id.max()+1, 
              user_emb_size= 40, item_emb_size=40,
              item_context_emb_size=20,item_feature_emb_size=30)
training(model_9, df, epochs=41, lr=.01, wd=1e-5)

train loss 0.695 train acc 0.500 valid loss 0.693 valid acc 0.501 roc auc acc 0.559
train loss 0.688 train acc 0.546 valid loss 0.686 valid acc 0.577 roc auc acc 0.722
train loss 0.657 train acc 0.721 valid loss 0.645 valid acc 0.720 roc auc acc 0.798
train loss 0.576 train acc 0.735 valid loss 0.557 valid acc 0.749 roc auc acc 0.858
train loss 0.452 train acc 0.852 valid loss 0.436 valid acc 0.852 roc auc acc 0.904
train loss 0.376 train acc 0.868 valid loss 0.378 valid acc 0.860 roc auc acc 0.916
train loss 0.336 train acc 0.871 valid loss 0.358 valid acc 0.857 roc auc acc 0.920
train loss 0.313 train acc 0.874 valid loss 0.356 valid acc 0.856 roc auc acc 0.921
train loss 0.300 train acc 0.874 valid loss 0.358 valid acc 0.855 roc auc acc 0.920


In [135]:
test = pd.read_csv('test_kaggle.csv')
test = test.merge(item, on = 'item_id', how = 'left')
u = torch.LongTensor(test.user_id.values)
v = torch.LongTensor(test.item_id.values)
c = torch.LongTensor(test.context_feature_id.values)
i = torch.LongTensor(test.item_feature_id.values)
y_hat = model_5(u,v,c,i)
y_hat

tensor([0.6507, 0.3761, 0.8977,  ..., 0.9230, 0.9230, 0.1248],
       grad_fn=<SigmoidBackward0>)

In [136]:
prob = pd.Series(y_hat.detach().numpy()).reset_index().rename(columns = {'index':'id',0:'rating'})
sum(prob.rating>0.5)/len(prob)

0.5306160441548566

### overfitting

In [305]:
model_7 = MF3(df.user_id.max()+1, df.item_id.max()+1, 
              df.context_feature_id.max()+1, df.item_feature_id.max()+1, user_emb_size=100, item_emb_size=120,
              item_context_emb_size=16,item_feature_emb_size=20)
training(model_7, df, epochs=81, lr=.01, wd=1e-5)

train loss 0.726 train acc 0.500 valid loss 0.724 valid acc 0.500 roc auc acc 0.473
train loss 0.716 train acc 0.500 valid loss 0.715 valid acc 0.500 roc auc acc 0.581
train loss 0.707 train acc 0.500 valid loss 0.705 valid acc 0.500 roc auc acc 0.597
train loss 0.691 train acc 0.500 valid loss 0.686 valid acc 0.500 roc auc acc 0.710
train loss 0.655 train acc 0.696 valid loss 0.642 valid acc 0.701 roc auc acc 0.858
train loss 0.577 train acc 0.733 valid loss 0.553 valid acc 0.746 roc auc acc 0.892
train loss 0.496 train acc 0.822 valid loss 0.478 valid acc 0.847 roc auc acc 0.900
train loss 0.450 train acc 0.857 valid loss 0.438 valid acc 0.859 roc auc acc 0.902
train loss 0.416 train acc 0.862 valid loss 0.412 valid acc 0.858 roc auc acc 0.899
train loss 0.360 train acc 0.869 valid loss 0.368 valid acc 0.857 roc auc acc 0.911
train loss 0.315 train acc 0.880 valid loss 0.363 valid acc 0.853 roc auc acc 0.918
train loss 0.296 train acc 0.888 valid loss 0.379 valid acc 0.848 roc auc ac

### Extract model parmeters and replace with average embeddings

In [151]:
test = pd.read_csv('test_kaggle.csv')
test = test.merge(item, on = 'item_id', how = 'left')
para = [i for i in model_5.parameters()]
user_mean = torch.mean(para[0], dim = 0)
item_mean = torch.mean(para[1], dim = 0)
user_context_mean = torch.mean(para[2], dim=0)
item_feature_mean = torch.mean(para[3], dim=0)


x = []
train_user = set(train.user_id)
for i in test.user_id:
    if i in train_user:
        x.append(para[0][i])
    else:
        x.append(user_mean)
x = torch.stack(x)

y = []
train_item = set(train.item_id)
for i in test.item_id:
    if i in train_item:
        y.append(para[1][i])
    else:
        y.append(item_mean)
y = torch.stack(y)

w = []
train_user_context = set(train.context_feature_id)
for i in test.context_feature_id:
    if i in train_user_context:
        w.append(para[2][i])
    else:
        w.append(user_context_mean)
        
w = torch.stack(w)

z = []
train_item_feature = set(train.item_feature_id)
for i in test.item_feature_id:
    if i in train_item_feature:
        z.append(para[3][i])
    else:
        z.append(item_feature_mean)
        
z = torch.stack(z)

ensemble = torch.cat((x,y,w,z),dim=1)
pred = torch.matmul(ensemble, para[4].T)+para[5]
pred = torch.nn.functional.leaky_relu(pred)
pred = torch.matmul(pred, para[6].T)+para[7]
pred = torch.sigmoid(pred)
pred

tensor([[0.6507],
        [0.3761],
        [0.8977],
        ...,
        [0.9230],
        [0.9230],
        [0.1248]], grad_fn=<SigmoidBackward0>)

In [128]:
prob = pd.Series(pred.squeeze().detach().numpy()).reset_index().rename(columns = {'index':'id',0:'rating'})

In [130]:
prob.to_csv('trial54.csv',index = False)