In [8]:
import pandas as pd
import numpy as np
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

### MF Model: More Hyperparameter Tuning

In [9]:
item = pd.read_csv('item_feature.csv')
train = pd.read_csv('training.csv')
df = train.merge(item, on = 'item_id', how = 'left')
df['label'] = 1

In [10]:
# Random Negative Sample
u = np.random.randint(low=0.0, high=df.user_id.max(), size=int(len(df)))
i = np.random.randint(low=0.0, high=df.item_id.max(), size=int(len(df)))
c = np.random.randint(low=0.0, high=df.context_feature_id.max(), size=int(len(df)))

In [11]:
sample= pd.concat([pd.Series(u),pd.Series(i),pd.Series(c)], axis =1).\
rename(columns={0:'user_id', 1:'item_id', 2:'context_feature_id'})
sample = sample.merge(item, on = 'item_id', how = 'left')
sample['label'] = 0
df = pd.concat([df,sample])
df = df.drop_duplicates(subset=['user_id','item_id']).reset_index(drop = True)

In [12]:
pos = df[df.label ==1].reset_index(drop = True)
neg = df[df.label ==0].reset_index(drop = True)

In [16]:
def data_sample(data):

    train=data.sample(frac=0.8)
    val=data.drop(train.index)
    return train, val

In [13]:
class MF2(nn.Module):
    def __init__(self, num_users, num_items, emb_size=20):
        super(MF2, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # init 
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        self.classifier = nn.Sigmoid()
        self.nonlin = nn.ReLU()
        self.drop = nn.Dropout(p = 0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        U = self.nonlin(U)
        V = self.item_emb(v)
        V = self.drop(V)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return self.classifier((U*V).sum(1) +  b_u  + b_v)

In [48]:
def train_one_epoch(model, train_df, optimizer):
    """ Trains the model for one epoch"""
    model.train()
    y = torch.FloatTensor(train_df.label.values)
    u = torch.LongTensor(train_df.user_id.values)
    v = torch.LongTensor(train_df.item_id.values)
    y_hat = model(u,v)
    output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
    train_acc = accuracy_score(output,y)
    train_loss = F.binary_cross_entropy(y_hat, y)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    return train_loss.item(), train_acc

def valid_metrics(model, valid_df):
    """Computes validation loss and accuracy"""
    model.eval()
    u = torch.LongTensor(valid_df.user_id.values)
    v = torch.LongTensor(valid_df.item_id.values)
    y = torch.FloatTensor(valid_df.label.values)
    y_hat = model(u,v)
    valid_loss = F.binary_cross_entropy(y_hat, y)
    output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
    auc = roc_auc_score(y.detach().numpy(), y_hat.detach().numpy())
    valid_acc = accuracy_score(output,y)
    return valid_loss.item(), valid_acc, auc

def training(model, df, epochs=10, lr=0.01, wd=0.0, whole=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    if whole:
        train = df
    else:
        train, val = data_sample(df)
    for i in range(epochs):
        if whole:
            train_loss, train_acc = train_one_epoch(model, train, optimizer)
            if i%10== 0:
                print("train loss %.3f train acc %.3f" % (train_loss,train_acc))
        else:
            train_loss, train_acc = train_one_epoch(model, train, optimizer)
            valid_loss, valid_acc, auc = valid_metrics(model, val) 
            if i%10== 0:
                print("train loss %.3f train acc %.3f valid loss %.3f valid acc %.3f roc auc acc %.3f" % (train_loss,train_acc,valid_loss, valid_acc, auc)) 
#         if i%2 == 0: 
#             train, val = data_sample(df)
        

In [46]:
model = MF2(df.user_id.max()+1, df.item_id.max()+1, emb_size=85) # our best result with test data
training(model, df, epochs=126, lr=.06, wd=1e-6)

train loss 0.695 train acc 0.471 valid loss 0.652 valid acc 0.823 roc auc acc 0.869
train loss 0.222 train acc 0.926 valid loss 0.357 valid acc 0.870 roc auc acc 0.927
train loss 0.190 train acc 0.966 valid loss 0.333 valid acc 0.879 roc auc acc 0.941
train loss 0.174 train acc 0.968 valid loss 0.315 valid acc 0.884 roc auc acc 0.945
train loss 0.169 train acc 0.972 valid loss 0.306 valid acc 0.889 roc auc acc 0.948
train loss 0.165 train acc 0.974 valid loss 0.298 valid acc 0.893 roc auc acc 0.951
train loss 0.163 train acc 0.975 valid loss 0.292 valid acc 0.897 roc auc acc 0.953
train loss 0.162 train acc 0.976 valid loss 0.288 valid acc 0.900 roc auc acc 0.955
train loss 0.161 train acc 0.976 valid loss 0.285 valid acc 0.902 roc auc acc 0.956
train loss 0.161 train acc 0.977 valid loss 0.283 valid acc 0.904 roc auc acc 0.957
train loss 0.161 train acc 0.977 valid loss 0.281 valid acc 0.905 roc auc acc 0.958
train loss 0.160 train acc 0.977 valid loss 0.281 valid acc 0.906 roc auc ac

In [49]:
model = MF2(df.user_id.max()+1, df.item_id.max()+1, emb_size=85) # our best result with test data
training(model, df, epochs=126, lr=.06, wd=1e-6, whole = True)

train loss 0.695 train acc 0.471
train loss 0.239 train acc 0.915
train loss 0.205 train acc 0.958
train loss 0.186 train acc 0.962
train loss 0.181 train acc 0.966
train loss 0.176 train acc 0.969
train loss 0.174 train acc 0.970
train loss 0.173 train acc 0.971
train loss 0.172 train acc 0.971
train loss 0.172 train acc 0.971
train loss 0.172 train acc 0.972
train loss 0.172 train acc 0.972
train loss 0.172 train acc 0.972


In [50]:
test = pd.read_csv('test_kaggle.csv')
u = torch.LongTensor(test.user_id.values)
v = torch.LongTensor(test.item_id.values)
y_hat = model(u,v)
y_hat

tensor([0.3890, 0.3308, 0.7549,  ..., 0.8805, 0.8728, 0.1300],
       grad_fn=<SigmoidBackward0>)

In [51]:
prob = pd.Series(y_hat.detach().numpy()).reset_index().rename(columns = {'index':'id',0:'rating'})
sum(prob.rating>0.5)/len(prob)

0.45943338096673964

In [52]:
prob.to_csv('trial77.csv',index = False)

### NN Model: The best result of 0.411

Added a break condition when val_loss<0.3 to avoid overfitting

In [None]:
cuda = torch.device('cuda')

In [None]:
item = pd.read_csv('item_feature.csv')
train = pd.read_csv('training.csv')
df = train.merge(item, on = 'item_id', how = 'left')
df['label'] =1
train = df.copy()

In [None]:
u = np.random.randint(low=0.0, high=df.user_id.max(), size=int(len(df)*2))
i = np.random.randint(low=0.0, high=df.item_id.max(), size=int(len(df)*2))
p = (df.context_feature_id.value_counts()/len(df)).sort_index().values
p[p.argmin()]+=1-p.sum()
c = np.random.choice(4, len(df)*10, p=p)

In [None]:
sample= pd.concat([pd.Series(u),pd.Series(i)], axis =1).\
rename(columns={0:'user_id', 1:'item_id'})
sample = sample.merge(item, on = 'item_id', how = 'left')
sample['label'] = 0
df = pd.concat([df,sample])
df = df.drop_duplicates(subset=['user_id','item_id']).reset_index(drop = True)
df = df[df['label']==0]
df = pd.concat([train, df]).reset_index(drop=True)

In [None]:
pos = df[df.label ==1].reset_index(drop = True)
neg = df[df.label ==0].reset_index(drop = True)

In [None]:
def data_sample(pos, neg):

  msk = np.random.rand(len(pos)) < 0.8
  train_pos = pos[msk].reset_index(drop = True)
  val_pos = pos[~msk].reset_index(drop = True)

  msk = np.random.rand(len(neg)) < 0.8
  train_neg = neg[msk].sample(frac = len(pos)/len(neg)).reset_index(drop = True)
  val_neg = neg[~msk].sample(frac = len(pos)/len(neg)).reset_index(drop = True)

  train = pd.concat([train_pos, train_neg]).sample(frac=1).reset_index(drop = True)
  val = pd.concat([val_pos, val_neg]).sample(frac=1).reset_index(drop = True)
  
  return train, val

In [None]:
train, val = data_sample(pos, neg)

In [None]:
class NN(nn.Module):
    def __init__(self, num_users, num_items, num_item_feature, emb_user_size=20, 
                 emb_item_size=20, emb_item_feature_size=20):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_user_size)
        self.item_emb = nn.Embedding(num_items, emb_item_size)
        self.item_feature_emb = nn.Embedding(num_item_feature, emb_item_feature_size)
        # init 
        self.item_feature_emb.weight.data.uniform_(0,0.05)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.classifier = nn.Sigmoid()
        self.linear1 = nn.Linear(emb_user_size+emb_item_size+emb_item_feature_size, 2)
        self.nonlin = nn.LeakyReLU()
        self.linear3 = nn.Linear(2,1)
        
    def forward(self, u, v, c):
        U = self.user_emb(u)
        V = self.item_emb(v)
        C = self.item_feature_emb(c)
        ensemble = torch.cat((U,V,C),dim=1)
        pred = self.linear1(ensemble)
        pred = self.nonlin(pred)
        pred = self.linear3(pred)
        return self.classifier(pred.squeeze())

  
    def __getitem__(self,idx):

        return self.U[idx], self.V[idx], self.C[idx]

In [None]:
def train_one_epoch(model, train_df, optimizer):
  tensor_x_tr = torch.LongTensor(np.array(train_df[['user_id', 'item_id', 'item_feature_id']])) # transform to torch tensor
  tensor_y_tr = torch.Tensor(train_df['label'])
  train_ds = TensorDataset(tensor_x_tr,tensor_y_tr)
  train_dl = DataLoader(train_ds, batch_size=50000, shuffle=True)
  """ Trains the model for one epoch"""
  ### BEGIN SOLUTION
  losses = []
  acc=[]
  for x, y in train_dl:
      model.train()
      y = y.cuda()
      u = torch.LongTensor(x[:,0]).cuda()
      v = torch.LongTensor(x[:,1]).cuda()
      c = torch.LongTensor(x[:,2]).cuda()
      y_hat = model(u,v,c)
      output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
      loss = F.binary_cross_entropy(y_hat, y.float())
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      losses.append(loss.item())
      acc.append(accuracy_score(output.cpu().detach().numpy(),y.cpu().detach().numpy()))
      
  train_loss = np.mean(losses)
  train_acc = np.mean(acc)

    ### END SOLUTION
  return train_loss, train_acc

def valid_metrics(model, valid_df):
    """Computes validation loss and accuracy"""
    model.eval()
    ### BEGIN SOLUTION
    u = torch.LongTensor(valid_df.user_id.values).cuda()
    v = torch.LongTensor(valid_df.item_id.values).cuda()
    c = torch.LongTensor(valid_df.item_feature_id.values).cuda()
    y = torch.FloatTensor(valid_df.label.values).cuda()
    y_hat = model(u,v,c)
    valid_loss = F.binary_cross_entropy(y_hat, y)
    output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
    auc = roc_auc_score(y.cpu().detach().numpy(), y_hat.cpu().detach().numpy())
    valid_acc = accuracy_score(output.cpu().detach().numpy(),y.cpu().detach().numpy())
    ### END SOLUTION
    return valid_loss.item(), valid_acc, auc

def training(model, pos, neg, epochs=10, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    train, val = data_sample(pos, neg)
    for i in range(epochs):
        train_loss, train_acc = train_one_epoch(model, train, optimizer)
        valid_loss, valid_acc, auc = valid_metrics(model, val) 
        if valid_loss<0.3: 
          print("train loss %.3f train acc %.3f valid loss %.3f valid acc %.3f roc auc acc %.3f" % (train_loss,train_acc,valid_loss, valid_acc, auc)) 
          break
        if i%2 == 0: 
          train, val = data_sample(pos, neg)
        if i%5 == 0:
          print("train loss %.3f train acc %.3f valid loss %.3f valid acc %.3f roc auc acc %.3f" % (train_loss,train_acc,valid_loss, valid_acc, auc)) 
        


In [None]:
model = MF(df.user_id.max()+1, df.item_id.max()+1, df.item_feature_id.max()+1,
           emb_user_size=32, emb_item_size=24, emb_item_feature_size=8).cuda()
#optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
training(model, pos, neg, epochs=51, lr=0.0005, wd=1e-6)

train loss 0.749 train acc 0.500 valid loss 0.746 valid acc 0.500 roc auc acc 0.886
train loss 0.647 train acc 0.500 valid loss 0.627 valid acc 0.500 roc auc acc 0.935
train loss 0.429 train acc 0.875 valid loss 0.414 valid acc 0.874 roc auc acc 0.945
train loss 0.320 train acc 0.879 valid loss 0.314 valid acc 0.880 roc auc acc 0.954
train loss 0.305 train acc 0.883 valid loss 0.307 valid acc 0.881 roc auc acc 0.954


In [None]:
# stop, but continue training until a second stop
training(model, pos, neg, epochs=51, lr=0.0001, wd=1e-6)

train loss 0.300 train acc 0.884 valid loss 0.300 valid acc 0.883 roc auc acc 0.956


In [None]:
test = pd.read_csv('test_kaggle.csv')
test = test.merge(item, on = 'item_id', how = 'left')
u = torch.LongTensor(test.user_id.values).cuda()
v = torch.LongTensor(test.item_id.values).cuda()
c = torch.LongTensor(test.item_feature_id.values).cuda()
y_hat = model(u,v,c)
y_hat

tensor([0.5350, 0.2149, 0.8678,  ..., 0.9268, 0.9268, 0.1153], device='cuda:0',
       grad_fn=<SigmoidBackward0>)

In [None]:
np.mean(y_hat.cpu().detach().numpy())

0.51798266

In [None]:
prob = pd.Series(y_hat.cpu().detach().numpy()).reset_index().rename(columns = {'index':'id',0:'rating'})
sum(prob.rating>0.5)/len(prob)

0.5012703698362547

In [None]:
prob.to_csv('trial63.csv',index = False)

### Ensemble our best results

Decreased the variance by taking average of our trials whose scores are under around 0.415.

In [2]:
t77 = pd.read_csv('trial77.csv')
t40 = pd.read_csv('trial40.csv')
t41 = pd.read_csv('trial41.csv')
t63 = pd.read_csv('trial63.csv')
t68 = pd.read_csv('trial68.csv')

In [3]:
t78 = t77.copy()
t78.rating = (t77.rating+t40.rating+t41.rating+t63.rating+t68.rating)/5

In [6]:
(t78.rating - t77.rating).describe()

count    381385.000000
mean          0.002731
std           0.025649
min          -0.231606
25%          -0.010564
50%           0.003496
75%           0.016680
max           0.245898
Name: rating, dtype: float64

In [8]:
t78.to_csv('trial78.csv', index=False)