In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch import optim
from torch.autograd import Variable
from torch.utils.data.dataset import TensorDataset
from torch.utils.data.dataloader import DataLoader 

In [3]:
def normalize(df):
    df = df.iloc[:,1:] 
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

In [4]:
data = pd.read_csv('./data-processed/detailed/inputs.csv.xz')
data_wk = data.copy()
data_wk = data_wk.replace([np.inf, -np.inf], np.nan)
data_wky = data_wk.dropna(axis=1)
outy = pd.read_csv('./data-processed/detailed/outputs.csv')
cvv = pd.read_csv('./data-processed/detailed/folds.csv')

In [5]:
data_wky = normalize(data_wky)

In [6]:
def data_sepration(in_data, out, cvv, val_id):
    """
    convert the df to dataset and seperate between
    train and validation set
    """
    trainset = torch.from_numpy(in_data[cvv['fold'] != val_id].values)
    traintarget = torch.from_numpy(out[cvv['fold'] != val_id].iloc[:,1:].values)
    valset = torch.from_numpy(in_data[cvv['fold'] == val_id].values)
    valtarget = torch.from_numpy(out[cvv['fold'] == val_id].iloc[:,1:].values)
    return trainset, valset, traintarget, valtarget

In [7]:
i_set = 1
trainset, valset, traintarget, valtarget = data_sepration(data_wky, outy, cvv, i_set)
traindataset = TensorDataset(trainset.unsqueeze(1), traintarget.unsqueeze(1))
valdataset = TensorDataset(valset.unsqueeze(1), valtarget.unsqueeze(1))
trainLoader = DataLoader(traindataset, shuffle=True, batch_size=10000)
valLoader = DataLoader(valdataset, shuffle=True, batch_size=40)

In [11]:
def accuarcy(pred, true):
    true = true.squeeze(1)
    pred = pred.reshape(-1)
    count = torch.sum((pred > true[:,0]) & (pred < true[:,1]))
#     print(pred)
#     print(true[:,0])
#     print(true[:,1])
#     print(100*count/pred.size(0))
    return 100.0*count/pred.size(0)

In [17]:
class SquareHingeLoss(nn.Module):
    def __init__(self):
        super(SquareHingeLoss,self).__init__()
    
    def ifelse(self, condition, a, b):
        crit = (condition >= 0).squeeze(1)
        copy_con = condition.clone()
        copy_con[crit] = condition[crit] ** 2
        copy_con[~crit] = b
        return copy_con

    def phi(self, in_phi):
        return self.ifelse(in_phi, in_phi**2, 0) 
       
    def forward(self, x, target_y):
#         print(torch.mean(self.phi(-x + target_y[:,:,0] + 1) + self.phi(x - target_y[:,:,1] + 1)))
        return torch.mean(self.phi(- x + target_y[:,:,0] + 1) + self.phi(x - target_y[:,:,1] + 1))

In [18]:
class convNet(nn.Module):
    def __init__(self):
        super(convNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(256, 1024),
            nn.Linear(1024, 128),
            nn.Linear(128, 1)
        )
        
    def forward(self, x):
        x = x.float()
        x = self.layer1(x)
        return x

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [20]:
model = convNet().to(device)
criterion = SquareHingeLoss()
optimier = optim.Adam(model.parameters(), lr=1e-4, weight_decay=5e-4)

In [None]:
e = 0
num_epoches = 3000
train_loss_record = np.zeros(num_epoches)
train_acc_record = np.zeros(num_epoches)
test_loss_record = np.zeros(num_epoches)
test_acc_record = np.zeros(num_epoches)
for epoch in range(num_epoches):
    loss_value, iter_num, print_loss = 0, 0, 0
    acc = 0
    for data in trainLoader:
        e += 1
        iter_num += 1
        inputs, targets = data
        inputs = Variable(inputs).to(device)
        targets = Variable(targets).to(device)
        inputs = inputs.type(torch.DoubleTensor).to(device)
        out = model(inputs)
        loss = criterion(out, targets.float())
        optimier.zero_grad()
        loss.backward()
        optimier.step()
        
        print_loss += loss.cpu().data.numpy()
        acc += accuarcy(out.cpu().data, targets.cpu().data.float()).data.numpy()
    
    test_in = Variable(valdataset.tensors[0]).to(device)
    test_in = test_in.type(torch.DoubleTensor).to(device)
    test_out = model(test_in)
    test_loss = criterion(test_out, Variable(valdataset.tensors[1].cuda()).float())
    test_loss = test_loss.cpu().data.numpy()
    test_acc = accuarcy(test_out.cpu().data, valdataset.tensors[1].float()).data.numpy()
        
    print('-'* 120)
    print('Epoch [{:-03d}/{}]  |  Train Loss:  {:.3f}  |  Test Loss:  {:.3f}  |  Test Accuarcy:  {:.3f}  |  Train Accuracy:  {:.3f}'
          .format(epoch+1, num_epoches, print_loss/iter_num, test_loss, test_acc, acc/iter_num))
    train_loss_record[epoch] = print_loss/iter_num
    test_loss_record[epoch] = test_loss
    train_acc_record[epoch] = acc/iter_num
    test_acc_record[epoch] = test_acc

------------------------------------------------------------------------------------------------------------------------
Epoch [001/3000]  |  Train Loss:  0.511  |  Test Loss:  0.311  |  Test Accuarcy:  93.813  |  Train Accuracy:  88.665
------------------------------------------------------------------------------------------------------------------------
Epoch [002/3000]  |  Train Loss:  0.496  |  Test Loss:  0.299  |  Test Accuarcy:  93.645  |  Train Accuracy:  88.602
------------------------------------------------------------------------------------------------------------------------
Epoch [003/3000]  |  Train Loss:  0.487  |  Test Loss:  0.292  |  Test Accuarcy:  93.645  |  Train Accuracy:  89.049
------------------------------------------------------------------------------------------------------------------------
Epoch [004/3000]  |  Train Loss:  0.484  |  Test Loss:  0.289  |  Test Accuarcy:  93.645  |  Train Accuracy:  89.400
------------------------------------------------

In [None]:
plt.plot(test_acc_record)
plt.plot(train_acc_record)
plt.plot(train_acc_record)
plt.show()