In [1]:
import os 

import pandas as pd 
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix

from torch import nn, optim
import torch.nn.functional as F
import functools

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [2]:
class MLP(nn.Module):
    
    def __init__(self, input_dimension, num_class):
        super(MLP, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(input_dimension, 150),
            nn.ReLU(),
            nn.Linear(150, 50),
            nn.ReLU(),
            nn.Linear(50, num_class),
        )
    
    def forward(self, x):
        return self.model(x)

def generate_NN(func, input_dimension, num_class):
    return functools.partial(func, input_dimension, num_class)

In [3]:
estimator = generate_NN

cross-validation 

In [4]:
feat = ['ArmSwelling', 'BreastSwelling', 'Skin', 'SYM_COUNT', 'TIME_LAPSE_LOG']
data_train = pd.read_csv('./data/result_data/split_train_Oct21_stratified.csv')
data_train = data_train.drop(columns=['Unnamed: 0', 'Username'])

In [5]:
X = data_train[feat].values
Y = data_train.iloc[:,-1].values
X -= np.mean(X, axis=0)
X /= np.std(X, axis=0)

In [6]:
shuffle=True
random_state = 320

In [7]:
class StatDataSet(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        assert X.shape[0] == Y.shape[0], 'X and Y have different dimension'
    
    def __getitem__(self, index):
        x = torch.from_numpy(self.X[index]).float()
        y = torch.from_numpy(np.array(self.Y[index])).long()
        return x, y 
    
    def __len__(self):
        return self.X.shape[0]

In [8]:
def evaluator(X, y, num_repeated, estimator):
    n_splits = 8
    if num_repeated > 1:
        print("num_repeated is not 1")
        skf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=num_repeated, random_state=random_state)
    else:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    test_scores = []
    fold = 1
    
    model_generator = generate_NN(MLP, X.shape[1], 3)
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        trainset, valset = StatDataSet(X_train, y_train), StatDataSet(X_test, y_test)
        epoch_acc, epoch_loss = train_val_NN(model_generator, trainset, valset, fold)
        test_scores.append(epoch_acc)
        fold += 1
    return np.mean(test_scores), np.std(test_scores)



In [9]:
def train_val_NN(model, trainset, valset, fold): 
    # define learning rate 
    lr = 0.1
    num_epochs = 100
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # construct model 
    model = model()
    print(model)
    # define loss function
    criterion = nn.CrossEntropyLoss(weight=torch.tensor([0.15913545, 0.54254953, 0.29831502]))
    # move to device 
    model = model.to(device)
    criterion = criterion.to(device)
    # optimizer 
    optimizer = optim.Adadelta(model.parameters(), lr=lr)
    # construct dataloaders
    train_loader = DataLoader(trainset, batch_size=25, shuffle=True, num_workers=16)
    val_loader = DataLoader(valset, batch_size=5, num_workers=8)
    train_val_loaders = {'train':train_loader, 'val':val_loader}
    train_val_sizes = {x:len(train_val_loaders[x].dataset) for x in ['train', 'val']}
    # start training
    
    
    for epoch in range(1,num_epochs+1):
        
        
        for phase in ['train','val']:
            
            running_loss = 0
            running_corrects = 0
            
            if phase == 'train':
                model.train()
            else:
                model.eval()
                
            label_list = [] 
            pred_list = []
            
            for inputs, label in train_val_loaders[phase]:
                inputs = inputs.to(device)
                labels = label.to(device)

                optimizer.zero_grad()

                if phase == 'train':
                    logits = model(inputs)
                else:
                    with torch.no_grad():
                        logits = model(inputs)

                loss = criterion(logits, labels)
                

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                probs = nn.Softmax(dim=1)(logits)
                preds = torch.max(probs, 1)[1]
                
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
                label_list += list(labels.data)
                pred_list += list(preds)    
            
            epoch_loss = running_loss / train_val_sizes[phase]
            epoch_acc = running_corrects.double() / train_val_sizes[phase]
            print("[{}][{}] Epoch: {}/{} Loss: {:.4f} Acc: {:.4f}".format(phase, fold, epoch+1, num_epochs, epoch_loss, epoch_acc))
            
            CM = confusion_matrix(label_list, pred_list)
            print(CM)
            
            
    return epoch_acc, epoch_loss

In [10]:
estimator
mu, std = evaluator(X, Y, num_repeated=1, estimator=estimator)

MLP(
  (model): Sequential(
    (0): Linear(in_features=5, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=50, bias=True)
    (3): ReLU()
    (4): Linear(in_features=50, out_features=3, bias=True)
  )
)


[train][1] Epoch: 2/100 Loss: 1.0092 Acc: 0.5969
[[196 221   7]
 [ 24  76  24]
 [ 19  17 190]]
[val][1] Epoch: 2/100 Loss: 0.9384 Acc: 0.7857
[[51  9  1]
 [ 2  8  8]
 [ 2  2 29]]
[train][1] Epoch: 3/100 Loss: 0.8586 Acc: 0.8282
[[381  36   7]
 [ 36  51  37]
 [ 11   6 209]]
[val][1] Epoch: 3/100 Loss: 0.7968 Acc: 0.8661
[[58  3  0]
 [ 2  9  7]
 [ 2  1 30]]
[train][1] Epoch: 4/100 Loss: 0.7281 Acc: 0.8527
[[389  29   6]
 [ 32  61  31]
 [  9   7 210]]
[val][1] Epoch: 4/100 Loss: 0.6694 Acc: 0.8750
[[58  3  0]
 [ 3 10  5]
 [ 2  1 30]]
[train][1] Epoch: 5/100 Loss: 0.6288 Acc: 0.8527
[[389  30   5]
 [ 35  62  27]
 [  8   9 209]]
[val][1] Epoch: 5/100 Loss: 0.5725 Acc: 0.8750
[[58  3  0]
 [ 2 11  5]
 [ 1  3 29]]
[train][1] Epoch: 6/100 Loss: 0.5515 Acc: 0.8630
[[388  32   4]
 [ 28  76  20]
 [  4  18 204]]
[val][1] Epoch: 6/100 Loss: 0.5063 Acc: 0.8839
[[58  3  0]
 [ 1 12  5]
 [ 1  3 29]]
[train][1] Epoch: 7/100 Loss: 0.4948 Acc: 0.8734
[[387  34   3]
 [ 25  85  14]
 [  5  17 204]]
[val][1] E

In [11]:
mu, std

(0.9334087645297509, 0.018960069603184273)

Train and Test

In [12]:
feat = ['ArmSwelling', 'BreastSwelling', 'Skin', 'SYM_COUNT', 'TIME_LAPSE_LOG']

data_train = pd.read_csv('./data/result_data/split_train_Oct21_stratified.csv')
data_test = pd.read_csv('./data/result_data/split_test_Oct21_stratified.csv')
data_train = data_train.drop(columns=['Unnamed: 0', 'Username'])
data_test = data_test.drop(columns=['Unnamed: 0', 'Username'])

X_train = data_train[feat].values
Y_train = data_train.iloc[:,-1].values

X_test = data_test[feat].values
Y_test = data_test.iloc[:,-1].values

mean, std = np.mean(X_train, axis=0), np.std(X_train, axis=0)

X_train -= mean
X_train /= std

X_test -= mean
X_test /= std

In [13]:
X_train.shape

(886, 5)

In [14]:
X_test.shape

(178, 5)

In [15]:
model_generator = generate_NN(MLP, X.shape[1], 3)

In [16]:
trainset, testset = StatDataSet(X_train, Y_train), StatDataSet(X_test, Y_test)
epoch_acc, epoch_loss = train_val_NN(model_generator, trainset, testset, 0)

MLP(
  (model): Sequential(
    (0): Linear(in_features=5, out_features=150, bias=True)
    (1): ReLU()
    (2): Linear(in_features=150, out_features=50, bias=True)
    (3): ReLU()
    (4): Linear(in_features=50, out_features=3, bias=True)
  )
)
[train][0] Epoch: 2/100 Loss: 1.0300 Acc: 0.5000
[[169 297  19]
 [  6  93  43]
 [  1  77 181]]
[val][0] Epoch: 2/100 Loss: 0.9571 Acc: 0.6629
[[52 40  6]
 [ 0 17 11]
 [ 0  3 49]]
[train][0] Epoch: 3/100 Loss: 0.8493 Acc: 0.8251
[[412  65   8]
 [ 26  74  42]
 [  4  10 245]]
[val][0] Epoch: 3/100 Loss: 0.7853 Acc: 0.8146
[[80 13  5]
 [ 3 16  9]
 [ 0  3 49]]
[train][0] Epoch: 4/100 Loss: 0.6971 Acc: 0.8657
[[443  37   5]
 [ 32  81  29]
 [  5  11 243]]
[val][0] Epoch: 4/100 Loss: 0.6411 Acc: 0.8539
[[87  7  4]
 [ 5 17  6]
 [ 0  4 48]]
[train][0] Epoch: 5/100 Loss: 0.5842 Acc: 0.8736
[[450  30   5]
 [ 33  86  23]
 [  8  13 238]]
[val][0] Epoch: 5/100 Loss: 0.5529 Acc: 0.8708
[[87  8  3]
 [ 5 20  3]
 [ 1  3 48]]
[train][0] Epoch: 6/100 Loss: 0.5147 A