In [1]:
import os
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix, classification_report

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [14]:
DATA_FOLDER = "../data"
ABUNDANCE_FILE = "abundance_with_unique.csv"
ABUNDANCE_START = 212

PERCENT_INFO = 0.95
TEST_SPLIT = 0.2
SEED = 42

BATCH_SIZE = 32
LAYERS = [1024, 512, 256, 128, 64, 64, 32, 32]
LEARNING_RATE = 0.00005

In [3]:
abundance_data = pd.read_csv(os.path.join(DATA_FOLDER, ABUNDANCE_FILE), low_memory=False)

In [4]:
metadata = abundance_data.iloc[:, :ABUNDANCE_START]
abundance = abundance_data.iloc[:, ABUNDANCE_START:]

In [5]:
X_train = abundance.values
Y_train = metadata['disease']

healthy_classes = ['n', 'leaness', 'nd', ' -', 'n_relative', '-']

Y_train = Y_train.apply(lambda x: 'healthy' if x in healthy_classes else x)
Y_train = Y_train.apply(lambda x: 'psoriasis' if x == 'y' else x)

classes = Y_train.unique()
n_classes = len(classes)
Y_train = pd.Categorical(Y_train, categories=classes).codes

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [6]:
type(X_train_scaled), X_train_scaled.shape, type(Y_train), Y_train.shape

print(np.unique(Y_train, return_counts=True))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int8), array([2692,  164,   52,  148,   25,   36,  118,    5,   10,    1,  223,
         49,   48,   26,   13]))


In [7]:
lda = LinearDiscriminantAnalysis()
X_train_lda = lda.fit_transform(X_train_scaled, Y_train)

In [8]:
print("The number of components is", X_train_lda.shape[1])

The number of components is 14


In [9]:
class AdundancaDataset(Dataset):
    def __init__(self, data, target):
        self.X = data
        self.Y = target
        
        print("X shape", self.X.shape)
        print("Y shape", self.Y.shape)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.Y[idx], dtype=torch.long)

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train_lda, Y_train, test_size=TEST_SPLIT, random_state=SEED)

train_dataset = AdundancaDataset(X_train, Y_train)
test_dataset = AdundancaDataset(X_test, Y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

X shape (2888, 14)
Y shape (2888,)
X shape (722, 14)
Y shape (722,)


In [11]:
class MLP(nn.Module):
    def __init__(self, idim, odim, layers, batch_norm=True):
        super(MLP, self).__init__()
        self.layers = nn.Sequential()
        
        for i, layer in enumerate(layers):
            if i == 0:
                self.layers.add_module("fc{}".format(i), nn.Linear(idim, layer))
            else:
                self.layers.add_module("fc{}".format(i), nn.Linear(layers[i-1], layer))
            
            if batch_norm:
                self.layers.add_module("bn{}".format(i), nn.BatchNorm1d(layer))
            
            self.layers.add_module("relu{}".format(i), nn.ReLU())
            
        self.layers.add_module("fc{}".format(len(layers)), nn.Linear(layers[-1], odim))
        self.layers.add_module("softmax", nn.Softmax(dim=1))
        
    def forward(self, x):
        return self.layers(x)

In [12]:
# Training

def train(model, data_loader, criterion, optimizer):
    """
    Generic training function
    """
    model.train() # Set model to training mode
    running_loss = 0.0
    for _, data in enumerate(data_loader):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad() # Zero the gradients
        
        outputs = model(inputs) # Forward pass
        loss = criterion(outputs, labels) # Compute loss
        loss.backward() # Backward pass
        optimizer.step() # Update weights
        
        running_loss += loss.item()
    return running_loss / len(data_loader)

def evaluate(model, data_loader, criterion):
    """
    Generic evaluation function    
    """
    model.eval() # Set model to evaluation mode
    running_loss = 0.0
    correct = 0
    total = 0
    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        for _, data in enumerate(data_loader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())
            
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return running_loss / len(data_loader), correct / total, true_labels, predicted_labels

In [15]:
# Training

model = MLP(idim=X_train_lda.shape[1], odim=n_classes, layers=LAYERS)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(300):
    train_loss = train(model, train_loader, criterion, optimizer)
    _, train_accuracy, _, _ = evaluate(model, train_loader, criterion)
    _, test_accuracy, _, _ = evaluate(model, test_loader, criterion)
    
    if(epoch % 10 == 0):
        print("Epoch: {} Train Loss: {:.4f} Train Accuracy: {:.4f} Test Accuracy: {:.4f}".format(epoch, train_loss, train_accuracy, test_accuracy))

Epoch: 0 Train Loss: 2.6880 Train Accuracy: 0.5048 Test Accuracy: 0.4806
Epoch: 10 Train Loss: 2.3966 Train Accuracy: 0.8269 Test Accuracy: 0.8393
Epoch: 20 Train Loss: 2.1927 Train Accuracy: 0.8359 Test Accuracy: 0.8380
Epoch: 30 Train Loss: 2.0771 Train Accuracy: 0.8684 Test Accuracy: 0.8504
Epoch: 40 Train Loss: 2.0031 Train Accuracy: 0.8913 Test Accuracy: 0.8573
Epoch: 50 Train Loss: 1.9620 Train Accuracy: 0.9034 Test Accuracy: 0.8573
Epoch: 60 Train Loss: 1.9487 Train Accuracy: 0.9020 Test Accuracy: 0.8476
Epoch: 70 Train Loss: 1.9403 Train Accuracy: 0.9179 Test Accuracy: 0.8407
Epoch: 80 Train Loss: 1.9225 Train Accuracy: 0.9110 Test Accuracy: 0.8352
Epoch: 90 Train Loss: 1.9176 Train Accuracy: 0.9072 Test Accuracy: 0.8504
Epoch: 100 Train Loss: 1.9164 Train Accuracy: 0.9193 Test Accuracy: 0.8421
Epoch: 110 Train Loss: 1.9147 Train Accuracy: 0.9176 Test Accuracy: 0.8393
Epoch: 120 Train Loss: 1.9204 Train Accuracy: 0.9214 Test Accuracy: 0.8421
Epoch: 130 Train Loss: 1.9019 Train 

In [16]:
train_loss, train_accuracy, true_labels, predicted_labels = evaluate(model, train_loader, criterion)

print("Train Loss: {:.4f} Train Accuracy: {:.4f}".format(train_loss, train_accuracy))

print(confusion_matrix(true_labels, predicted_labels, labels=range(len(classes))))
print(classification_report(true_labels, predicted_labels, target_names=classes, labels=range(len(classes)), zero_division=0))

Train Loss: 1.8532 Train Accuracy: 0.9650
[[2135   10    0    0    0    7    0    0    0    0    0    0    0    0
     0]
 [  36   88    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   1    0   44    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   0    0    0  118    0    0    0    0    0    0    0    0    0    0
     0]
 [   0    0    0    0   20    0    0    0    0    0    0    0    0    0
     0]
 [   5    0    0    0    0   22    0    0    0    0    0    0    0    0
     0]
 [  11    0    0    0    0    0   78    0    0    0    1    0    0    0
     0]
 [   2    3    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   7    0    0    0    0    1    0    0    0    0    0    0    0    0
     0]
 [   1    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   5    0    0    1    0    0    0    0    0    0  180    0    0    0
     0]
 [   0    0    0    0    0    0    0    0    0    0    1   37    0    0
     0

In [17]:
test_loss, test_accuracy, true_labels, predicted_labels = evaluate(model, test_loader, criterion)

print("Test Loss: {:.4f} Test Accuracy: {:.4f}".format(test_loss, test_accuracy))

print(confusion_matrix(true_labels, predicted_labels, labels=range(len(classes))))
print(classification_report(true_labels, predicted_labels, target_names=classes, labels=range(len(classes)), zero_division=0))

Test Loss: 1.9810 Test Accuracy: 0.8366
[[510   9   0   8   0   0   0   0   0   0  11   1   0   0   1]
 [ 26  14   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  2   0   5   0   0   0   0   0   0   0   0   0   0   0   0]
 [  7   0   0  22   1   0   0   0   0   0   0   0   0   0   0]
 [  3   0   0   0   2   0   0   0   0   0   0   0   0   0   0]
 [  2   0   0   0   0   7   0   0   0   0   0   0   0   0   0]
 [ 14   0   0   0   0   0  12   0   0   0   1   0   0   1   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  2   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 13   0   0   1   0   0   1   0   0   0  21   1   0   0   0]
 [  3   0   0   0   0   0   0   0   0   0   4   4   0   0   0]
 [  2   0   0   0   0   0   0   0   0   0   1   0   5   0   0]
 [  3   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   2]]
              