## Yeast training
 This notebook loads the Yeast dataset (https://archive.ics.uci.edu/dataset/110/yeast), preprocesses it and trains a simple model

In [1]:
import sys
import os
PROJ_DIR = os.path.realpath(os.path.dirname(os.path.abspath('')))
sys.path.append(os.path.join(PROJ_DIR,'src'))



In [5]:
import pandas as pd
df = pd.read_csv(os.path.join(PROJ_DIR, 'assets', 'data', 'yeast.data'), delimiter=';', header=None)

In [6]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,ADT1_YEAST,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,ADT2_YEAST,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,ADT3_YEAST,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,AATM_YEAST,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
...,...,...,...,...,...,...,...,...,...,...
1479,YUR1_YEAST,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,ZIP1_YEAST,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC
1481,ZNRP_YEAST,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,ZUO1_YEAST,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC


Load data from CSV and save it to a suitable format. This can be skipped if concrete_data.npz is in assets.

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def load_yeast(path):
    df = pd.read_csv(path, delimiter=';', header=None)

    labels = df[9]
    x = df.drop(columns=0)
    x = x.drop(columns=9)

    possible_labels = labels.unique().tolist()
    print(possible_labels)
    print(len(possible_labels))
    y = labels.map(lambda x: possible_labels.index(x))

    return x.to_numpy(), y.to_numpy()
    
x, y = load_yeast(os.path.join(PROJ_DIR, 'assets', 'data', 'yeast.data'))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

['MIT', 'NUC', 'CYT', 'ME1', 'EXC', 'ME2', 'ME3', 'VAC', 'POX', 'ERL']
10


In [9]:
# Save to assets
np.savez(os.path.join(PROJ_DIR, 'assets', 'data', 'yeast.npz'),\
        x_train=x_train,\
        x_test=x_test,\
        y_train=y_train,\
        y_test=y_test)

Load data from file

In [10]:
file_data = np.load(os.path.join(PROJ_DIR, 'assets', 'data', 'yeast.npz'))
x_train = file_data['x_train']
x_test = file_data['x_test']
y_train = file_data['y_train']
y_test = file_data['y_test']

Train a MLP model

In [12]:
import torch

MODEL_NEURONS = 100
MODEL_EPOCHS= 2000
MODEL_LR = 1.0e-1
MODEL_LABEL_NUM = len(np.unique(y_train))

class MLP(torch.nn.Module):
    def __init__(self, n_neurons):
        super(MLP, self).__init__()
        self.fc1 = torch.nn.Linear(x_train.shape[1], n_neurons)
        self.ac1 = torch.nn.Sigmoid()
        self.fc2 = torch.nn.Linear(n_neurons, MODEL_LABEL_NUM)
        self.ac2 = torch.nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.ac1(x)
        logits = self.fc2(x)
        x = self.ac2(logits)
        return x

x_train_tensor = torch.tensor(x_train).float()
y_train_tensor = torch.tensor(y_train)
x_test_tensor = torch.tensor(x_test).float()
y_test_tensor = torch.tensor(y_test)

label_onehot = torch.zeros(y_train.shape[0], MODEL_LABEL_NUM)
label_onehot.scatter_(1, y_train_tensor.unsqueeze(1), 1)
class_weights = 1.0/label_onehot.mean(axis=0)
print(class_weights)

network = MLP(MODEL_NEURONS)
loss = torch.nn.BCELoss(weight=class_weights)
optimizer = torch.optim.Adam(network.parameters(), lr=MODEL_LR)#, weight_decay=5e-3)

for epoch in range(MODEL_EPOCHS):
    optimizer.zero_grad()
    
    preds = network(x_train_tensor)
    label_onehot = torch.zeros(y_train.shape[0], MODEL_LABEL_NUM)
    label_onehot.scatter_(1, y_train_tensor.unsqueeze(1), 1)
    loss_value = loss(preds, label_onehot)
    loss_value.backward()        
    optimizer.step()

    train_accuracy = (preds.argmax(dim=1) == y_train_tensor).float().mean() 

    test_preds = network.forward(x_test_tensor)        
    test_accuracy = (test_preds.argmax(dim=1) == y_test_tensor).float().mean() 
    print(f'Epoch {epoch}/{MODEL_EPOCHS} - Loss: {loss_value.item()} - Train accuracy: {train_accuracy} - Test accuracy: {test_accuracy}')  
    if test_accuracy > 0.75: # Undertrained
        break
    
print(test_accuracy.item())


tensor([  6.0872,   3.4306,   3.2343,  35.9697,  40.9310,  25.8043,   9.2734,
         59.3500,  65.9444, 237.4000])
Epoch 0/2000 - Loss: 7.044772624969482 - Train accuracy: 0.3091828227043152 - Test accuracy: 0.3232323229312897
Epoch 1/2000 - Loss: 6.841350078582764 - Train accuracy: 0.3091828227043152 - Test accuracy: 0.3232323229312897
Epoch 2/2000 - Loss: 8.341196060180664 - Train accuracy: 0.3091828227043152 - Test accuracy: 0.2861952781677246
Epoch 3/2000 - Loss: 8.842792510986328 - Train accuracy: 0.29486098885536194 - Test accuracy: 0.2861952781677246
Epoch 4/2000 - Loss: 8.127361297607422 - Train accuracy: 0.29233360290527344 - Test accuracy: 0.31986531615257263
Epoch 5/2000 - Loss: 7.090317726135254 - Train accuracy: 0.31002527475357056 - Test accuracy: 0.3232323229312897
Epoch 6/2000 - Loss: 5.84609842300415 - Train accuracy: 0.3091828227043152 - Test accuracy: 0.3232323229312897
Epoch 7/2000 - Loss: 5.0866193771362305 - Train accuracy: 0.3091828227043152 - Test accuracy: 0.

In [18]:
# Save model
torch.save(network.state_dict(), os.path.join(PROJ_DIR,'assets','models','htru2-mlp.pth'))