In [1]:
# Import
import os

import pandas as pd

In [16]:
from glob import glob
import string

def get_file_list_from_dir(*, path, datadir):
    """Return the sorted paths of all ``*.csv.gz`` files in ``<path>/data/<datadir>``.

    Parameters
    ----------
    path : str
        Root directory that contains the ``data`` folder.
    datadir : str
        Name of the sub-folder of ``data`` to list (the notebook uses
        ``"train"`` and ``"test"``).

    Returns
    -------
    list of str
        Matching file paths in lexicographic order; an empty list when the
        folder does not exist or holds no ``*.csv.gz`` file.
    """
    # `os` must be imported at notebook level for this call to resolve.
    pattern = os.path.join(path, "data", datadir, "*.csv.gz")
    # glob() returns matches in arbitrary order; sort for a stable result.
    return sorted(glob(pattern))

In [22]:
# Upper-case letters A..Z are used as short column names throughout the notebook.
alphabet = [*string.ascii_uppercase]
# The first eight letters (A..H) form the `input_compos` columns.
input_compos = alphabet[0:8]
# Full list of input columns: the eight letters followed by p1..p5.
input_params = [*input_compos, *(f"p{k}" for k in range(1, 6))]

## Dirty preprocessing

In [33]:
# Load every training CSV and stack them row-wise. pd.concat is called without
# ignore_index, so each file keeps its own row labels and labels repeat across files.
train_files = get_file_list_from_dir(path=".", datadir="train")
dtrain = pd.concat((pd.read_csv(f) for f in train_files))

# Target columns: A..Z renamed to Y_A..Y_Z, plus the "times" column.
train_data = dtrain[alphabet].add_prefix('Y_')
train_data["times"] = dtrain["times"]
# Keep only the rows with a strictly positive time.
train_data = train_data[ train_data["times"] > 0.]
# .loc[0] selects every row whose index label is 0 — presumably the first row
# of each file (one per simulation); TODO confirm against the data files.
temp = dtrain.loc[0][input_params].reset_index(drop=True)
# Repeat each of those rows 80 times. NOTE(review): the hard-coded 80 assumes
# every file contributes exactly 80 rows with times > 0 — verify.
temp = temp.loc[temp.index.repeat(80)].reset_index(drop=True)
# Side-by-side concat; both frames were reset to a 0..n-1 index, so rows pair up by position.
train_data = pd.concat([temp, train_data.reset_index(drop=True)], axis=1)

# One row per distinct combination of input parameters; the Y_A values of each
# group are collected into a list and spread over columns A1, A2, ...
train_target_A = train_data.groupby(input_params)['Y_A'].apply(list).apply(pd.Series).rename(
    columns=lambda x: 'A' + str(x + 1)).reset_index()
# Bare expression in the middle of a cell: it is not displayed and has no effect.
train_target_A

# Model inputs: the parameter columns of the grouped frame.
X_train = train_target_A[input_params]

# Same group-and-spread step for every letter; the leading parameter columns
# are dropped so only the <letter>1, <letter>2, ... target columns remain.
y_train_all = []
for i in alphabet:
    # The lambda reads `i` from the loop, which is safe because rename() calls it immediately.
    y_train_all.append(
        train_data.groupby(input_params)['Y_'+i].apply(list).apply(pd.Series).rename(
        columns=lambda x: i + str(x + 1)).reset_index().iloc[:, len(input_params):]
    )
    
# Join the per-letter blocks column-wise into a single target frame.
y_train_all = pd.concat(y_train_all, axis=1)

In [32]:
# Load every test CSV and stack them row-wise. pd.concat is called without
# ignore_index, so each file keeps its own row labels and labels repeat across files.
test_files = get_file_list_from_dir(path=".", datadir="test")
dtest = pd.concat((pd.read_csv(f) for f in test_files))

# Target columns: A..Z renamed to Y_A..Y_Z, plus the "times" column.
test_data = dtest[alphabet].add_prefix('Y_')
test_data["times"] = dtest["times"]
# Keep only the rows with a strictly positive time.
test_data = test_data[test_data["times"] > 0.]
# .loc[0] selects every row whose index label is 0 — presumably the first row
# of each file (one per simulation); TODO confirm against the data files.
temp = dtest.loc[0][input_params].reset_index(drop=True)
# Repeat each of those rows 80 times. NOTE(review): the hard-coded 80 assumes
# every file contributes exactly 80 rows with times > 0 — verify.
temp = temp.loc[temp.index.repeat(80)].reset_index(drop=True)
# Side-by-side concat; both frames were reset to a 0..n-1 index, so rows pair up by position.
test_data = pd.concat([temp, test_data.reset_index(drop=True)], axis=1)

# One row per distinct combination of input parameters; the Y_A values of each
# group are collected into a list and spread over columns A1, A2, ...
test_target_A = test_data.groupby(input_params)['Y_A'].apply(list).apply(pd.Series).rename(
    columns=lambda x: 'A' + str(x + 1)).reset_index()

# Model inputs: the parameter columns of the grouped frame.
X_test = test_target_A[input_params]

# Same group-and-spread step for every letter; the leading parameter columns
# are dropped so only the <letter>1, <letter>2, ... target columns remain.
y_test_all = []
for i in alphabet:
    # The lambda reads `i` from the loop, which is safe because rename() calls it immediately.
    y_test_all.append(
        test_data.groupby(input_params)['Y_'+i].apply(list).apply(pd.Series).rename(
        columns=lambda x: i + str(x + 1)).reset_index().iloc[:, len(input_params):]
    )
    
# Join the per-letter blocks column-wise into a single target frame.
y_test_all = pd.concat(y_test_all, axis=1)


# Current work

In [67]:
import torch
from torch import nn, optim
from functools import reduce
from operator import add

In [201]:
class neuralNetwork(nn.Module):
    """Two-stage multi-output regressor with one small MLP per letter A..Z.

    Stage 1: the eight "initial" letters (A..H) each get an MLP that maps the
    raw input features to ``n_timesteps`` values.
    Stage 2: the eighteen "generated" letters (I..Z) each get an MLP fed with
    the raw features concatenated with *all* stage-1 predictions.

    The forward output has shape ``(batch, 26 * n_timesteps)`` with the column
    blocks ordered A, B, ..., Z.

    Parameters
    ----------
    n_inputs : int, default 13
        Number of raw input features per sample.
    n_timesteps : int, default 80
        Number of values predicted per letter.
    lr : float, default 5e-2
        Learning rate of the internal Adam optimizer.
    """

    def __init__(self, n_inputs=13, n_timesteps=80, lr=5e-2):

        super().__init__()

        self.n_inputs = n_inputs
        self.n_timesteps = n_timesteps

        letters = list(string.ascii_uppercase)
        self._initial_nucleids = letters[:8]
        self._generated_nucleids = letters[8:]

        # Stage-2 networks see the raw features plus every stage-1 prediction
        # (13 + 8 * 80 = 653 with the defaults).
        n_stage2_inputs = n_inputs + len(self._initial_nucleids) * n_timesteps

        self.networks = nn.ModuleDict()

        for initial_nucleid in self._initial_nucleids:
            self.networks[initial_nucleid] = self._make_mlp(
                n_inputs, 200, 100, n_timesteps)

        for generated_nucleid in self._generated_nucleids:
            self.networks[generated_nucleid] = self._make_mlp(
                n_stage2_inputs, 500, 100, n_timesteps)

        self.loss_fn = nn.MSELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    @staticmethod
    def _make_mlp(n_in, n_hidden1, n_hidden2, n_out):
        """Build the Linear -> Dropout -> BatchNorm -> ReLU stack used for every letter."""
        # Dropout(0) is a no-op; it is kept so the layer indices (and therefore
        # the state_dict keys) stay the same as before.
        return nn.Sequential(
            nn.Linear(n_in, n_hidden1),
            nn.Dropout(0),
            nn.BatchNorm1d(n_hidden1),
            nn.ReLU(),
            nn.Linear(n_hidden1, n_hidden2),
            nn.Dropout(0),
            nn.BatchNorm1d(n_hidden2),
            nn.ReLU(),
            nn.Linear(n_hidden2, n_out),
        )

    def forward(self, X):
        """Run both stages on ``X`` of shape ``(batch, n_inputs)``.

        Returns a tensor of shape ``(batch, 26 * n_timesteps)``.
        """
        # 1. Predict the initial letters directly from the raw features.
        stage1 = [self.networks[name](X) for name in self._initial_nucleids]

        # 2. Feed raw features + stage-1 predictions to the generated letters.
        stage2_input = torch.cat([X, *stage1], dim=1)
        stage2 = [self.networks[name](stage2_input)
                  for name in self._generated_nucleids]

        # 3. Assemble the full output, blocks ordered A..Z.
        return torch.cat(stage1 + stage2, dim=1)

    def fit(self, X, y):
        """Perform a single full-batch optimisation step and return the MSE loss as a float.

        The module is left in training mode. BatchNorm1d in training mode
        needs more than one sample in the batch.
        """
        self.train()
        self.optimizer.zero_grad()

        y_hat = self.forward(X)
        # nn.MSELoss expects (input, target).
        loss = self.loss_fn(y_hat, y)
        loss.backward()

        self.optimizer.step()

        return loss.detach().item()

    def predict(self, X):
        """Return the forward output for ``X`` in eval mode with gradients disabled.

        The module is left in eval mode.
        """
        self.eval()
        with torch.no_grad():
            y_hat = self.forward(X)

        return y_hat

In [202]:
# Per-column maxima of the training data, kept 2-D (shape (1, n_columns)) so
# they broadcast when the (n_samples, n_columns) arrays are divided by them.
# The ndarray .max method is used instead of np.max so that this cell does not
# depend on `np`, which is only imported in a later cell of the notebook.
# NOTE(review): a column whose maximum is 0 would cause a division by zero in
# the scaling step that follows — confirm the data rules this out.
X_scaling_ = X_train.values.max(axis=0, keepdims=True)
Y_scaling_ = y_train_all.values.max(axis=0, keepdims=True)

In [203]:
# Divide each frame by the training-set column maxima and convert the result to
# float32 tensors. Train and test share the same scaling factors.
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(frame.values / scale, dtype=torch.float32)
    for frame, scale in (
        (X_train, X_scaling_),
        (y_train_all, Y_scaling_),
        (X_test, X_scaling_),
        (y_test_all, Y_scaling_),
    )
)

In [204]:
# Build the network; its weights are randomly initialised by PyTorch.
# NOTE(review): no random seed is set in the visible cells, so results are
# presumably not reproducible between runs — consider torch.manual_seed(...).
model = neuralNetwork()

In [205]:
# Forward pass of the still-untrained model (predict() switches to eval mode and
# disables gradients). NOTE(review): `y_hat` is not used by the cells visible
# here — presumably a smoke test of the output shape.
y_hat = model.predict(X_train_tensor)

In [206]:
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error

In [207]:
n_epochs = 1000

# The model is fed log-transformed inputs and targets. These transforms do not
# change between epochs, so they are computed once instead of on every iteration.
# NOTE(review): any zero in the scaled tensors becomes -inf here — confirm the
# data is strictly positive.
X_train_log = torch.log(X_train_tensor)
y_train_log = torch.log(y_train_tensor)
X_test_log = torch.log(X_test_tensor)

for epoch in range(n_epochs):

    # Every 10th epoch (except the first), evaluate on the test set *before*
    # this epoch's update; `train_loss` is the loss of the previous step.
    if epoch % 10 == 0 and epoch != 0:
        # Undo the log transform and the max-scaling to get back to the original
        # units. predict() already runs under torch.no_grad(), so no detach()
        # is needed before converting to NumPy.
        y_hat_test = torch.exp(model.predict(X_test_log)).numpy() * Y_scaling_
        y_test_loss = mean_absolute_percentage_error(y_test_all.values, y_hat_test)

        print(f"Epoch {epoch} - MSE Loss {train_loss} - MAPE Test loss {y_test_loss}")

    # One full-batch optimisation step; returns the MSE in log space.
    train_loss = model.fit(X_train_log, y_train_log)