In [1]:
# Import
import os

import pandas as pd

In [2]:
from glob import glob
import string

def get_file_list_from_dir(*, path, datadir, pattern="*.csv.gz"):
    """Return the sorted list of data files found in ``<path>/data/<datadir>``.

    Parameters
    ----------
    path : str
        Root directory that contains the ``data`` folder.
    datadir : str
        Sub-folder of ``data`` to scan (the notebook uses "train" and "test").
    pattern : str, default "*.csv.gz"
        Glob pattern the file names must match.

    Returns
    -------
    list of str
        Matching paths in lexicographic order; an empty list when nothing
        matches or the directory does not exist.
    """
    search_expr = os.path.join(path, "data", datadir, pattern)
    # sorted() makes the order independent of the file-system listing order
    return sorted(glob(search_expr))

In [3]:
# The 26 uppercase letters double as column names throughout the notebook.
alphabet = [*string.ascii_uppercase]
# The first eight letters are the composition columns used as model inputs ...
input_compos = alphabet[0:8]
# ... followed by the five scalar parameter columns p1..p5.
input_params = [*input_compos, "p1", "p2", "p3", "p4", "p5"]

## Dirty preprocessing

In [4]:
# Read every training file and stack them row-wise. read_csv gives each file its
# own 0..n-1 row labels and concat does not reset them, so every label (0 included)
# appears once per file in `dtrain`.
train_files = get_file_list_from_dir(path=".", datadir="train")
dtrain = pd.concat((pd.read_csv(f) for f in train_files))

# Targets: the 26 letter columns, renamed Y_A..Y_Z so they cannot clash with the
# input columns A..H that are glued back on below.
train_data = dtrain[alphabet].add_prefix('Y_')
train_data["times"] = dtrain["times"]
# Keep only the rows whose time stamp is strictly positive.
train_data = train_data[ train_data["times"] > 0.]
# Inputs: the rows labelled 0 (one per file — presumably the times == 0 row, TODO confirm)
# restricted to the 13 input columns.
temp = dtrain.loc[0][input_params].reset_index(drop=True)
# Duplicate each input row 80 times in a row so it can sit next to its targets.
# NOTE(review): assumes every file has exactly 80 rows with times > 0 — confirm.
temp = temp.loc[temp.index.repeat(80)].reset_index(drop=True)
# Both frames now carry a fresh 0..n-1 index, so this is a positional side-by-side join.
train_data = pd.concat([temp, train_data.reset_index(drop=True)], axis=1)

# Long -> wide for Y_A: one row per distinct input combination, columns A1, A2, ...
# holding that group's Y_A values in row order; reset_index brings the 13 key
# columns back as ordinary columns.
train_target_A = train_data.groupby(input_params)['Y_A'].apply(list).apply(pd.Series).rename(
    columns=lambda x: 'A' + str(x + 1)).reset_index()
# NOTE(review): bare expression that is not the last statement of the cell — it displays nothing.
train_target_A

# Feature matrix: one row per group, in groupby's (sorted-key) order.
X_train = train_target_A[input_params]

# Same long -> wide reshaping for every letter; the leading len(input_params) key
# columns are sliced off so only the <letter>1, <letter>2, ... value columns remain.
y_train_all = []
for i in alphabet:
    y_train_all.append(
        train_data.groupby(input_params)['Y_'+i].apply(list).apply(pd.Series).rename(
        columns=lambda x: i + str(x + 1)).reset_index().iloc[:, len(input_params):]
    )
    
# Column-wise concatenation: A1.. then B1.. and so on through Z.
y_train_all = pd.concat(y_train_all, axis=1)

In [5]:
# Read every test file and stack them row-wise. read_csv gives each file its own
# 0..n-1 row labels and concat does not reset them, so every label (0 included)
# appears once per file in `dtest`.
test_files = get_file_list_from_dir(path=".", datadir="test")
dtest = pd.concat((pd.read_csv(f) for f in test_files))

# Targets: the 26 letter columns, renamed Y_A..Y_Z so they cannot clash with the
# input columns A..H that are glued back on below.
test_data = dtest[alphabet].add_prefix('Y_')
test_data["times"] = dtest["times"]
# Keep only the rows whose time stamp is strictly positive.
test_data = test_data[test_data["times"] > 0.]
# Inputs: the rows labelled 0 (one per file — presumably the times == 0 row, TODO confirm)
# restricted to the 13 input columns.
temp = dtest.loc[0][input_params].reset_index(drop=True)
# Duplicate each input row 80 times in a row so it can sit next to its targets.
# NOTE(review): assumes every file has exactly 80 rows with times > 0 — confirm.
temp = temp.loc[temp.index.repeat(80)].reset_index(drop=True)
# Both frames now carry a fresh 0..n-1 index, so this is a positional side-by-side join.
test_data = pd.concat([temp, test_data.reset_index(drop=True)], axis=1)

# Long -> wide for Y_A: one row per distinct input combination, columns A1, A2, ...
# holding that group's Y_A values in row order; reset_index brings the 13 key
# columns back as ordinary columns.
test_target_A = test_data.groupby(input_params)['Y_A'].apply(list).apply(pd.Series).rename(
    columns=lambda x: 'A' + str(x + 1)).reset_index()

# Feature matrix: one row per group, in groupby's (sorted-key) order.
X_test = test_target_A[input_params]

# Same long -> wide reshaping for every letter; the leading len(input_params) key
# columns are sliced off so only the <letter>1, <letter>2, ... value columns remain.
y_test_all = []
for i in alphabet:
    y_test_all.append(
        test_data.groupby(input_params)['Y_'+i].apply(list).apply(pd.Series).rename(
        columns=lambda x: i + str(x + 1)).reset_index().iloc[:, len(input_params):]
    )
    
# Column-wise concatenation: A1.. then B1.. and so on through Z.
y_test_all = pd.concat(y_test_all, axis=1)


# Current work

In [6]:
import torch
from torch import nn, optim
from functools import reduce
from operator import add

In [93]:
class neuralNetwork(nn.Module):
    """Chain of 26 small MLPs, one per uppercase letter A..Z.

    The sub-networks are evaluated in alphabetical order. The first one receives
    only the input features; every later one receives the input features
    concatenated with the outputs of all sub-networks evaluated before it, so
    the input width grows by ``outputs_per_nucleid`` at each step.

    The module also owns its loss, optimiser and LR scheduler, and exposes a
    scikit-learn-like ``fit`` / ``predict`` pair.

    Parameters
    ----------
    input_features : int, default 378
        Width of the feature matrix passed to ``forward``. 378 is the width
        produced by this notebook's ``pre_processor`` with ``degree=2`` on 13
        inputs (log + raw = 26 columns -> 378 polynomial terms).
    outputs_per_nucleid : int, default 80
        Number of values predicted by each sub-network.
    lr : float, default 1e-2
        Initial learning rate handed to AdamW.
    """

    def __init__(self, input_features=378, outputs_per_nucleid=80, lr=1e-2):
        super().__init__()

        letters = list(string.ascii_uppercase)
        self._initial_nucleids = letters[0:8]
        self._generated_nucleids = letters[8:]

        self.networks = nn.ModuleDict()
        chain = self._initial_nucleids + self._generated_nucleids
        for position, nucleid in enumerate(chain):
            self.networks[nucleid] = self._build_subnetwork(
                # every earlier sub-network contributes its outputs as extra inputs
                input_features + position * outputs_per_nucleid,
                outputs_per_nucleid,
                second_dropout=nucleid in self._initial_nucleids,
            )

        self.loss_fn = nn.MSELoss()
        self.optimizer = optim.AdamW(self.parameters(), lr=lr, amsgrad=True)
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=10, factor=0.5)

    @staticmethod
    def _build_subnetwork(in_features, out_features, second_dropout):
        """Build one 4-layer MLP: in_features -> 500 -> 150 -> 100 -> out_features.

        ``second_dropout`` inserts the extra ``Dropout(0)`` that the first eight
        sub-networks have always carried. Dropout with p=0 is a no-op, but the
        layer is kept so that layer indices (and therefore ``state_dict`` keys)
        stay identical to earlier versions of this class.
        """
        layers = [
            nn.Linear(in_features, 500),
            nn.Dropout(0),
            nn.BatchNorm1d(500),
            nn.ReLU(),
            nn.Linear(500, 150),
        ]
        if second_dropout:
            layers.append(nn.Dropout(0))
        layers += [
            nn.BatchNorm1d(150),
            nn.ReLU(),
            nn.Linear(150, 100),
            nn.BatchNorm1d(100),
            nn.ReLU(),
            nn.Linear(100, out_features),
        ]
        return nn.Sequential(*layers)

    def forward(self, X):
        """Run the chain and return all predictions side by side.

        Parameters
        ----------
        X : torch.Tensor
            2-D tensor whose second dimension equals ``input_features``.

        Returns
        -------
        torch.Tensor
            The 26 sub-network outputs concatenated along dim 1, in A..Z order.
        """
        features = X
        predictions = []

        for nucleid in self._initial_nucleids + self._generated_nucleids:
            prediction = self.networks[nucleid](features)
            predictions.append(prediction)
            # widen the input for the next sub-network with what was just predicted
            features = torch.cat([features, prediction], dim=1)

        return torch.cat(predictions, dim=1)

    def fit(self, X, y):
        """Perform ONE optimiser step on the batch ``(X, y)`` and return its loss.

        Switches the module to training mode. BatchNorm1d in training mode needs
        more than one sample per batch.

        Returns
        -------
        float
            The MSE of this batch, computed before the parameter update.
        """
        self.train()
        self.optimizer.zero_grad()

        y_hat = self(X)

        # MSELoss is documented as loss(input, target): prediction first
        loss = self.loss_fn(y_hat, y)
        loss.backward()

        self.optimizer.step()

        return loss.item()

    def predict(self, X):
        """Return predictions for ``X`` without tracking gradients.

        Switches the module to evaluation mode and leaves it there; ``fit``
        switches back to training mode on its next call.
        """
        self.eval()
        with torch.no_grad():
            y_hat = self(X)

        return y_hat

In [94]:
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.utils import check_array
from sklearn.preprocessing import PolynomialFeatures
from torch.utils.data import DataLoader, Dataset

In [194]:
# Pre-processing
class pre_processor():
    """Scaler for the feature matrix X and the target matrix y.

    X pipeline: ``[log(X), X]`` side by side -> polynomial expansion ->
    subtract the column mean and divide by the column range.
    y pipeline: natural log of the columns flagged in ``to_log`` ->
    subtract the column mean and divide by the column range.

    Means and ranges are learnt in ``fit`` and reused by ``transform`` and
    ``inverse_transform``. Values are passed to ``np.log`` unchecked, so zero or
    negative entries yield ``-inf`` / ``nan``.

    Parameters
    ----------
    degree : int, default 2
        Degree handed to ``PolynomialFeatures``.
    outputs_per_nucleid : int, default 80
        Number of consecutive y columns belonging to each of the 26 letters.
    """

    def __init__(self, degree=2, outputs_per_nucleid=80):

        self.polynomial = PolynomialFeatures(degree=degree)

        # Boolean mask over the y columns: True = express that column in log scale.
        # Every column is flagged by default; overwrite the attribute to exclude some.
        self.to_log = np.repeat(
            np.ones(len(string.ascii_uppercase), dtype=bool), outputs_per_nucleid
        )

    def _expand_X(self, X):
        """Validate X and return ``[log(X), X]`` concatenated column-wise."""
        X = check_array(X)
        return np.concatenate([np.log(X), X], axis=1)

    def _log_y(self, y):
        """Validate y and return a float copy with the flagged columns in log scale."""
        y = check_array(y, ensure_2d=False)
        # Cast first: writing log values into an integer array would truncate them
        y_ = np.array(y, dtype=float)
        y_[:, self.to_log] = np.log(y_[:, self.to_log])
        return y_

    def fit(self, X, y):
        """Learn the polynomial expansion and the centring/scaling constants.

        Returns
        -------
        pre_processor
            ``self``, so calls can be chained.
        """
        X_ = self.polynomial.fit_transform(self._expand_X(X))
        y_ = self._log_y(y)

        # Normalisation constants; the 1e-8 keeps constant columns from dividing by zero
        self.X_mean = np.mean(X_, axis=0)
        self.X_range = np.max(X_, axis=0)-np.min(X_, axis=0)+1e-8

        self.y_mean = np.mean(y_, axis=0)
        self.y_range = np.max(y_, axis=0)-np.min(y_, axis=0)+1e-8

        return self

    def transform(self, X, y=None):
        """Apply the fitted scaling.

        Returns
        -------
        tuple
            ``(X_scaled, y_scaled)``; ``y_scaled`` is ``None`` when ``y`` is not given.
        """
        X_ = self.polynomial.transform(self._expand_X(X))
        X_ = (X_-self.X_mean)/self.X_range

        if y is None:
            return X_, None

        y_ = (self._log_y(y)-self.y_mean)/self.y_range

        return X_, y_

    def inverse_transform(self, y):
        """Map scaled targets back to the original scale (undo scaling, then exp)."""
        y = check_array(y, ensure_2d=False)
        y_ = (y*self.y_range)+self.y_mean
        y_[:, self.to_log] = np.exp(y_[:, self.to_log])

        return y_

In [195]:
class basicDataset(Dataset):
    """Map-style dataset pairing a feature container with a target container.

    Both containers are stored as given and indexed with the same key; the
    dataset length is the size of the first axis of ``X``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __getitem__(self, idx):
        sample, target = self.X[idx], self.y[idx]
        return sample, target

    def __len__(self):
        n_samples = self.X.shape[0]
        return n_samples

In [196]:
# Learn the scaling constants from the training split only.
preprocess = pre_processor()
preprocess.fit(X_train, y_train_all)

# Use the first GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = neuralNetwork()
model = model.to(device)

In [197]:
X_train_preprocess, y_train_preprocess = preprocess.transform(X_train, y_train_all)
X_test_preprocess, y_test_preprocess = preprocess.transform(X_test, y_test_all)

# Put the data wherever the model already lives instead of hard-coding "cuda:0".
model_device = next(model.parameters()).device

X_train_tensor = torch.tensor(X_train_preprocess, dtype=torch.float32, device=model_device)
y_train_tensor = torch.tensor(y_train_preprocess, dtype=torch.float32, device=model_device)
X_test_tensor = torch.tensor(X_test_preprocess, dtype=torch.float32, device=model_device)
# The scaled test targets stay on the CPU, as before; the evaluation below
# compares against the unscaled `y_test_all` instead.
y_test_tensor = torch.tensor(y_test_preprocess, dtype=torch.float32)

train_dataset = basicDataset(X_train_tensor, y_train_tensor)

# Using dataloader to benefit from stochasticity
# NOTE(review): if len(train_dataset) % 690 == 1 the last batch holds a single
# sample and BatchNorm1d raises in training mode — consider drop_last=True.
train_loader = DataLoader(train_dataset, batch_size=690, shuffle=True)

In [198]:
n_epochs = 500
eval_every = 100  # number of epochs between two evaluations on the test split

for i in range(n_epochs):

    # One pass over the shuffled batches; model.fit performs a single optimiser step
    losses = []

    for x,y in train_loader:
        train_loss = model.fit(x, y)
        losses.append(train_loss)

    # Mean of the per-batch MSEs, so the printed value really is on the MSE scale.
    # ReduceLROnPlateau compares losses with a relative threshold by default, so
    # using the mean instead of the sum does not change when the LR is reduced.
    epoch_loss = np.mean(losses)
    model.scheduler.step(epoch_loss)

    # Evaluate AFTER the epoch so that the model reached at the very last epoch is
    # reported too (previously epochs 100..400 were reported but never the final one).
    epochs_done = i + 1
    if epochs_done % eval_every == 0:
        y_hat_test = model.predict(X_test_tensor).detach().cpu().numpy()
        # Back to the original target scale before computing the percentage error
        y_hat_test_inverse_transform = preprocess.inverse_transform(y_hat_test)
        y_test_loss = mean_absolute_percentage_error(y_test_all.values, y_hat_test_inverse_transform)

        print(f"Epoch {epochs_done} - MSE Loss {epoch_loss} - MAPE Test loss {y_test_loss}")