In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import sys
import warnings
import os

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.exceptions import ConvergenceWarning

import matplotlib.pyplot as plt

In [2]:
# Silence scikit-learn ConvergenceWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Seed every random number generator used below for reproducibility.
seed = 10
np.random.seed(seed)
torch.manual_seed(seed)
# Seed every visible GPU rather than only the current one; this call is
# silently ignored when CUDA is not available.
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and switch off its auto-tuner, which can
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# NOTE: Python reads PYTHONHASHSEED when the interpreter starts, so setting it
# here does not change hashing in this kernel; it is only inherited by child
# processes started afterwards.
os.environ["PYTHONHASHSEED"] = str(seed)


In [3]:
def load_data(data_dir=""):
    """
    This function loads the data from the zipped csv files and returns it.

    input: data_dir: str, directory containing the five ``*.csv.zip`` files;
                     the default "" resolves the file names against the
                     current working directory

    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: pd.DataFrame, the features of the test set; returned as a
                    DataFrame (not converted to numpy), so its "Id" index is
                    still attached
    """
    def read_zipped_csv(filename):
        # Every file is a zip-compressed csv whose "Id" column is the index.
        return pd.read_csv(os.path.join(data_dir, filename), index_col="Id", compression='zip')

    # Feature files carry a "smiles" text column that is not a numeric feature.
    x_pretrain = read_zipped_csv("pretrain_features.csv.zip").drop("smiles", axis=1).to_numpy()
    # Label files are squeezed on their last axis, turning (n, 1) into (n,).
    y_pretrain = read_zipped_csv("pretrain_labels.csv.zip").to_numpy().squeeze(-1)
    x_train = read_zipped_csv("train_features.csv.zip").drop("smiles", axis=1).to_numpy()
    y_train = read_zipped_csv("train_labels.csv.zip").to_numpy().squeeze(-1)
    x_test = read_zipped_csv("test_features.csv.zip").drop("smiles", axis=1)
    return x_pretrain, y_pretrain, x_train, y_train, x_test


In [4]:
# 3 layer autoencoder
class AutoEncoder1(nn.Module):
    """
    Autoencoder with a three-Linear-layer encoder, a mirrored decoder and a
    linear head on the latent code.

    input: in_features: int, width of the input vectors (default 1000)
           latent_dim: int, width of the latent code (default 128)
           dropout: float, dropout probability used in encoder and decoder
                    (default 0.5)

    The defaults reproduce the original fixed 1000-512-256-128 architecture.
    The encoder always has nine children in the same order, so code that
    slices ``encoder.children()`` keeps working for any parameter values.
    """

    def __init__(self, in_features=1000, latent_dim=128, dropout=0.5):
        super().__init__()

        # Encoder: in_features -> 512 -> 256 -> latent_dim
        self.encoder = nn.Sequential(
            nn.Linear(in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(256, latent_dim)
        )

        # Decoder: latent_dim -> 256 -> 512 -> in_features (no final activation)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(256, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(512, in_features)
        )

        # Head mapping the latent code to a single output value
        self.classifier = nn.Linear(latent_dim, 1)

    def forward(self, x):
        """
        Encode ``x`` and return a mode-dependent output.

        In training mode (``self.training`` is True) the decoder output with
        ``in_features`` columns is returned; in eval mode the single-column
        head output is returned instead.
        """
        x = self.encoder(x)
        if self.training:
            x = self.decoder(x)
        else:
            x = self.classifier(x)
        return x

In [7]:
def make_feature_extractor(x, y, model=None, batch_size=256, eval_size=1000, lr=0.01, weight_decay=0.0001, patience=5, alpha=0.4, step_size=10, gamma=0.5):
    """
    This function trains the feature extractor on the pretraining data and returns a function which
    can be used to extract features from the training and test data.

    input: x: np.ndarray, the features of the pretraining set
           y: np.ndarray, the labels of the pretraining set
           model: nn.Module or None, the autoencoder to train in place; a fresh
                  AutoEncoder1 is created per call when None. The model must
                  expose ``encoder``, ``decoder`` and ``classifier`` submodules
                  and its forward() must return the reconstruction in training
                  mode and the prediction in eval mode (as AutoEncoder1 does).
           batch_size: int, the batch size used for training
           eval_size: int, the size of the validation set
           lr: float, initial learning rate of the Adam optimizer
           weight_decay: float, weight decay of the Adam optimizer
           patience: int, number of consecutive epochs without validation
                     improvement after which training stops
           alpha: float, weight of the reconstruction loss; the prediction
                  loss is weighted with (1 - alpha)
           step_size: int, period (in epochs) of the StepLR scheduler
           gamma: float, learning-rate decay factor of the StepLR scheduler

    output: make_features: function, a function which can be used to extract features from the training and test data.
            The per-epoch loss curves are attached to it as ``make_features.history``
            (a dict of lists) so they can be plotted after training.
    """
    import copy  # function-scope import: only needed to snapshot the best weights

    # A model instance used as a default argument would be created once at
    # definition time and then shared (and re-trained) by every call.
    if model is None:
        model = AutoEncoder1()

    # Pretraining data loading
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=eval_size, random_state=10, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    # Labels become column vectors so they match the (batch, 1) model output.
    y_tr = torch.tensor(y_tr, dtype=torch.float).view(-1, 1)
    y_val = torch.tensor(y_val, dtype=torch.float).view(-1, 1)

    # Data loading
    train_loader = DataLoader(TensorDataset(x_tr, y_tr), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(x_val, y_val), batch_size=batch_size, shuffle=False)

    # Training parameters
    classification_loss = nn.MSELoss()
    # The reconstruction target is the input itself, compared against raw logits.
    reconstruction_loss = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
    epochs = 50
    best_val_loss = np.inf
    best_state = None
    patience_counter = 0

    # Per-epoch, sample-weighted mean losses
    history = {
        "train_recon": [], "train_class": [], "train_total": [],
        "val_recon": [], "val_class": [], "val_total": [],
    }

    # Training loop
    for epoch in range(epochs):

        # training
        total_train_reconstruction = 0
        total_train_classification = 0
        total_train = 0

        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()

            # forward() only returns one of the two outputs depending on the
            # module mode, so the mode is toggled to obtain both of them.
            model.train()
            recon_loss = reconstruction_loss(model(x_batch), x_batch)
            model.eval()
            class_loss = classification_loss(model(x_batch), y_batch)

            # Total loss
            loss = alpha * recon_loss + (1 - alpha) * class_loss
            loss.backward()
            optimizer.step()

            n_samples = x_batch.size(0)
            total_train_reconstruction += recon_loss.item() * n_samples
            total_train_classification += class_loss.item() * n_samples
            total_train += loss.item() * n_samples

        n_train = len(train_loader.dataset)
        history["train_recon"].append(total_train_reconstruction / n_train)
        history["train_class"].append(total_train_classification / n_train)
        history["train_total"].append(total_train / n_train)

        # validation
        total_vali_reconstruction = 0
        total_vali_classification = 0
        total_vali = 0

        # Stay in eval mode for the whole validation pass: in train mode the
        # dropout layers would add noise to the loss and BatchNorm would update
        # its running statistics from validation data. The submodules are
        # called directly because forward() cannot return the reconstruction
        # in eval mode.
        model.eval()
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                latent = model.encoder(x_batch)
                recon_loss = reconstruction_loss(model.decoder(latent), x_batch)
                class_loss = classification_loss(model.classifier(latent), y_batch)
                loss = alpha * recon_loss + (1 - alpha) * class_loss

                n_samples = x_batch.size(0)
                total_vali_reconstruction += recon_loss.item() * n_samples
                total_vali_classification += class_loss.item() * n_samples
                total_vali += loss.item() * n_samples

        n_val = len(val_loader.dataset)
        total_vali /= n_val
        history["val_recon"].append(total_vali_reconstruction / n_val)
        history["val_class"].append(total_vali_classification / n_val)
        history["val_total"].append(total_vali)

        # early stopping
        if total_vali < best_val_loss:
            best_val_loss = total_vali
            patience_counter = 0
            # state_dict() returns references to the live tensors, so a deep
            # copy is needed to keep this epoch's weights.
            best_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

        # learning rate scheduler
        scheduler.step()

    # Roll back to the weights with the lowest validation loss instead of
    # keeping the ones from the last (possibly worse) epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    def make_features(x, num_layers = 2):
        """
        This function extracts features from the training and test data, used in the actual pipeline 
        after the pretraining.

        input: x: array-like (np.ndarray or pd.DataFrame), the features of the training or test set
               num_layers: int, number of trailing encoder layers to drop; 0 keeps the full encoder

        output: features: np.ndarray, the features extracted from the training or test set, propagated
        further in the pipeline
        """
        model.eval()
        layers = list(model.encoder.children())
        # layers[:-0] would be an empty list, so 0 is handled explicitly.
        kept_layers = layers[:-num_layers] if num_layers else layers
        extractor = nn.Sequential(*kept_layers)
        # np.asarray lets DataFrames through as well as numpy arrays.
        inputs = torch.as_tensor(np.asarray(x), dtype=torch.float)
        with torch.no_grad():
            features = extractor(inputs)
        return features.detach().numpy()

    make_features.history = history
    return make_features

In [8]:
def linear_regression(x, y):
    """
    Grid-search Ridge, Lasso and ElasticNet with 10-fold cross-validation and
    return the best of the three.

    input: x: feature matrix passed to GridSearchCV.fit
           y: target values passed to GridSearchCV.fit

    output: best_score: the highest cross-validated negative RMSE found
            best_estimator: the refitted estimator that achieved it (the first
            one in search order if several share the top score)
    """
    folds = KFold(n_splits=10, shuffle=True, random_state=10)

    # Each candidate estimator together with its hyperparameter grid,
    # listed in the order in which they are searched.
    candidates = [
        (
            Ridge(fit_intercept=True, random_state=10),
            {"alpha": [0.01, 0.1, 0.5, 1, 5, 10, 15, 20]},
        ),
        (
            Lasso(fit_intercept=True, random_state=10),
            {"alpha": [0.001, 0.01, 0.1, 1, 10]},
        ),
        (
            ElasticNet(fit_intercept=True, random_state=10),
            {"alpha": [0.001, 0.01, 0.1, 1, 10],
             "l1_ratio": [0.2, 0.4, 0.6, 0.8]},
        ),
    ]

    # Run one grid search per candidate and remember its best score/estimator.
    scores = []
    winners = []
    for estimator, param_grid in candidates:
        search = GridSearchCV(estimator, param_grid=param_grid, cv=folds, scoring="neg_root_mean_squared_error")
        search.fit(x, y)
        scores.append(search.best_score_)
        winners.append(search.best_estimator_)

    # Scores are negated RMSE values, so the largest one is the best.
    top_score = max(scores)
    return top_score, winners[scores.index(top_score)]

In [None]:
model = AutoEncoder1() # Change to assigned autoencoder
x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()

# Pass the model explicitly: without this argument make_feature_extractor
# falls back to its own default and the instance created above is never trained.
feature_extractor = make_feature_extractor(x_pretrain, y_pretrain, model=model)
# Embed the labelled training set with the pretrained encoder, then pick the
# best regularised linear model on those features.
x_train_t = feature_extractor(x_train)
best_score, best_para = linear_regression(x_train_t, y_train)