In [1]:
# This serves as a template which will guide you through the implementation of this task.  It is advised
# to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps
# First, we import necessary libraries:
from pprint import pprint

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import scale
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from sklearn.linear_model import LinearRegression, Lasso

In [2]:
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x19f0e9facb0>

In [3]:
def load_data():
    """
    This function loads the data from the csv files and returns it as numpy arrays.

    input: None

    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: np.ndarray, the features of the test set
    """
    x_pretrain = pd.read_csv("public/pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles",
                                                                                                         axis=1).to_numpy()
    y_pretrain = pd.read_csv("public/pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_train = pd.read_csv("public/train_features.csv.zip", index_col="Id", compression='zip').drop("smiles",
                                                                                                   axis=1).to_numpy()
    y_train = pd.read_csv("public/train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_test = pd.read_csv("public/test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
    return x_pretrain, y_pretrain, x_train, y_train, x_test

In [4]:
x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()
print("Data loaded!")

Data loaded!


In [62]:
class Net(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """

    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()
        # TODO: Define the architecture of the model. It should be able to be trained on pretraining data
        # and then used to extract features from the training and test data.

        # activation = nn.ReLU()

        # THINGS TO INCLUDE: BATCH NORM, DROPOUT
        self.encoder = nn.Sequential(
            nn.Linear(1000, 500), nn.BatchNorm1d(500),
            # nn.Sigmoid(),
            nn.ReLU(),
            nn.Linear(500, 250), nn.BatchNorm1d(250),
            # nn.Sigmoid(),
            nn.ReLU(),
            nn.Linear(250, 90), nn.BatchNorm1d(90),
            # nn.Sigmoid()
            # nn.ReLU(),
            # nn.Linear(100, 50), nn.BatchNorm1d(50),
            # nn.ReLU(),
            # nn.Linear(50, 10), nn.BatchNorm1d(50),
            # nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            # nn.Linear(10, 50), nn.BatchNorm1d(50), nn.ReLU(),
            # nn.Linear(50, 100), nn.BatchNorm1d(100), nn.ReLU(),
            nn.Linear(90, 250), nn.BatchNorm1d(250),
            # nn.Sigmoid(),
            nn.ReLU(),
            nn.Linear(250, 500), nn.BatchNorm1d(500),
            # nn.Sigmoid(),
            nn.ReLU(),
            nn.Linear(500, 1000)
        )
        self.last_linear = nn.Sequential(nn.Linear(90, 1))

    def forward(self, x):
        """
        The forward pass of the model.
        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        # TODO: Implement the forward pass of the model, in accordance with the architecture
        # defined in the constructor.
        x = self.encoder(x)
        y = self.last_linear(x)
        x = self.decoder(x)
        return x, y

In [63]:
def make_feature_extractor(x, y, batch_size=256, eval_size=1000):
    """
    This function trains the feature extractor on the pretraining data and returns a function which
    can be used to extract features from the training and test data.

    input: x: np.ndarray, the features of the pretraining set
              y: np.ndarray, the labels of the pretraining set
                batch_size: int, the batch size used for training
                eval_size: int, the size of the validation set

    output: make_features: function, a function which can be used to extract features from the training and test data
    """
    # Pretraining data loading
    in_features = x.shape[-1]
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)

    # model declaration
    model = Net()
    model.train()

    # TODO: Implement the training loop. The model should be trained on the pretraining data. Use validation set
    # to monitor the loss.

    criterion_decoded = nn.MSELoss()
    criterion_predictions = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    n_epochs = 5
    a = 0.1

    losses = []
    valid_losses = []

    train_loader = DataLoader(
        dataset=TensorDataset(x_tr, y_tr),
        batch_size=batch_size,
        shuffle=True
    )

    valid_loader = DataLoader(
        dataset=TensorDataset(x_val, y_val),
        batch_size=batch_size,
        shuffle=True
    )

    for epoch in range(n_epochs):
        train_loss_epoch = []
        valid_loss_epoch = []
        with tqdm(train_loader, unit="batch") as tepoch:
            for data, target in tepoch:
                tepoch.set_description(f"Epoch {epoch} train")

                optimizer.zero_grad()

                decoded_features, predictions = model(data)
                predictions = predictions.squeeze()

                loss_decoded = criterion_decoded(decoded_features, data)
                loss_predictions = criterion_predictions(predictions, target)

                train_loss = a*loss_predictions + (1-a)*loss_decoded

                train_loss_epoch.append(train_loss.item())

                train_loss.backward()
                optimizer.step()

                tepoch.set_postfix({'Train loss': train_loss.item()})

        train_loss_avg = np.mean(train_loss_epoch)

        with torch.no_grad():
            with tqdm(valid_loader, unit="batch") as tepoch:
                for valid_data, valid_target in tepoch:
                    tepoch.set_description(f"Epoch {epoch} valid")

                    valid_decoded_features, valid_predictions = model(valid_data)
                    valid_predictions = valid_predictions.squeeze()

                    valid_loss_decoded = criterion_decoded(valid_decoded_features, valid_data)
                    valid_loss_predictions = criterion_predictions(valid_predictions, valid_target)

                    valid_loss = a*valid_loss_predictions + (1-a)*valid_loss_decoded

                    valid_loss_epoch.append(valid_loss.item())

                    tepoch.set_postfix({'Validation loss': valid_loss.item()})

        valid_loss_avg = np.mean(valid_loss_epoch)


        losses.append(train_loss_avg)
        valid_losses.append(valid_loss_avg)

        print('Final train loss: ', train_loss_avg, 'Final valid loss: ', valid_loss_avg)

    def make_features(x):
        """
        This function extracts features from the training and test data, used in the actual pipeline
        after the pretraining.

        input: x: np.ndarray, the features of the training or test set

        output: features: np.ndarray, the features extracted from the training or test set, propagated
        further in the pipeline
        """
        model.eval()
        # TODO: Implement the feature extraction, a part of a pretrained model used later in the pipeline.
        model_no_last_layers = nn.Sequential(*list(model.children())[:-2])

        with torch.no_grad():
            if isinstance(x, pd.DataFrame):
                x = x.to_numpy()
            x = torch.tensor(x, dtype=torch.float)
            x_features = model_no_last_layers(x)

        return x_features

    return make_features

In [64]:
# Utilize pretraining data by creating feature extractor which extracts lumo energy
# features from available initial features
feature_extractor = make_feature_extractor(x_pretrain, y_pretrain)

x_train_transformed = feature_extractor(x_train).numpy()
x_test_transformed = feature_extractor(x_test).numpy()

y_pred = np.zeros(x_test.shape[0])

# STANDARDSCALER, FUNCTIONTRANSFORMER, etc.
# x_train_transformed = scale(x_train_transformed)
# x_test_transformed = scale(x_test_transformed)

Epoch 0 train: 100%|██████████| 192/192 [00:04<00:00, 41.47batch/s, Train loss=0.0285]
Epoch 0 valid: 100%|██████████| 4/4 [00:00<00:00, 96.35batch/s, Validation loss=0.0277]


Final train loss:  0.21637873087699214 Final valid loss:  0.027751350309699774


Epoch 1 train: 100%|██████████| 192/192 [00:04<00:00, 42.12batch/s, Train loss=0.0242]
Epoch 1 valid: 100%|██████████| 4/4 [00:00<00:00, 106.63batch/s, Validation loss=0.0248]


Final train loss:  0.025789997608323272 Final valid loss:  0.02399823348969221


Epoch 2 train: 100%|██████████| 192/192 [00:05<00:00, 33.56batch/s, Train loss=0.0217]
Epoch 2 valid: 100%|██████████| 4/4 [00:00<00:00, 78.43batch/s, Validation loss=0.0206]


Final train loss:  0.022239563821737345 Final valid loss:  0.02078984398394823


Epoch 3 train: 100%|██████████| 192/192 [00:06<00:00, 29.57batch/s, Train loss=0.0192]
Epoch 3 valid: 100%|██████████| 4/4 [00:00<00:00, 79.17batch/s, Validation loss=0.0196]


Final train loss:  0.019683974765939638 Final valid loss:  0.01948577957227826


Epoch 4 train: 100%|██████████| 192/192 [00:06<00:00, 30.73batch/s, Train loss=0.0169]
Epoch 4 valid: 100%|██████████| 4/4 [00:00<00:00, 82.44batch/s, Validation loss=0.018]


Final train loss:  0.01753880308630566 Final valid loss:  0.017389880493283272


In [90]:
from sklearn.model_selection import LeaveOneOut

print('alpha =', 0)
pprint(np.mean(cross_val_score(LinearRegression(), x_train_transformed, y_train, cv=LeaveOneOut(), scoring='neg_root_mean_squared_error')))
for alpha in [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100]:
    print('alpha =', alpha)
    pprint(np.mean(cross_val_score(Ridge(alpha=alpha), x_train_transformed, y_train, cv=LeaveOneOut(), scoring='neg_root_mean_squared_error')))

alpha = 0
-0.490482071318851
alpha = 0.001
-0.23444284915176922
alpha = 0.01
-0.16603550085346433
alpha = 0.05
-0.15191928117631673
alpha = 0.1
-0.1511619747964349
alpha = 0.5
-0.1502997804813138
alpha = 1
-0.15078359663241184
alpha = 5
-0.15995668548899838
alpha = 10
-0.17120536150849006
alpha = 100
-0.21013934673300597


In [91]:
regression_model = Ridge(alpha=0.5)

In [92]:
regression_model.fit(x_train_transformed, y_train)
y_pred = regression_model.predict(x_test_transformed)

assert y_pred.shape == (x_test.shape[0],)
y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
y_pred.to_csv("results.csv", index_label="Id")
print("Predictions saved, all done!")

Predictions saved, all done!
