In [43]:
# This serves as a template which will guide you through the implementation of this task.  It is advised
# to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps
# First, we import necessary libraries:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, WhiteKernel
from sklearn.model_selection import GridSearchCV
import itertools
import warnings
warnings.filterwarnings('ignore')

In [44]:
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x180a8ebb270>

In [3]:
def load_data():
    """
    This function loads the data from the csv files and returns it as numpy arrays.

    input: None

    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: np.ndarray, the features of the test set
    """
    x_pretrain = pd.read_csv("public/pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles",
                                                                                                         axis=1).to_numpy()
    y_pretrain = pd.read_csv("public/pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_train = pd.read_csv("public/train_features.csv.zip", index_col="Id", compression='zip').drop("smiles",
                                                                                                   axis=1).to_numpy()
    y_train = pd.read_csv("public/train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_test = pd.read_csv("public/test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
    return x_pretrain, y_pretrain, x_train, y_train, x_test

In [74]:
class Net(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """

    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()
        # TODO: Define the architecture of the model. It should be able to be trained on pretraining data
        # and then used to extract features from the training and test data.
        #self.fully_con1 = nn.Sequential(nn.Linear(1000, 128), nn.ReLU())
        self.fully_con1 = nn.Linear(1000, 1)

    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        # TODO: Implement the forward pass of the model, in accordance with the architecture
        # defined in the constructor.
        x = self.fully_con1(x)
        #x = self.fully_con2(x)
        return x

In [75]:
def make_feature_extractor(x, y, batch_size=256, eval_size=1000):
    """
    This function trains the feature extractor on the pretraining data and returns a function which
    can be used to extract features from the training and test data.

    input: x: np.ndarray, the features of the pretraining set
              y: np.ndarray, the labels of the pretraining set
                batch_size: int, the batch size used for training
                eval_size: int, the size of the validation set

    output: make_features: function, a function which can be used to extract features from the training and test data
    """
    # Pretraining data loading
    in_features = x.shape[-1]
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)

    # model declaration
    model = Net()
    model.train()

    # TODO: Implement the training loop. The model should be trained on the pretraining data. Use validation set
    # to monitor the loss.

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2) # 1e-2
    n_epochs = 5

    losses = []
    valid_losses = []

    train_loader = DataLoader(
        dataset=TensorDataset(x_tr, y_tr),
        batch_size=batch_size,
        shuffle=True
    )

    valid_loader = DataLoader(
        dataset=TensorDataset(x_val, y_val),
        batch_size=batch_size,
        shuffle=True
    )

    for epoch in range(n_epochs):
        train_loss_epoch = []
        valid_loss_epoch = []
        with tqdm(train_loader, unit="batch") as tepoch:
            for data, target in tepoch:
                tepoch.set_description(f"Epoch {epoch} train")

                optimizer.zero_grad()

                predictions = model(data).squeeze()
                loss = criterion(predictions, target)
                train_loss_epoch.append(loss.item())

                loss.backward()
                optimizer.step()

                train_loss_avg = np.sum(train_loss_epoch) / len(train_loss_epoch)
                tepoch.set_postfix({'Train loss': train_loss_avg})


            with torch.no_grad():
                with tqdm(valid_loader, unit="batch") as tepoch:
                    for valid_data, valid_target in tepoch:
                        tepoch.set_description(f"Epoch {epoch} valid")
                        valid_predictions = model(valid_data).squeeze()
                        valid_loss = criterion(valid_predictions, valid_target)
                        valid_loss_epoch.append(valid_loss.item())

                        valid_loss_avg = np.sum(valid_loss_epoch) / len(valid_loss_epoch)
                        tepoch.set_postfix({'Validation loss': valid_loss.item()})

        losses.append(train_loss_avg)
        valid_losses.append(valid_loss_avg)

        print('Final train loss: ', train_loss_avg, 'Final valid loss: ', valid_loss_avg)

    def make_features(x):
        """
        This function extracts features from the training and test data, used in the actual pipeline
        after the pretraining.

        input: x: np.ndarray, the features of the training or test set

        output: features: np.ndarray, the features extracted from the training or test set, propagated
        further in the pipeline
        """
        model.eval()
        # TODO: Implement the feature extraction, a part of a pretrained model used later in the pipeline.
        model_no_last_layer = nn.Sequential(*list(model.children())[:-1])

        with torch.no_grad():
            if isinstance(x, pd.DataFrame):
                x = x.to_numpy()
            x = torch.tensor(x, dtype=torch.float)
            x_features = model_no_last_layer(x)

        return x_features

    return make_features

In [6]:
def make_pretraining_class(feature_extractors):
    """
    The wrapper function which makes pretraining API compatible with sklearn pipeline

    input: feature_extractors: dict, a dictionary of feature extractors

    output: PretrainedFeatures: class, a class which implements sklearn API
    """

    class PretrainedFeatures(BaseEstimator, TransformerMixin):
        """
        The wrapper class for Pretraining pipeline.
        """

        def __init__(self, *, feature_extractor=None, mode=None):
            self.feature_extractor = feature_extractor
            self.mode = mode

        def fit(self, X=None, y=None):
            return self

        def transform(self, X):
            assert self.feature_extractor is not None
            X_new = feature_extractors[self.feature_extractor](X)
            return X_new

    return PretrainedFeatures

In [36]:
def get_regression_model():
    """
    This function returns the regression model used in the pipeline.

    input: None

    output: model: sklearn compatible model, the regression model
    """
    # TODO: Implement the regression model. It should be able to be trained on the features extracted
    # by the feature extractor.
    
    # model = Ridge(alpha=0)
    
    model = GaussianProcessRegressor(optimizer='fmin_l_bfgs_b', random_state=None, n_restarts_optimizer=0)
    
    return model

In [17]:
x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()
print("Data loaded!")

Data loaded!


In [78]:
# Utilize pretraining data by creating feature extractor which extracts lumo energy
# features from available initial features
feature_extractor = make_feature_extractor(x_pretrain, y_pretrain)
PretrainedFeatureClass = make_pretraining_class({"pretrain": feature_extractor})

# regression model
regression_model = get_regression_model()

y_pred = np.zeros(x_test.shape[0])
# TODO: Implement the pipeline. It should contain feature extraction and regression. You can optionally
# use other sklearn tools, such as StandardScaler, FunctionTransformer, etc.
x_train_transformed = feature_extractor(x_train).numpy()
x_test_transformed = feature_extractor(x_test).numpy()

param_range = [np.arange(0, 1.0, 0.2), np.arange(0, 1.0, 0.2)]

param_grid = [
    {
    "alpha":  [1e-2, 1e-3, 1e-3],
    "kernel": [Matern(length_scale, nu=1.5) + WhiteKernel() for length_scale in np.arange(0, 1.0, 0.1)]}# list(itertools.product(*param_range))]}
]

clf = GridSearchCV(estimator=regression_model, param_grid=param_grid, cv=5, refit=True, n_jobs = 5, verbose=True,
                       scoring='%s' % 'neg_root_mean_squared_error')

clf.fit(x_train_transformed, y_train)
regression_model = clf.best_estimator_
print(clf.best_params_)

regression_model.fit(x_train_transformed, y_train)
y_pred = regression_model.predict(x_test_transformed)

assert y_pred.shape == (x_test.shape[0],)
y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
y_pred.to_csv("results.csv", index_label="Id")
print("Predictions saved, all done!")

Epoch 0 train: 100%|██████████| 192/192 [00:00<00:00, 202.54batch/s, Train loss=0.324]
Epoch 0 valid: 100%|██████████| 4/4 [00:00<00:00, 133.34batch/s, Validation loss=0.0799]


Final train loss:  0.3243788054484564 Final valid loss:  0.05493928864598274


Epoch 1 train: 100%|██████████| 192/192 [00:00<00:00, 204.26batch/s, Train loss=0.0503]
Epoch 1 valid: 100%|██████████| 4/4 [00:00<00:00, 114.28batch/s, Validation loss=0.0504]


Final train loss:  0.050292948338513575 Final valid loss:  0.045554437674582005


Epoch 2 train: 100%|██████████| 192/192 [00:00<00:00, 204.69batch/s, Train loss=0.0418]
Epoch 2 valid: 100%|██████████| 4/4 [00:00<00:00, 137.93batch/s, Validation loss=0.0296]


Final train loss:  0.04183835885487497 Final valid loss:  0.03759887535125017


Epoch 3 train: 100%|██████████| 192/192 [00:00<00:00, 205.35batch/s, Train loss=0.0357]
Epoch 3 valid: 100%|██████████| 4/4 [00:00<00:00, 142.85batch/s, Validation loss=0.0321]


Final train loss:  0.03569615497447861 Final valid loss:  0.030999042559415102


Epoch 4 train: 100%|██████████| 192/192 [00:00<00:00, 215.01batch/s, Train loss=0.0309]
Epoch 4 valid: 100%|██████████| 4/4 [00:00<00:00, 142.85batch/s, Validation loss=0.0221]


Final train loss:  0.030930804001400247 Final valid loss:  0.028341423720121384
Fitting 5 folds for each of 30 candidates, totalling 150 fits
{'alpha': 0.001, 'kernel': Matern(length_scale=0.8, nu=1.5) + WhiteKernel(noise_level=1)}
Predictions saved, all done!
