## Imports

In [13]:
import pandas as pd
import numpy as np
import re
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from rdkit import Chem
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.base import BaseEstimator, TransformerMixin

## Load data

In [14]:
def _read_features(path):
    """Read one zipped feature CSV a single time and split it into two parts.

    Args:
        path: Path to a zip-compressed CSV with an ``Id`` column and a
            ``smiles`` column.

    Returns:
        ``(features, smiles)`` where ``features`` is the DataFrame indexed by
        ``Id`` with the ``smiles`` column dropped, and ``smiles`` is a 1-D
        NumPy array holding the ``smiles`` column in the same row order.
    """
    frame = pd.read_csv(path, index_col="Id", compression='zip')
    return frame.drop("smiles", axis=1), frame["smiles"].to_numpy()


def _read_labels(path):
    """Read one zipped label CSV (indexed by ``Id``) as a NumPy array.

    The trailing axis is squeezed away, so this assumes the file has exactly
    one label column.
    """
    return pd.read_csv(path, index_col="Id", compression='zip').to_numpy().squeeze(-1)


# Each feature file is parsed once; descriptors and SMILES come from the same frame,
# so their row order is guaranteed to match.
_x_pretrain_frame, smiles_pretrain = _read_features("public/pretrain_features.csv.zip")
x_pretrain = _x_pretrain_frame.to_numpy()
y_pretrain = _read_labels("public/pretrain_labels.csv.zip")

_x_train_frame, smiles_train = _read_features("public/train_features.csv.zip")
x_train = _x_train_frame.to_numpy()
y_train = torch.tensor(_read_labels("public/train_labels.csv.zip"), dtype=torch.float, requires_grad=False)

# x_test is deliberately kept as a DataFrame: its Id index is needed when writing predictions.
x_test, smiles_test = _read_features("public/test_features.csv.zip")

## SMILES features

In [15]:
def get_other_features(smiles):
    """Compute 19 hand-crafted descriptors for each SMILES string.

    Columns 0-14 are derived from the raw text (string length, literal
    substring counts, number of distinct characters, largest single digit);
    columns 15-18 come from the RDKit molecule parsed from the string (ring
    count, single-bond count, double-bond count, number of SP2 atoms).

    Args:
        smiles: Indexable sequence of SMILES strings; each element is passed
            through ``str`` before use.

    Returns:
        ``numpy.ndarray`` of shape ``(len(smiles), 19)`` and dtype float64.

    Raises:
        ValueError: If RDKit cannot parse one of the strings.
    """
    n_features = 19
    x = np.zeros((len(smiles), n_features))
    for i in range(len(smiles)):
        smile = str(smiles[i])
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles returns None on a parse failure; fail with the offending
            # string instead of an AttributeError further down.
            raise ValueError(f"RDKit could not parse SMILES at index {i}: {smile!r}")

        # --- text-based features (str.count counts non-overlapping occurrences) ---
        x[i][0] = len(smile)
        x[i][1] = smile.count("c")
        x[i][2] = smile.count("C")
        x[i][3] = smile.count("s")
        x[i][4] = smile.count("-")
        x[i][5] = smile.count("nH")
        x[i][6] = smile.count("SiH2")
        x[i][7] = len(set(smile))
        x[i][8] = smile.count("=")
        x[i][9] = smile.count("o")
        x[i][10] = smile.count("e")
        x[i][11] = smile.count("cc")
        x[i][12] = smile.count("ccc")
        # Largest single digit in the string; 0 when the string has no digit at all
        # (max() over an empty sequence would otherwise raise).
        x[i][13] = max((int(d) for d in re.findall(r'\d', smile)), default=0)
        x[i][14] = smile.count("(")

        # --- structure-based features from the parsed molecule ---
        x[i][15] = mol.GetRingInfo().NumRings()
        bond_types = [bond.GetBondType() for bond in mol.GetBonds()]  # walk the bonds once
        x[i][16] = sum(1 for bond_type in bond_types if bond_type == Chem.rdchem.BondType.SINGLE)
        x[i][17] = sum(1 for bond_type in bond_types if bond_type == Chem.rdchem.BondType.DOUBLE)
        x[i][18] = sum(1 for atom in mol.GetAtoms() if atom.GetHybridization().name == 'SP2')
    return x

## Neural Net

In [16]:
class Net(nn.Module):
    """Fully connected regressor: two ReLU hidden layers, each followed by dropout.

    Layer order in ``forward`` is ``fc1 -> drop1 -> fc3 -> drop2 -> fc2``; the
    attribute names are kept as-is because other code replaces ``fc2`` to read
    out the last hidden layer.

    Args:
        ad: Number of input features.
        hn1: Width of the first hidden layer.
        hn2: Width of the second hidden layer.
        do1: Dropout probability after the first hidden layer.
        do2: Dropout probability after the second hidden layer.
    """

    def __init__(self, ad, hn1, hn2, do1, do2):
        super().__init__()
        self.fc1 = nn.Linear((ad), hn1)
        self.drop1 = nn.Dropout(do1)
        self.fc3 = nn.Linear(hn1, hn2)
        self.drop2 = nn.Dropout(do2)
        self.fc2 = nn.Linear(hn2, 1)

    def forward(self, x):
        """Map a ``(batch, ad)`` input to a ``(batch,)`` vector of predictions.

        Only the trailing size-1 output dimension is removed, so a batch of one
        sample keeps its batch axis. If ``fc2`` has been replaced by an identity
        module, the ``(batch, hn2)`` hidden activations are returned unchanged
        (for ``hn2 > 1``).
        """
        x = F.relu(self.fc1(x))
        x = self.drop1(x)
        x = F.relu(self.fc3(x))
        x = self.drop2(x)
        x = self.fc2(x)
        # squeeze(-1), not squeeze(): a bare squeeze would also drop a batch axis of size 1.
        return x.squeeze(-1)

## Feature extractor

In [17]:
def make_feature_extractor(x, smiles, y, batch_size=64, eval_size=1000, n_epochs=5):
    """Pretrain a ``Net`` on SMILES-derived descriptors and return a feature extractor.

    The network is trained with Adam (lr=0.01) and MSE loss on the output of
    ``get_other_features(smiles)``. A hold-out set of ``eval_size`` samples is
    evaluated at the start of every epoch and its loss printed.

    Args:
        x: Precomputed feature matrix. Accepted for interface compatibility but
            not used: the network is trained on the SMILES descriptors only.
        smiles: Sequence of SMILES strings for the pretraining set.
        y: Pretraining targets, one per SMILES string.
        batch_size: Mini-batch size for both loaders.
        eval_size: Size of the validation split (forwarded to
            ``train_test_split`` as ``test_size``).
        n_epochs: Number of training epochs.

    Returns:
        A function ``make_features(x)`` taking a pair ``(features, smiles)`` and
        returning a 2-D tensor whose columns are, in order: the activations of
        the last hidden layer, the network's prediction, and the raw SMILES
        descriptors.
    """
    # Pretraining data: the model input is the SMILES descriptor matrix.
    descriptors = get_other_features(smiles)
    n_inputs = descriptors.shape[1]
    x_tr, x_val, y_tr, y_val = train_test_split(
        descriptors, y, test_size=eval_size, random_state=0, shuffle=True
    )

    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float, requires_grad=False), torch.tensor(x_val, dtype=torch.float, requires_grad=False)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float, requires_grad=False), torch.tensor(y_val, dtype=torch.float, requires_grad=False)

    dataset_tr = TensorDataset(x_tr, y_tr)
    dataset_val = TensorDataset(x_val, y_val)
    loader_tr = DataLoader(dataset=dataset_tr, batch_size=batch_size, shuffle=True, num_workers=8)
    loader_val = DataLoader(dataset=dataset_val, batch_size=batch_size, shuffle=True, num_workers=8)

    # model declaration (input width follows the descriptor matrix instead of a literal)
    model = Net(ad=n_inputs, hn1=250, hn2=40, do1=0.1, do2=0.1)
    model.train()

    validation = False  # set True to stop early once validation loss stops improving
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.MSELoss()
    last_loss = 100

    for epoch in range(n_epochs):
        print(f'epoch: {epoch}')

        # validation: no gradients needed, and the mean is taken over the number
        # of batches (not the last batch index, which is one too small).
        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            for X, target in loader_val:
                output = model(X).reshape(target.shape)
                running_loss += criterion(output, target).item()
        running_loss /= len(loader_val)
        print(f'validation loss: {running_loss :.3f}')
        if (validation and last_loss <= running_loss + 0.002): break
        last_loss = running_loss

        # training
        model.train()
        running_loss = 0.0
        for X, target in loader_tr:
            # reshape keeps output and target aligned even for a final batch of one sample
            output = model(X).reshape(target.shape)
            loss = criterion(output, target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'training loss: {running_loss / len(loader_tr):.3f}')

    # Freeze the trained network and build, once, a copy whose output layer is an
    # identity so that its forward pass yields the last hidden layer's activations.
    model.eval()
    n_hidden = model.fc3.out_features
    hidden_model = copy.deepcopy(model)
    hidden_model.fc2 = nn.Sequential()
    hidden_model.eval()

    def make_features(x):
        """Turn a ``(features, smiles)`` pair into the regression feature matrix.

        Only ``x[1]`` (the SMILES strings) is used. The network was trained on
        SMILES descriptors alone, so it must be fed exactly those columns;
        concatenating ``x[0]`` in front of them would not match the network's
        input width.
        NOTE(review): if the intent was to pretrain on ``x[0]`` as well, the
        training inputs above must be changed to match — confirm.
        """
        xs = get_other_features(x[1])
        xst = torch.tensor(xs)
        inputs = torch.tensor(xs, dtype=torch.float, requires_grad=False)

        with torch.no_grad():
            prediction = model(inputs).reshape(-1, 1)
            hidden = hidden_model(inputs).reshape(-1, n_hidden)
            features = torch.cat((hidden, prediction, xst), 1)
        return features

    return make_features

In [18]:
def make_pretraining_class(feature_extractors):
    """Create a scikit-learn transformer class backed by the given extractors.

    Args:
        feature_extractors: Mapping from a name to a callable that converts the
            pipeline input into a feature matrix.

    Returns:
        The ``PretrainedFeatures`` class (not an instance). Its
        ``feature_extractor`` argument selects which entry of
        ``feature_extractors`` is applied in ``transform``.
    """

    class PretrainedFeatures(BaseEstimator, TransformerMixin):
        """Stateless transformer that delegates to a named feature extractor."""

        def __init__(self, *, feature_extractor=None, mode=None):
            # Stored verbatim under the parameter names, as BaseEstimator expects.
            self.mode = mode
            self.feature_extractor = feature_extractor

        def fit(self, X=None, y=None):
            # Nothing is learned here; the extractor is already trained.
            return self

        def transform(self, X):
            assert self.feature_extractor is not None
            extractor = feature_extractors[self.feature_extractor]
            return extractor(X)

    return PretrainedFeatures

## Training

In [None]:

# Seed PyTorch's global RNG before the network is created and trained.
torch.manual_seed(0)

# Pretrain on the large pretraining set; the result is a callable feature extractor.
feature_extractor = make_feature_extractor(
    x=x_pretrain,
    smiles=smiles_pretrain,
    y=y_pretrain,
)

# Register the extractor under the name the pipelines below select it by.
extractor_registry = {"pretrain": feature_extractor}
PretrainedFeatureClass = make_pretraining_class(extractor_registry)


## Validation

In [20]:
# Regressor fitted on top of the pretrained features.
regression_model = RidgeCV()

# Repeated random hold-out validation: average RMSE and explained variance over
# n_splits different 10-sample validation sets of the small training set.
n_splits = 100  # named n_splits so the builtin iter() is not shadowed
score = 0
variance = 0
for seed in range(n_splits):
    pipe_split = Pipeline(steps=[('feature_extraction', PretrainedFeatureClass(feature_extractor="pretrain")), ('model', regression_model)])
    x_train_tr, x_train_val, smiles_train_tr, smiles_train_val, y_train_tr, y_train_val = train_test_split(
        x_train, smiles_train, y_train, test_size=10, shuffle=True, random_state=seed
    )
    pipe_split.fit((x_train_tr, smiles_train_tr), y_train_tr)

    # Predict once and reuse the result for both metrics.
    y_val_pred = pipe_split.predict((x_train_val, smiles_train_val))
    variance += explained_variance_score(y_train_val, y_val_pred)
    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike the
    # deprecated ``squared=False`` keyword.
    score += np.sqrt(mean_squared_error(y_train_val, y_val_pred))
print(score/n_splits)
print(variance/n_splits)


0.12833352477569837
0.8266200336923569


## Testing

In [21]:
# Final model: same pipeline as in validation, refitted on the full training set.
final_steps = [
    ('feature_extraction', PretrainedFeatureClass(feature_extractor="pretrain")),
    ('model', regression_model),
]
pipe = Pipeline(steps=final_steps)
pipe.fit((x_train, smiles_train), y_train)

# The extractor expects a (features, smiles) pair of arrays.
x_test_ = x_test.to_numpy()
y_pred = pipe.predict((x_test_, smiles_test))

# One prediction per test row, then write them out keyed by the original Id index.
n_test_rows = x_test.shape[0]
assert y_pred.shape == (n_test_rows,)
y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
y_pred.to_csv("results.csv", index_label="Id")
print("Predictions saved, all done!")

Predictions saved, all done!
