In [31]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import time
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import  KFold
from src.utils import scale, eval_model
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import seaborn as sns
from src.corr import non_corr_features
from src.maplight_gnn import get_representation
from src.utils import get_fps_cols, scale, OffsetScaler, to_submit, drop_nans_non_unique, BlendingClassifier

train = pd.read_csv('../data/processed/train.csv', index_col=0)
test = pd.read_csv('../data/processed/test.csv', index_col=0)

In [33]:
def prepare_data(**params):
    X_train = get_representation(train.smi, **params)
    X_test = get_representation(test.smi, **params)

    X_train = drop_nans_non_unique(X_train)
    X_test = X_test[X_train.columns]

    fps_offset = 1024 * params['morgan_fps'] + \
                 1024 * params['avalon_fps'] + \
                 315 * params['erg_fps']

    scaler = OffsetScaler(fps_offset)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    ohe = OneHotEncoder(sparse_output=False)
    prop = ohe.fit_transform(train[['prop']])
    X_train = pd.concat([X_train, pd.DataFrame(prop, columns=ohe.get_feature_names_out())], axis=1)

    prop = ohe.transform(test[['prop']])
    X_test = pd.concat([X_test, pd.DataFrame(prop, columns=ohe.get_feature_names_out())], axis=1)
    return X_train, train.target, X_test

params = {
    'morgan_fps':False,
    'avalon_fps':False,
    'erg_fps':False,
    'rdkit_feats':False,
    'mord_feats': True,
    'gin_gnn': False,
}

mord_train, y_train, mord_test = prepare_data(**params)
mord_train.shape[1]

930

In [34]:
from src.corr import non_corr_features

mord_train_uncor = non_corr_features(mord_train, y_train, threshold=0.8)
mord_test_uncor = mord_test[mord_train_uncor.columns]
mord_train_uncor.shape[1]

263

In [35]:
params = {
    'morgan_fps': True,
    'avalon_fps': True,
    'erg_fps': True,
    'rdkit_feats': True,
    'mord_feats': False,
    'gin_gnn': True,
}

X_train, y_train, X_test = prepare_data(**params)
X_train.shape[1], X_train.columns.str.contains('rd_').sum()

(2838, 192)

In [36]:
X_train = pd.concat([X_train, mord_train_uncor], axis=1)
X_test = pd.concat([X_test, mord_test_uncor], axis=1)

X_train.shape[1], X_train.columns.str.contains('rd_|md_').sum()

(3101, 452)

In [37]:
X_train.to_pickle('../data/processed/X_train_mpl_mord.pkl.zip')
X_test.to_pickle('../data/processed/X_test_mpl_mord.pkl.zip')

In [38]:
X_train.shape, X_test.shape

((7939, 3101), (1221, 3101))