In [35]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

import sys
# Add the parent directory to the import path (presumably so the `src`
# package imported in later cells resolves -- confirm the repo layout).
sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
import pandas as pd
import numpy as np
from src.utils import get_fps_offset, OffsetScaler
from src import config

# --- Load the processed train/test matrices and the training target ---------
train = pd.read_csv('../data/processed/X_train.csv')
test = pd.read_csv('../data/processed/X_test.csv')
# Flatten the target frame into a 1-D array.
y_train = pd.read_csv('../data/processed/y_train.csv').values.ravel()

# --- Group the columns ------------------------------------------------------
FPS_OFFSET = get_fps_offset(train.columns)
# Columns named '0' .. str(FPS_OFFSET - 1) (presumably the fingerprint bits).
fps_cols = list(map(str, range(FPS_OFFSET)))
# NOTE(review): these are substring matches, not prefix matches -- any column
# merely containing 'rd_' / 'md_' is captured too; confirm that is intended.
rd_cols = [col for col in train.columns if 'rd_' in str(col)]
md_cols = [col for col in train.columns if 'md_' in str(col)]
# Every remaining column that belongs to none of the three groups above.
features = list(set(train.columns) - set(rd_cols + md_cols + fps_cols))

# Hand-picked subset of 50 descriptor columns used for modelling.
top_50_feats = [
    'rd_MolLogP',
    'md_FilterItLogS',
    'md_SMR_VSA9',
    'ALogPS_logP',
    'ALogPS_logS',
    'md_SlogP_VSA11',
    'md_AATS0v',
    'md_ATSC1i',
    'md_BertzCT',
    'SsOH(phen)',
    'md_ATSC1pe',
    'md_NaasC',
    'md_piPC2',
    'md_ATS8dv',
    'md_Mv',
    'md_ATSC1p',
    'rd_NOCount',
    'MW',
    'md_JGI6',
    'md_SsOH',
    'md_ATSC3Z',
    'md_AATS0i',
    'md_ATSC1m',
    'md_nBondsKD',
    'md_ATSC1v',
    'md_PEOE_VSA6',
    'md_SMR_VSA3',
    'md_CIC5',
    'md_NsssCH',
    'rd_VSA_EState5',
    'rd_VSA_EState8',
    'md_JGI7',
    'md_ATS8i',
    'rd_NumHAcceptors',
    'rd_VSA_EState9',
    'md_ATSC8m',
    'rd_VSA_EState3',
    'md_nHBDon',
    'Se1C2C3sd',
    'md_nAcid',
    'md_MIC5',
    'md_ATSC3i',
    'SdO(amid)',
    'rd_NumAliphaticHeterocycles',
    'md_JGI8',
    'md_IC5',
    'SeaC3C3aa',
    'md_ATSC5pe',
    'md_nS',
    'md_SMR_VSA4',
]

# --- Build the model matrices -----------------------------------------------
scaler = OffsetScaler(FPS_OFFSET)

# The scaler is fitted on the training rows only and then reused for the test
# rows; both matrices use the same column order.
selected_cols = fps_cols + top_50_feats
X_train = scaler.fit_transform(train[selected_cols].values)
X_test = scaler.transform(test[selected_cols].values)

In [37]:
import time

import numpy as np
import pandas as pd

from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler

from src.utils import OffsetScaler, get_fps_offset
import xgboost as xgb
from catboost import CatBoostRegressor
import seaborn as sns


# scikit-learn scorer names (negated errors, so larger values are better).
mae = 'neg_mean_absolute_error'
mse = 'neg_mean_squared_error'
rmse = 'neg_root_mean_squared_error'

RANDOM_SEED = 42
N_JOBS = 12

# Candidate regressors, keyed by the short label used when reporting scores.
models = {
    # 'LR': LinearRegression(),
    # 'Ridge': Ridge(),
    'PLS': PLSRegression(),
    'DT': DecisionTreeRegressor(random_state=RANDOM_SEED),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(n_jobs=N_JOBS),
    'SVR': SVR(),
    'RF': RandomForestRegressor(
        n_estimators=200,
        random_state=RANDOM_SEED,
        n_jobs=N_JOBS,
    ),
    'XGB': xgb.XGBRegressor(
        n_estimators=1000,
        random_state=RANDOM_SEED,
        n_jobs=N_JOBS,
        verbosity=0,
    ),
    'CATB': CatBoostRegressor(
        loss_function='RMSE',
        verbose=False,
        random_seed=RANDOM_SEED,
        thread_count=N_JOBS,
    ),
    # 'CATB GPU': CatBoostRegressor(loss_function='RMSE', verbose=False, random_seed=RANDOM_SEED, thread_count=N_JOBS, task_type='GPU'),
}

# NOTE(review): this rebinds `scaler` to a fresh, unfitted OffsetScaler and
# replaces the instance fitted in the data-loading cell -- confirm intended.
scaler = OffsetScaler(offset=FPS_OFFSET)
ss = StandardScaler()

In [38]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingRegressor

In [42]:
def evaluate(X, seed=RANDOM_SEED):
    """Cross-validate every regressor in the global ``models`` dict on ``X``.

    Each model is scored by ``evaluate_single``, which runs a shuffled 5-fold
    CV against the global ``y_train`` and prints one line per model:
    label, ``mean - std``, ``mean ± std`` and the elapsed seconds.

    Parameters
    ----------
    X : array-like
        Training feature matrix, row-aligned with the global ``y_train``.
    seed : int
        Random state of the fold splitter, so every model sees the same folds.

    Returns
    -------
    pandas.DataFrame
        One column per model label and one row per fold, holding the
        per-fold scores (negated RMSE).
    """
    # Delegate to evaluate_single so the CV protocol and the report format are
    # defined in exactly one place. Dict order keeps the models' print order.
    results = {
        name: evaluate_single(model, X, name=name, seed=seed)
        for name, model in models.items()
    }
    return pd.DataFrame(results)

def evaluate_single(model, X, name=None, seed=RANDOM_SEED):
    """Run a shuffled 5-fold cross-validation of ``model`` and report it.

    The model is scored against the global ``y_train`` with the global
    ``rmse`` scorer. One summary line is printed: the label, ``mean - std``,
    ``mean ± std`` and the wall-clock seconds spent.

    Parameters
    ----------
    model : estimator
        Regressor to cross-validate.
    X : array-like
        Training feature matrix, row-aligned with ``y_train``.
    name : str or None
        Label printed at the start of the line; ``None`` prints an empty label.
    seed : int
        Random state of the ``KFold`` splitter.

    Returns
    -------
    numpy.ndarray
        The five per-fold scores.
    """
    started = time.time()

    splitter = KFold(n_splits=5, shuffle=True, random_state=seed)
    scores = cross_val_score(model, X, y_train, cv=splitter, scoring=rmse)

    elapsed = time.time() - started
    label = name if name is not None else ''
    avg, spread = scores.mean(), scores.std()
    print("%5s: %3.3f     %3.3f ± %3.3f      %.1fs" % (label, avg - spread, avg, spread, elapsed))
    return scores

In [40]:
# Baseline: cross-validate every model in `models` on the scaled training
# matrix (the trailing `;` suppresses the returned DataFrame's repr).
evaluate(X_train);

  PLS: -30.922     -29.249 ± 1.673      0.2s
   DT: -33.612     -32.761 ± 0.851      0.8s
Lasso: -29.341     -28.264 ± 1.077      1.1s
  KNN: -29.000     -27.593 ± 1.407      0.2s
  SVR: -35.017     -34.298 ± 0.718      2.7s
   RF: -25.241     -23.391 ± 1.849      12.6s
  XGB: -26.694     -24.906 ± 1.788      13.6s
 CATB: -24.281     -22.882 ± 1.399      93.1s


In [25]:
# Stack five base learners under a linear meta-model and cross-validate it.
estimators = [
    (label, models[label])
    for label in ('CATB', 'RF', 'XGB', 'KNN', 'Lasso')
]

clf = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

evaluate_single(clf, X_train);

 None: -24.352     -22.818 ± 1.534      428.7s


array([-23.85412768, -25.06966055, -22.77018122, -20.76684548,
       -21.62735586])

In [41]:
# Same stacking setup without CatBoost among the base learners.
estimators = [
    (label, models[label])
    for label in ('RF', 'XGB', 'KNN', 'Lasso')
]

clf = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

evaluate_single(clf, X_train);

 None: -25.134     -23.297 ± 1.838      150.5s


In [43]:
# Reference score of the random forest on its own.
evaluate_single(models['RF'], X_train);

     : -25.241     -23.391 ± 1.849      15.4s


In [44]:
# Reference score of CatBoost on its own.
evaluate_single(models['CATB'], X_train);

     : -24.281     -22.882 ± 1.399      91.1s


In [61]:
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

class BlendingRegressor(BaseEstimator, RegressorMixin):
    """Weighted-average ensemble of independently fitted regressors.

    Parameters
    ----------
    models : list of (name, estimator) tuples
        Base regressors to blend. They are never fitted in place: ``fit``
        trains a fresh clone of each one.
    weights : sequence of float or None, default=None
        One weight per entry of ``models``, in the same order. The weights are
        handed to ``numpy.average``, which normalises by their sum, so they do
        not need to add up to 1. ``None`` gives every model the same weight.

    Attributes
    ----------
    models_ : list of (name, estimator) tuples
        The fitted clones, available after ``fit``.
    """

    def __init__(self, models, weights=None):
        # Keep constructor arguments exactly as given (scikit-learn estimator
        # convention) so get_params() / clone() round-trip them unchanged.
        self.models = models
        self.weights = weights

    def fit(self, X, y):
        """Fit a clone of every base model on ``(X, y)`` and return ``self``.

        Raises
        ------
        ValueError
            If ``weights`` is given and its length differs from ``models``.
        """
        # Local import: the defining cell only imports BaseEstimator and
        # RegressorMixin from sklearn.base.
        from sklearn.base import clone

        if self.weights is not None and len(self.weights) != len(self.models):
            raise ValueError(
                "weights must have one entry per model: got %d weights for %d models"
                % (len(self.weights), len(self.models))
            )

        # Train clones so estimators shared with other ensembles or cells
        # (e.g. the entries of a global model dict) are left untouched.
        fitted = []
        for name, model in self.models:
            fitted_model = clone(model)
            fitted_model.fit(X, y)
            fitted.append((name, fitted_model))
        self.models_ = fitted
        return self

    def predict(self, X):
        """Return the weighted average of the fitted models' predictions."""
        # One row of predictions per base model; average across that axis.
        predictions = np.array([model.predict(X) for _, model in self.models_])
        return np.average(predictions, axis=0, weights=self.weights)

  PLS: -30.922     -29.249 ± 1.673      0.2s
  
   DT: -33.612     -32.761 ± 0.851      0.8s
   
Lasso: -29.341     -28.264 ± 1.077      1.1s

  KNN: -29.000     -27.593 ± 1.407      0.2s
  
  SVR: -35.017     -34.298 ± 0.718      2.7s
  
   RF: -25.241     -23.391 ± 1.849      12.6s
   
  XGB: -26.694     -24.906 ± 1.788      13.6s
  
 CATB: -24.281     -22.882 ± 1.399      93.1s

In [63]:
# Individual reference scores for the two models blended in the next cell.
evaluate_single(models['KNN'], X_train);
evaluate_single(models['Lasso'], X_train);

     : -29.000     -27.593 ± 1.407      0.1s
     : -29.341     -28.264 ± 1.077      0.9s


In [64]:
# Equal-weight blend of KNN and Lasso.
estimators = [(label, models[label]) for label in ('KNN', 'Lasso')]

clf = BlendingRegressor(estimators, weights=[0.5, 0.5])

evaluate_single(clf, X_train);

     : -27.715     -26.515 ± 1.200      1.5s


In [68]:
# RF-dominated blend with small contributions from XGB, KNN and Lasso.
estimators = [
    (label, models[label])
    for label in ('RF', 'XGB', 'KNN', 'Lasso')
]

clf = BlendingRegressor(estimators, weights=[1, 0.2, 0.1, 0.1])

evaluate_single(clf, X_train);

     : -25.144     -23.388 ± 1.756      29.4s


In [69]:
# RF + KNN blend; KNN gets a relative weight of 0.1.
estimators = [(label, models[label]) for label in ('RF', 'KNN')]

clf = BlendingRegressor(estimators, weights=[1, 0.1])
evaluate_single(clf, X_train);

     : -25.194     -23.393 ± 1.801      15.8s


In [70]:
# Relies on `estimators` left over from an earlier cell (presumably the
# RF + KNN pair -- confirm on a fresh run); second weight set to 0.2.
clf = BlendingRegressor(estimators, weights=[1, 0.2])
evaluate_single(clf, X_train);

     : -25.218     -23.457 ± 1.761      12.6s


In [72]:
# Relies on `estimators` left over from an earlier cell (presumably the
# RF + KNN pair -- confirm on a fresh run); both models weighted equally.
clf = BlendingRegressor(estimators, weights=[1, 1])
evaluate_single(clf, X_train);

     : -25.997     -24.405 ± 1.593      15.8s


In [71]:
# Relies on `estimators` left over from an earlier cell (presumably the
# RF + KNN pair -- confirm on a fresh run); second weight set to 0.05.
clf = BlendingRegressor(estimators, weights=[1, 0.05])
evaluate_single(clf, X_train);

     : -25.206     -23.382 ± 1.824      15.6s


In [None]:
# RF + KNN blend; KNN gets a relative weight of 0.1.
estimators = [(label, models[label]) for label in ('RF', 'KNN')]

clf = BlendingRegressor(estimators, weights=[1, 0.1])

evaluate_single(clf, X_train);

In [76]:
# CatBoost + RF blend; RF gets a relative weight of 0.05.
estimators = [(label, models[label]) for label in ('CATB', 'RF')]

clf = BlendingRegressor(estimators, weights=[1, 0.05])

evaluate_single(clf, X_train);

     : -24.265     -22.845 ± 1.420      108.0s


In [73]:
# CatBoost + RF blend; RF gets a relative weight of 0.1.
estimators = [(label, models[label]) for label in ('CATB', 'RF')]

clf = BlendingRegressor(estimators, weights=[1, 0.1])

evaluate_single(clf, X_train);

     : -24.256     -22.817 ± 1.439      111.0s


In [74]:
# CatBoost + RF blend; RF gets a relative weight of 0.2.
estimators = [(label, models[label]) for label in ('CATB', 'RF')]

clf = BlendingRegressor(estimators, weights=[1, 0.2])

evaluate_single(clf, X_train);

     : -24.252     -22.780 ± 1.473      111.4s


In [75]:
# CatBoost + RF blend; RF gets a relative weight of 0.5.
estimators = [(label, models[label]) for label in ('CATB', 'RF')]

clf = BlendingRegressor(estimators, weights=[1, 0.5])

evaluate_single(clf, X_train);

     : -24.301     -22.753 ± 1.548      108.6s


In [77]:
# CatBoost-dominated blend of all five learners with decreasing weights.
estimators = [
    (label, models[label])
    for label in ('CATB', 'RF', 'XGB', 'KNN', 'Lasso')
]

clf = BlendingRegressor(estimators, weights=[1, 0.1, 0.05, 0.01, 0.01])

evaluate_single(clf, X_train);

     : -24.293     -22.793 ± 1.500      134.2s


In [78]:
# Keyword arguments per model family; each inner dict is unpacked straight
# into the matching estimator constructor in the next cell.
best_params = dict(
    rf=dict(
        n_estimators=200,
        max_depth=25,
        min_samples_split=2,
        min_samples_leaf=1,
        bootstrap=True,
        max_features=0.2,
    ),
    lasso=dict(
        alpha=1.0531212524084377,
    ),
    knn=dict(
        n_neighbors=10,
        weights='distance',
        algorithm='kd_tree',
    ),
    xgb=dict(
        n_estimators=500,
        max_depth=7,
        learning_rate=0.03009296959769392,
        subsample=0.8097241149092725,
        colsample_bytree=0.6696687175682526,
        gamma=0.8402157425699277,
        reg_alpha=0.26489296857909234,
        reg_lambda=0.35085412588178566,
    ),
)

In [80]:
# Instantiate each model family with the settings stored in `best_params`.
best_rf = RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=N_JOBS, **best_params['rf'])
best_xgb = xgb.XGBRegressor(random_state=RANDOM_SEED, n_jobs=N_JOBS, **best_params['xgb'])
best_knn = KNeighborsRegressor(**best_params['knn'])
best_lasso = Lasso(**best_params['lasso'])

# evaluate_single takes (model, X, name=None, seed=...): the estimator goes
# first and the label is the `name` keyword. Passing the label first would
# hand a string to cross_val_score as the estimator on a fresh kernel.
evaluate_single(best_knn, X_train, name='KNN');
evaluate_single(best_lasso, X_train, name='Lasso');
evaluate_single(best_rf, X_train, name='RF');
evaluate_single(best_xgb, X_train, name='XGB');

     : -28.103     -26.203 ± 1.900      1.9s
     : -29.335     -28.287 ± 1.048      0.9s
     : -24.915     -23.320 ± 1.595      3.3s
     : -24.508     -22.662 ± 1.846      11.2s


In [82]:
# Blend a freshly constructed CatBoost with the tuned XGB and RF models.
estimators = [
    (
        'CATB',
        CatBoostRegressor(
            verbose=False,
            random_seed=RANDOM_SEED,
            thread_count=N_JOBS,
        ),
    ),
    ('XGB', best_xgb),
    ('RF', best_rf),
    # ('KNN', best_knn),
    # ('Lasso', best_lasso),
]

clf = BlendingRegressor(estimators, weights=[1, 0.5, 0.2])

evaluate_single(clf, X_train);

     : -24.160     -22.601 ± 1.559      112.8s


In [None]:
# Blend a freshly constructed CatBoost with the tuned XGB and RF models.
estimators = [
    (
        'CATB',
        CatBoostRegressor(
            verbose=False,
            random_seed=RANDOM_SEED,
            thread_count=N_JOBS,
        ),
    ),
    ('XGB', best_xgb),
    ('RF', best_rf),
    # ('KNN', best_knn),
    # ('Lasso', best_lasso),
]

clf = BlendingRegressor(estimators, weights=[1, 0.5, 0.2])

evaluate_single(clf, X_train);

In [83]:
# Fit whichever ensemble `clf` currently refers to on the full training set
# and predict the test matrix.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [85]:
# Write the predictions as a single 'Results' column, without the index.
pd.Series(y_pred, name='Results').to_csv('../data/submissions/stack_xgb_cb_rf.csv', index=False)