# Поиск лекарства от COVID-19

## Постановка задачи
Необходимо предсказать тестовые данные на основе имеющихся значений pIC50 для тренировочного датасета и предположить, какая молекула или какой набор молекул могут иметь потенциальную активность против основной протеазы SARS-CoV-2.

## Исходные данные
Важно отметить, что мы ввели дополнительные десктрипторы:
- Docking score, рассчитанный посредством Шрёдингера (float)
- (Cl)ccc, структурный дескриптор (bool)
- C(=O)Nc2c, структурный дескриптор (bool)
- cncc2ccccc, структурный дескриптор (bool), однако в финальную версию он не вошёл из-за того, что оказался недостаточно селективен.

## Первая попытка решения
В первый раз мы пытались предсказывать точные значения pIC50, однако реальность оказалась крайне жестока и не порадовала нас достаточно высоким значением r2_score. Мы последовательно пробовали линейную регрессию, градиентный бустинг и рандом форест, однако наибольшее значение r2_score равнялось только 0.24, что мы сочли недостаточным и не остановились на достигнутом.

## Дальнейшие решения
В результате первого разочарования мы приняли решение: все молекулы, значение pIC50 которых было выше 6.0, мы отнесли к классу активных (1), а те, значение которых оказалось ниже, к классу неактивных (0). r2_score взлетел до 0.50 уже на baseline-решении, и мы продолжили наши изыскания. Ознакомиться с ними можно в ноутбуке ниже.
В частности, для появления новых дескрипторов мы просмотрели все молекулы датасета и вычленили из них наиболее часто встречающиеся фрагменты.

**Важно,** что по не зависящим от нас обстоятельствам некоторые молекулы были удалены из датасета.

## Подготовка тренировочного сета

In [1]:
from IPython.display import HTML

from tqdm import trange

import pandas as pd
from pandas import array
from pandas import DataFrame

import numpy as np
from numpy import zeros, array

from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem
import rdkit.Chem.AllChem as AllChem

from CGRtools import SDFRead
from CGRtools.utils import grid_depict, to_rdkit_molecule

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, balanced_accuracy_score
from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
data_train = pd.read_csv("train.csv", sep=',')

In [3]:
data_train

Unnamed: 0,SMILES,Class,(Cl)ccc,C(=O)Nc2c,cncc2ccccc
0,O=C(NCC1(C(=O)Nc2cncc3ccccc23)CCOc2ccc(Cl)cc21...,0,0,1,0
1,CO[C@]1(C(=O)Nc2cncc3ccccc23)CCOc2c(F)cc(F)cc21,0,0,1,0
2,COC[C@]1(C(=O)Nc2cncc3ccccc23)CCOc2cc(Cl)c(Cl)...,0,0,1,0
3,COC[C@@]1(C(=O)Nc2cncc3ccccc23)CCOc2cc(Cl)c(Cl...,0,0,1,0
4,Cc1nc2n(n1)CC(C(=O)NC(C(=O)Nc1cncc3ccccc13)c1c...,0,0,0,0
...,...,...,...,...,...
472,Cc1ccncc1NC(=O)C(C)c1cccc(C#N)c1,0,0,0,0
473,N#Cc1cccc(NC(=O)Nc2cncc(N)c2)c1,0,0,1,0
474,N#Cc1cccc(NC(=O)Nc2c[nH]c3ncccc23)c1,0,0,1,0
475,N#Cc1cccc(NC(=O)Cc2cncc3ccccc23)c1,0,0,0,0


In [4]:
train = data_train.drop(index=[154, 264, 268, 419, 46, 189])

In [5]:
print(train['Class'].value_counts())

0    383
1     88
Name: Class, dtype: int64


In [6]:
# создаем словарь из дескриторов структуры
ConstDescriptors = {"HeavyAtomCount": Descriptors.HeavyAtomCount,
                    "NHOHCount": Descriptors.NHOHCount,
                    "NOCount": Descriptors.NOCount,
                    "NumHAcceptors": Descriptors.NumHAcceptors,
                    "NumHDonors": Descriptors.NumHDonors,
                    "NumHeteroatoms": Descriptors.NumHeteroatoms,
                    "NumRotatableBonds": Descriptors.NumRotatableBonds,
                    "NumValenceElectrons": Descriptors.NumValenceElectrons,
                    "NumAromaticRings": Descriptors.NumAromaticRings,
                    "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
                    "RingCount": Descriptors.RingCount}

# создаем словарь из физико-химических дескрипторов                            
PhisChemDescriptors = {"MW": Descriptors.MolWt,
                       "LogP": Descriptors.MolLogP,
                       "MR": Descriptors.MolMR,
                       "TPSA": Descriptors.TPSA}

# объединяем все дескрипторы в один словарь
descriptors = {}
descriptors.update(ConstDescriptors)
descriptors.update(PhisChemDescriptors)
print(f"Количество дескрипторов в словаре: {len(descriptors)}")


# функция для генерации дескрипторов из молекул
def mol_dsc_calc(mols): 
    return DataFrame({k: f(m) for k, f in descriptors.items()} 
             for m in mols)

# оформляем sklearn трансформер для использования в конвеерном моделировании (sklearn Pipeline)
descriptors_transformer = FunctionTransformer(mol_dsc_calc, validate=False)

Количество дескрипторов в словаре: 15


In [7]:
molecules = [mol for mol in Chem.SDMolSupplier("DOCK_FOR_DESCR.sdf") if mol is not None]
print(f'Количество молекул = {len(molecules)}')

Количество молекул = 471


In [8]:
X = descriptors_transformer.transform(molecules)

In [9]:
data_sdf = SDFRead('DOCK_FOR_DESCR.sdf').read()

In [10]:
dock_score = [float(x.meta['r_i_docking_score']) for x in data_sdf]

In [11]:
col1 = train['(Cl)ccc']

In [12]:
col2 = train['C(=O)Nc2c']

In [13]:
def calc_morgan(mols):
    """ генерация молекулярных отпечатков по методу Моргана с радиусом 2
    """
    for_df = []
    for m in mols:
        arr = zeros((1,), dtype='float32')
        DataStructs.ConvertToNumpyArray(AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048), arr)
        for_df.append(arr)
    return DataFrame(for_df)

In [14]:
morgan_transformer = FunctionTransformer(calc_morgan, validate=False)
dddf= pd.DataFrame(X)
molecules_csv = train

In [15]:
moleculesSMILES = list(molecules_csv['SMILES'])
M = morgan_transformer.transform(molecules)

In [16]:
scaler = StandardScaler()
scaler.fit(X.values)
X_norm_SS = DataFrame(scaler.transform(X.values), index=X.index, columns=X.columns)
X_norm_SS

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA
0,1.662233,0.974558,1.079315,1.005237,1.131169,2.397814,0.475307,1.817429,-0.220817,2.628334,1.558548,1.915834,-0.020747,1.323411,0.736866
1,-0.194842,-0.387510,-0.401337,-0.181396,-0.373861,0.000000,-0.784747,-0.181356,-0.220817,0.918465,0.368797,-0.320091,-0.164002,-0.573499,-0.525873
2,-0.040085,-0.387510,-0.401337,-0.181396,-0.373861,0.000000,-0.364729,-0.004993,-0.220817,0.918465,0.368797,0.224749,0.854858,0.081909,-0.525873
3,-0.040085,-0.387510,-0.401337,-0.181396,-0.373861,0.000000,-0.364729,-0.004993,-0.220817,0.918465,0.368797,0.224749,0.854858,0.081909,-0.525873
4,0.888452,0.974558,1.079315,1.005237,1.131169,0.959126,0.055289,0.818036,1.095699,0.918465,1.558548,0.894060,-0.096636,0.903969,1.088668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466,-1.278135,-0.387510,-0.894888,-0.774713,-0.373861,-1.438688,-0.784747,-1.298324,-1.537333,-0.791405,-2.010704,-1.539374,-0.898350,-1.377010,-0.317759
467,-1.432891,3.698695,0.092213,-0.181396,2.636200,-0.479563,-1.204765,-1.474687,-1.537333,-0.791405,-2.010704,-1.679260,-1.682562,-1.583991,1.167931
468,-1.123379,2.336627,0.092213,-0.774713,2.636200,-0.479563,-1.204765,-1.239536,-0.220817,-0.791405,-0.820953,-1.400415,-0.827313,-1.258090,0.768493
469,-0.968623,-0.387510,-0.894888,-0.774713,-0.373861,-1.438688,-0.784747,-1.121960,-0.220817,-0.791405,-0.820953,-1.283930,-0.628411,-1.024243,-0.317759


In [17]:
X_norm_SS['(Cl)ccc'] = col1

In [18]:
X_norm_SS['C(=O)Nc2c'] = col2

In [19]:
Y = train['Class']

In [20]:
Y

0      0
1      0
2      0
3      0
4      0
      ..
472    0
473    0
474    0
475    0
476    0
Name: Class, Length: 471, dtype: int64

In [21]:
normal_descriptors_transformer = Pipeline([('gen', descriptors_transformer), ('norm', scaler)])

X_norm_SS = normal_descriptors_transformer.fit_transform(molecules)
X_norm_SS

array([[ 1.66223262,  0.97455845,  1.07931478, ..., -0.0207472 ,
         1.32341071,  0.73686577],
       [-0.19484166, -0.38750989, -0.40133744, ..., -0.16400218,
        -0.57349926, -0.52587295],
       [-0.04008547, -0.38750989, -0.40133744, ...,  0.85485846,
         0.08190891, -0.52587295],
       ...,
       [-1.1233788 ,  2.33662678,  0.0922133 , ..., -0.8273127 ,
        -1.25809018,  0.76849281],
       [-0.96862261, -0.38750989, -0.89488818, ..., -0.62841086,
        -1.02424286, -0.31775924],
       [-1.27813499, -0.38750989, -1.38843892, ..., -0.50634925,
        -1.23065856, -1.24665702]])

In [22]:
kf = KFold(n_splits=5, random_state=1, shuffle=True)

In [23]:
type(X_norm_SS)

numpy.ndarray

In [24]:
X_norm_SS = pd.DataFrame(X_norm_SS)

In [25]:
type(X_norm_SS)

pandas.core.frame.DataFrame

## Градиентный бустинг

In [26]:
arr = array(Y) > 5.

In [28]:
clf = GradientBoostingClassifier(n_estimators=500, learning_rate=.1, max_depth=5, random_state=1488)

pred = cross_val_predict(clf, X_norm_SS.fillna(0.), Y, cv=kf)
balanced_accuracy_score(Y, pred)

0.6020353667220508

In [32]:
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
    }
#passing the scoring function in the GridSearchCV
clf = GridSearchCV(GradientBoostingClassifier(), parameters, refit=False, cv=2, n_jobs=-1)

clf.fit(X_norm_SS.fillna(0.), Y)

print(clf.best_params_)
print(clf.best_score_)

{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'loss': 'deviance', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.24545454545454548, 'n_estimators': 10, 'subsample': 0.95}
0.8259015506671474


## Обработка тестовых данных

In [88]:
data_test = pd.read_csv("test.csv", sep=',')

In [89]:
data_test

Unnamed: 0,SMILES,(Cl)ccc,C(=O)Nc2c,Docking_score
0,CC(C)C[C@H](NC(=O)OCc1ccccc1)C(=O)N[C@@H](CC1C...,0,0,-6.561
1,CC(C)(C)OC(=O)NCCCNC(=O)C(c1cccnc1)N(C(=O)c1co...,0,0,-3.803
2,CC(C)(C)OC(=O)CNC(=O)C(c1cccnc1)N(C(=O)c1cocn1...,0,0,-1.833
3,CCCCc1nc(-c2nccn2CCc2ccccn2)c[nH]1,0,0,-4.956
4,O=C(c1cc(=O)[nH]c2ccccc12)N1CCN(c2cccc(C(F)(F)...,0,0,-7.136
...,...,...,...,...
94,Cc1nnc(C2Cc3ccccc3CN2C(=O)CCl)o1,0,0,-5.214
95,O=C(CCl)N1CCN(Cc2cc(O)cc(Cl)c2)CC1,0,0,-6.758
96,O=C(CCl)Nc1cccc(N2CCCC2=O)c1,0,0,-6.336
97,O=C(CCl)N1CCN(S(=O)(=O)c2c(F)cccc2F)CC1,0,0,-6.879


In [90]:
# создаем словарь из дескриторов структуры
ConstDescriptors = {"HeavyAtomCount": Descriptors.HeavyAtomCount,
                    "NHOHCount": Descriptors.NHOHCount,
                    "NOCount": Descriptors.NOCount,
                    "NumHAcceptors": Descriptors.NumHAcceptors,
                    "NumHDonors": Descriptors.NumHDonors,
                    "NumHeteroatoms": Descriptors.NumHeteroatoms,
                    "NumRotatableBonds": Descriptors.NumRotatableBonds,
                    "NumValenceElectrons": Descriptors.NumValenceElectrons,
                    "NumAromaticRings": Descriptors.NumAromaticRings,
                    "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
                    "RingCount": Descriptors.RingCount}

# создаем словарь из физико-химических дескрипторов                            
PhisChemDescriptors = {"MW": Descriptors.MolWt,
                       "LogP": Descriptors.MolLogP,
                       "MR": Descriptors.MolMR,
                       "TPSA": Descriptors.TPSA}

# объединяем все дескрипторы в один словарь
descriptors = {}
descriptors.update(ConstDescriptors)
descriptors.update(PhisChemDescriptors)
print(f"Количество дескрипторов в словаре: {len(descriptors)}")


# функция для генерации дескрипторов из молекул
def mol_dsc_calc(mols): 
    return DataFrame({k: f(m) for k, f in descriptors.items()} 
             for m in mols)

# оформляем sklearn трансформер для использования в конвеерном моделировании (sklearn Pipeline)
descriptors_transformer = FunctionTransformer(mol_dsc_calc, validate=False)

Количество дескрипторов в словаре: 15


In [91]:
molecules_test = [mol for mol in Chem.SDMolSupplier("test.sdf") if mol is not None]
print(f'Количество молекул = {len(molecules_test)}')

Количество молекул = 99


In [92]:
X_test = descriptors_transformer.transform(molecules_test)

In [93]:
col1_test = data_test['(Cl)ccc']

In [94]:
col2_test = data_test['C(=O)Nc2c']

In [95]:
col3_test = data_test['Docking_score']

In [96]:
def calc_morgan(mols):
    """ генерация молекулярных отпечатков по методу Моргана с радиусом 2
    """
    for_df = []
    for m in mols:
        arr = zeros((1,), dtype='float32')
        DataStructs.ConvertToNumpyArray(AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048), arr)
        for_df.append(arr)
    return DataFrame(for_df)

In [97]:
X_test['Docking_score'] = col3_test

In [98]:
X_test

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA,Docking_score
0,33,5,11,7,5,12,11,184,1,1,2,485.559,0.54470,118.2255,171.13,-6.561
1,39,2,10,7,2,10,9,208,3,0,3,535.645,4.78620,147.2959,126.66,-3.803
2,36,1,9,7,1,9,7,190,3,0,3,492.576,4.21310,134.2872,114.63,-1.833
3,22,1,5,4,1,5,7,114,3,0,3,295.390,3.25350,86.3617,59.39,-4.956
4,29,1,5,3,1,8,2,150,3,1,4,401.388,3.50930,104.1292,56.41,-7.136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,20,0,5,4,0,6,2,104,2,1,3,291.738,2.24282,73.1920,59.23,-5.214
95,19,1,4,3,1,6,3,104,1,1,2,303.189,1.92860,75.6938,43.78,-6.758
96,17,1,4,2,1,5,3,90,1,1,2,252.701,1.99070,67.3787,49.41,-6.336
97,21,0,5,3,0,9,3,116,1,1,2,338.763,1.03650,72.4168,57.69,-6.879


In [99]:
morgan_transformer = FunctionTransformer(calc_morgan, validate=False)
dddf = pd.DataFrame(X_test)
molecules_csv_test = data_test

In [100]:
moleculesSMILES = list(molecules_csv_test['SMILES'])
M = morgan_transformer.transform(molecules_test)

In [101]:
scaler_test = StandardScaler()
scaler_test.fit(X_test.values)
X_norm_SS_test = DataFrame(scaler_test.transform(X_test.values), index=X_test.index, columns=X_test.columns)
X_norm_SS_test

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA,Docking_score
0,0.598795,4.247829,2.796119,1.656702,4.540862,2.538087,2.317248,1.007578,-1.877790,0.852493,-1.60783,0.878796,-2.545028,0.313645,3.745387,-0.961395
1,1.490235,0.962399,2.246881,1.656702,1.046986,1.459672,1.505186,1.704319,0.068820,-0.835443,-0.53954,1.468778,0.613743,1.494269,2.019265,1.220587
2,1.044515,-0.132745,1.697644,1.656702,-0.117639,0.920465,0.693124,1.181763,0.068820,-0.835443,-0.53954,0.961452,0.186938,0.965952,1.552315,2.779145
3,-1.035511,-0.132745,-0.499307,-0.221315,-0.117639,-1.236364,0.693124,-1.024586,0.068820,-0.835443,-0.53954,-1.361279,-0.527704,-0.980425,-0.591849,0.308395
4,0.004502,-0.132745,-0.499307,-0.847321,-0.117639,0.381258,-1.337032,0.020527,0.068820,0.852493,0.52875,-0.112687,-0.337202,-0.258841,-0.707519,-1.416304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,-1.332657,-1.227888,-0.499307,-0.221315,-1.282264,-0.697157,-1.337032,-1.314895,-0.904485,0.852493,-0.53954,-1.404297,-1.280387,-1.515280,-0.598060,0.104279
95,-1.481231,-0.132745,-1.048545,-0.847321,-0.117639,-0.697157,-0.931001,-1.314895,-1.877790,0.852493,-1.60783,-1.269411,-1.514396,-1.413676,-1.197758,-1.117250
96,-1.778377,-0.132745,-1.048545,-1.473327,-0.117639,-1.236364,-0.931001,-1.721327,-1.877790,0.852493,-1.60783,-1.864129,-1.468149,-1.751373,-0.979227,-0.783387
97,-1.184084,-1.227888,-0.499307,-0.847321,-1.282264,0.920465,-0.931001,-0.966524,-1.877790,0.852493,-1.60783,-0.850371,-2.178770,-1.546763,-0.657835,-1.212979


In [102]:
X_norm_SS_test['(Cl)ccc'] = col1

In [103]:
X_norm_SS_test['C(=O)Nc2c'] = col2

In [104]:
X_norm_SS_test

Unnamed: 0,HeavyAtomCount,NHOHCount,NOCount,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumValenceElectrons,NumAromaticRings,NumAliphaticHeterocycles,RingCount,MW,LogP,MR,TPSA,Docking_score,(Cl)ccc,C(=O)Nc2c
0,0.598795,4.247829,2.796119,1.656702,4.540862,2.538087,2.317248,1.007578,-1.877790,0.852493,-1.60783,0.878796,-2.545028,0.313645,3.745387,-0.961395,0.0,1.0
1,1.490235,0.962399,2.246881,1.656702,1.046986,1.459672,1.505186,1.704319,0.068820,-0.835443,-0.53954,1.468778,0.613743,1.494269,2.019265,1.220587,0.0,1.0
2,1.044515,-0.132745,1.697644,1.656702,-0.117639,0.920465,0.693124,1.181763,0.068820,-0.835443,-0.53954,0.961452,0.186938,0.965952,1.552315,2.779145,0.0,1.0
3,-1.035511,-0.132745,-0.499307,-0.221315,-0.117639,-1.236364,0.693124,-1.024586,0.068820,-0.835443,-0.53954,-1.361279,-0.527704,-0.980425,-0.591849,0.308395,0.0,1.0
4,0.004502,-0.132745,-0.499307,-0.847321,-0.117639,0.381258,-1.337032,0.020527,0.068820,0.852493,0.52875,-0.112687,-0.337202,-0.258841,-0.707519,-1.416304,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,-1.332657,-1.227888,-0.499307,-0.221315,-1.282264,-0.697157,-1.337032,-1.314895,-0.904485,0.852493,-0.53954,-1.404297,-1.280387,-1.515280,-0.598060,0.104279,0.0,0.0
95,-1.481231,-0.132745,-1.048545,-0.847321,-0.117639,-0.697157,-0.931001,-1.314895,-1.877790,0.852493,-1.60783,-1.269411,-1.514396,-1.413676,-1.197758,-1.117250,0.0,0.0
96,-1.778377,-0.132745,-1.048545,-1.473327,-0.117639,-1.236364,-0.931001,-1.721327,-1.877790,0.852493,-1.60783,-1.864129,-1.468149,-1.751373,-0.979227,-0.783387,0.0,0.0
97,-1.184084,-1.227888,-0.499307,-0.847321,-1.282264,0.920465,-0.931001,-0.966524,-1.877790,0.852493,-1.60783,-0.850371,-2.178770,-1.546763,-0.657835,-1.212979,0.0,0.0


In [105]:
normal_descriptors_transformer_test = Pipeline([('gen', descriptors_transformer), ('norm', scaler_test)])

X_norm_SS_test = normal_descriptors_transformer_test.fit_transform(molecules_test)
X_norm_SS_test

array([[ 0.59879543,  4.24782919,  2.79611919, ..., -2.54502765,
         0.31364547,  3.74538727],
       [ 1.49023524,  0.9623988 ,  2.2468815 , ...,  0.61374276,
         1.49426868,  2.01926502],
       [ 1.04451533, -0.13274466,  1.6976438 , ...,  0.18693825,
         0.96595214,  1.55231537],
       ...,
       [-1.7783774 , -0.13274466, -1.0485447 , ..., -1.46814871,
        -1.75137298, -0.97922709],
       [-1.18408419, -1.22788813, -0.499307  , ..., -2.17876966,
        -1.54676286, -0.65783531],
       [-1.6298041 , -1.22788813, -0.499307  , ..., -2.34015255,
        -1.62957186, -0.65783531]])

In [111]:
type(X_norm_SS_test)

numpy.ndarray

## Предсказание тестовых данных

In [124]:
best_params = {
    'criterion': ['friedman_mse'], 
    'learning_rate': [0.2], 
    'loss': ['deviance'], 
    'max_depth': [8], 
    'max_features': ['sqrt'], 
    'min_samples_leaf': [0.1], 
    'min_samples_split': [0.24545454545454548], 
    'n_estimators': [10], 
    'subsample': [0.95],
}

In [129]:
clf_best = GradientBoostingClassifier()
clf_best.set_params(**clf.best_params_)

GradientBoostingClassifier(learning_rate=0.2, max_depth=8, max_features='sqrt',
                           min_samples_leaf=0.1,
                           min_samples_split=0.24545454545454548,
                           n_estimators=10, subsample=0.95)

In [137]:
clf_best.fit(X_norm_SS.fillna(0.), Y)

GradientBoostingClassifier(learning_rate=0.2, max_depth=8, max_features='sqrt',
                           min_samples_leaf=0.1,
                           min_samples_split=0.24545454545454548,
                           n_estimators=10, subsample=0.95)

In [138]:
pred_prob_test = clf_best.predict_proba(X_norm_SS_test)

In [139]:
preds = np.argmax(pred_prob_test, axis=1)

In [140]:
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])