In [2]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, ParameterGrid, RepeatedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, balanced_accuracy_score

##### **Descriptor Normalization**
ML methods used in chemoinformatics are sometimes sensitive to the magnitude of feature values, so it is often important to normalize them. Normalization remaps the values of every feature into one common range (for example, [0, 1]), so that all features used in machine learning are on the same scale.

In [4]:
import pandas as pd

In [6]:
# Load the precomputed ISIDA fragment descriptor table; the first CSV column
# is used as the row index.
X = pd.read_csv(
    'X_ISIDAfragmentor.csv',
    index_col=0,
)
X.head()

Unnamed: 0,"(C-C),(C-H),(C-H),(C-H),xC","(C-C-Cl),(C-C-Cl),(C-C-Cl),xC","(C-C),(C-Cl),(C-Cl),(C-Cl),xC","(C-C-H),(C-C-H),(C-C-H),xC","(Cl-C),xCl","(Cl-C-C),(Cl-C-Cl),(Cl-C-Cl),xCl","(Cl-C-C-H),(Cl-C-C-H),(Cl-C-C-H),xCl","(H-C),xH","(H-C-C),(H-C-H),(H-C-H),xH","(H-C-C-Cl),(H-C-C-Cl),(H-C-C-Cl),xH",...,"(C-C*C*C),(C-C*C*N),(C-C*C-H),(C-C*C-H),(C-N-C-C),(C-N-C-C),(C-N-C-H),(C-N-C-H),(C-N-C-H),(C-N-C-H),xC","(C*C*C),(C*C*N),(C*C-H),(C*C-H),(C-C-H),(C-C-H),(C-C-N),xC","(C*C*C*C),(C*C*C-H),(C*C*N*C),(C-C-N-C),(C-C-N-C),xC","(C*C*C*N),(C*C*C*N),(C*C*C-H),(C*C*C-H),(C*C-C-H),(C*C-C-H),(C*C-C-N),xC","(C*C*C*C),(C*C*C-H),(C*C-C-H),(C*C-C-H),(C*C-C-N),(C*N*C*C),(C*N*C-H),xC","(H-C-C-H),(H-C-C-H),(H-C-C-N),(H-C-C-N),(H-C-C=O),(H-C-N-C),(H-C-N-C),xH","(H-C-C-C),(H-C-C-C),(H-C-C-H),(H-C-C-H),(H-C-C-H),(H-C-C-H),(H-C-C-N),(H-C-C=O),xH","(H-C-C*C),(H-C-C*C),(H-C-C-C),(H-C-C-H),(H-C-C-O),(H-C-N-C),(H-C-N-H),xH","(H-C-C*C),(H-C-C*C),(H-C-C-C),(H-C-C-H),(H-C-C-O),xH","(H-C-C-C),(H-C-C-C),(H-C-C-H),(H-C-C-H),(H-C-C-H),(H-C-C-N),(H-C-O-H),xH"
0,1.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [8]:
# Fit a min-max scaler on the descriptor table (`fit` returns the scaler itself).
scaler = MinMaxScaler().fit(X)

# Wrap the scaled values back into a DataFrame that keeps the original row
# index and column labels.
X_norm = pd.DataFrame(
    scaler.transform(X),
    index=X.index,
    columns=X.columns,
)
X_norm.head()

Unnamed: 0,"(C-C),(C-H),(C-H),(C-H),xC","(C-C-Cl),(C-C-Cl),(C-C-Cl),xC","(C-C),(C-Cl),(C-Cl),(C-Cl),xC","(C-C-H),(C-C-H),(C-C-H),xC","(Cl-C),xCl","(Cl-C-C),(Cl-C-Cl),(Cl-C-Cl),xCl","(Cl-C-C-H),(Cl-C-C-H),(Cl-C-C-H),xCl","(H-C),xH","(H-C-C),(H-C-H),(H-C-H),xH","(H-C-C-Cl),(H-C-C-Cl),(H-C-C-Cl),xH",...,"(C-C*C*C),(C-C*C*N),(C-C*C-H),(C-C*C-H),(C-N-C-C),(C-N-C-C),(C-N-C-H),(C-N-C-H),(C-N-C-H),(C-N-C-H),xC","(C*C*C),(C*C*N),(C*C-H),(C*C-H),(C-C-H),(C-C-H),(C-C-N),xC","(C*C*C*C),(C*C*C-H),(C*C*N*C),(C-C-N-C),(C-C-N-C),xC","(C*C*C*N),(C*C*C*N),(C*C*C-H),(C*C*C-H),(C*C-C-H),(C*C-C-H),(C*C-C-N),xC","(C*C*C*C),(C*C*C-H),(C*C-C-H),(C*C-C-H),(C*C-C-N),(C*N*C*C),(C*N*C-H),xC","(H-C-C-H),(H-C-C-H),(H-C-C-N),(H-C-C-N),(H-C-C=O),(H-C-N-C),(H-C-N-C),xH","(H-C-C-C),(H-C-C-C),(H-C-C-H),(H-C-C-H),(H-C-C-H),(H-C-C-H),(H-C-C-N),(H-C-C=O),xH","(H-C-C*C),(H-C-C*C),(H-C-C-C),(H-C-C-H),(H-C-C-O),(H-C-N-C),(H-C-N-H),xH","(H-C-C*C),(H-C-C*C),(H-C-C-C),(H-C-C-H),(H-C-C-O),xH","(H-C-C-C),(H-C-C-C),(H-C-C-H),(H-C-C-H),(H-C-C-H),(H-C-C-N),(H-C-O-H),xH"
0,0.142857,1.0,1.0,1.0,0.75,1.0,1.0,0.054545,0.142857,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.054545,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.018182,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.163636,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.145455,0.285714,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from CIMtools.preprocessing import Fragmentor
from CGRtools.files import SDFRead

# Read every record of the SDF file into memory. Using the reader as a context
# manager closes the underlying file handle as soon as reading is done, instead
# of leaving it open for the lifetime of the kernel.
with SDFRead("logBB.sdf") as sdf_reader:
    data = sdf_reader.read()

# Fragment descriptor generator; `fragment_type` and `max_length` are passed
# through unchanged (see the CIMtools Fragmentor documentation for their meaning).
fragmentor = Fragmentor(fragment_type=9, max_length=4)

##### **Modelling pipelines**

In [20]:
# Descriptor pipeline: fragment the structures, then scale the resulting
# descriptors with the `scaler` object created earlier (it is refitted here).
desc = Pipeline(steps=[('fr', fragmentor), ('scale', scaler)])

# Fit both steps on the loaded structures and keep the transformed matrix.
X = desc.fit_transform(data)

In [24]:
# Base regressor for the grid search. A fixed `random_state` makes the
# bootstrap sampling and feature selection repeatable, so a fresh
# Restart & Run All reproduces the same models and scores.
les = RandomForestRegressor(n_estimators=100, n_jobs=10, random_state=42)

In [28]:
# Hyper-parameter grid explored by the grid search:
# 3 `max_features` options x 4 forest sizes = 12 combinations.
params = dict(
    max_features=('sqrt', 'log2', None),
    n_estimators=(10, 50, 100, 150),
)

In [30]:
# 5-fold cross-validation repeated 5 times. RepeatedKFold reshuffles the data
# on every repetition, so a fixed `random_state` is needed for the splits
# (and therefore the reported scores) to be reproducible between runs.
cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

# Exhaustive search over `params` using the estimator's default scorer
# (no `scoring` argument is given).
gscv = GridSearchCV(estimator=les, param_grid=params, cv=cv)

In [38]:
from collections import defaultdict

In [40]:
# Group the logBB annotations by structure: identical structures share one key,
# so a key with several values is a structure that occurs more than once.
values = defaultdict(list)
for molecule in data:
    values[molecule].append(float(molecule.meta['logBB']))

# Split the structures into those measured once (kept with their value) and
# those with several measurements (reported and set aside).
duplicates = []
uniques = []
for molecule, measurements in values.items():
    if len(measurements) <= 1:
        uniques.append((molecule, measurements[0]))
        continue
    print(molecule, measurements)
    duplicates.append(molecule)

In [43]:
# Rebuild the descriptor matrix from the de-duplicated structures so that the
# rows of X line up one-to-one with the targets in Y. The X computed earlier
# covers every record in `data`, including structures that were set aside as
# duplicates, so its row count differs from len(Y) whenever duplicates exist.
unique_molecules = [molecule for molecule, _ in uniques]
X = desc.fit_transform(unique_molecules)

# Target values, in the same order as the rows of X.
Y = [value for _, value in uniques]

In [45]:
# Run the grid search: every parameter combination is cross-validated on
# the descriptor matrix X against the targets Y.
gscv.fit(X, y=Y)

In [47]:
import pickle
# Persist the fitted grid-search object to disk for later reuse.
with open('gscv_models.pkl', 'wb') as model_file:
    pickle.dump(gscv, model_file)

In [49]:
# Mean cross-validated test score of each parameter combination in the grid
# (one entry per combination).
gscv.cv_results_['mean_test_score']

array([0.43490578, 0.45687087, 0.45276739, 0.46036661, 0.36303179,
       0.40730212, 0.40835753, 0.41104391, 0.48211503, 0.5015773 ,
       0.51080183, 0.51271106])

In [51]:
# Parameter combination selected by the grid search.
gscv.best_params_

{'max_features': None, 'n_estimators': 150}

##### Classification modelling

In [57]:
# Binarize the regression target: True for values strictly above 0,
# False otherwise.
Y_c = [value > 0 for value in Y]