In [2]:
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metricsimport pandas as pd
import numpy as np
import scipy
import multiprocessing
import pickle


In [3]:
multiprocessing.cpu_count()

128

In [4]:
def prepareDataFrame(df, categorical_features):
    
    for ft in categorical_features:
        
        cat_ft = pd.get_dummies(df[ft], prefix='cat')
        df = pd.concat([df,cat_ft], axis=1)
        df = df.drop(ft, 1)
    
        
    print("Dataframe after dummy variables:")
    print(df.shape)

    Y = df[['ba']] 
    Y = Y.values.ravel()
    
    X = df.drop('ba', axis=1)
    
    return X, Y

In [5]:
def evaluate(model, test_features, test_labels, model_type):
    
    predictions = model.predict(test_features)
    
    pred_dict = {'observed':test_labels,'predicted':predictions}
    
    pred_df = pd.DataFrame(pred_dict)
    pred_df.to_csv("work/predictions/"+model_type+".csv", encoding="utf-8", index=False)
    
    pearson_r = scipy.stats.pearsonr(test_labels, predictions)
    print('Pearson R: ', pearson_r)
    print('Pearson R squared: ', pearson_r[0]**2)

    print('R Squared Error:', metrics.r2_score(test_labels, predictions))
    print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, predictions))
    print('Mean Squared Error:', metrics.mean_squared_error(test_labels, predictions))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test_labels, predictions)))

In [8]:
def train_test(model_type):
    
    categorical_features = []
    
    if(model_type == "first_approach_unbalanced"):
        
        train = pd.read_csv("work/training/first-approach/train-random-unbalanced.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/first-approach/test-random-unbalanced.csv", sep=",", encoding="utf-8")
        
    elif(model_type == "first_approach_balanced"):
        
        train = pd.read_csv("work/training/first-approach/train-random-balanced.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/first-approach/test-random-balanced.csv", sep=",", encoding="utf-8")
                
    elif(model_type == "second_approach_pl_split_random"):
     
        train = pd.read_csv("work/training/second-approach/train-random.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-random.csv", sep=",", encoding="utf-8")
        
    elif(model_type == "second_approach_pl_split_protein"):
     
        train = pd.read_csv("work/training/second-approach/train-protein.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-protein.csv", sep=",", encoding="utf-8")
        
    elif(model_type == "second_approach_pl_split_pocket"):
     
        train = pd.read_csv("work/training/second-approach/train-pocket.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-pocket.csv", sep=",", encoding="utf-8")
                
    elif(model_type == "second_approach_pl_split_ligand_weight"):
     
        train = pd.read_csv("work/training/second-approach/train-ligand-weight.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-ligand-weight.csv", sep=",", encoding="utf-8")
                
    elif(model_type == "second_approach_pl_split_ligand_tpsa"):
     
        train = pd.read_csv("work/training/second-approach/train-ligand-tpsa.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-ligand-tpsa.csv", sep=",", encoding="utf-8")
                
    elif(model_type == "second_approach_pl_split_ligand_volume"):
     
        train = pd.read_csv("work/training/second-approach/train-ligand-volume.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-ligand-volume.csv", sep=",", encoding="utf-8")
            
        
    print(train.shape)
    print(test.shape)
    
    train_X, train_Y = prepareDataFrame(train, categorical_features)
    test_X, test_Y = prepareDataFrame(test, categorical_features)
    
    num_of_features = round(train_X.shape[1]/2)
    
    regressor = RandomForestRegressor(n_jobs=-1, max_features=num_of_features, n_estimators=500, random_state = 123, criterion= 'mse') 

    cv_scores = cross_validate(regressor, train_X, train_Y, cv=10, scoring=('r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'explained_variance'))
    
    print("CV score r2: ", end = ' ')
    print(*cv_scores['test_r2'], sep=', ')
    
    print("CV score neg_mean_absolute_error: ", end = ' ')
    print(*cv_scores['test_neg_mean_absolute_error'], sep=', ')
    
    print("CV score neg_mean_squared_error: ", end = ' ')
    print(*cv_scores['test_neg_mean_squared_error'], sep=', ')
    
    print("CV score neg_root_mean_squared_error: ", end = ' ')
    print(*cv_scores['test_neg_root_mean_squared_error'], sep=', ')
    
    print("CV score explained_variance: ", end = ' ')
    print(*cv_scores['test_explained_variance'], sep=', ')

    

    print("CV score mean r2: ", end = ' ')
    print(cv_scores['test_r2'].mean())
    
    print("CV score mean neg_mean_absolute_error: ", end = ' ')
    print(cv_scores['test_neg_mean_absolute_error'].mean())
    
    print("CV score mean neg_mean_squared_error: ", end = ' ')
    print(cv_scores['test_neg_mean_squared_error'].mean())
    
    print("CV score mean neg_root_mean_squared_error: ", end = ' ')
    print(cv_scores['test_neg_root_mean_squared_error'].mean())
    
    print("CV score mean explained_variance: ", end = ' ')
    print(cv_scores['test_explained_variance'].mean())
    
    regressor.fit(train_X, train_Y)
    
    print("Evaluation with the test set: ")
    evaluate(regressor, test_X, test_Y, model_type)
    
    pickle.dump(regressor, open("work/models/"+model_type+".mdl", 'wb'))
    
    return cv_scores, regressor

In [9]:
cv_scores, reg_model = train_test("second_approach_pl_split_random")

(6738, 126)
(1683, 126)
Dataframe after dummy variables:
(6738, 126)
Dataframe after dummy variables:
(1683, 126)
CV score r2:  0.6864927396581418, 0.7230096353021711, -6.652715506560543, 0.6810978703360895, 0.02785232098244883, 0.3462333396971847, 0.4966275862922771, 0.45688684280651026, 0.6265104516978208, -0.3929889006452123
CV score neg_mean_absolute_error:  -0.6496356083086062, -0.610859050445104, -2.2038, -0.516787537091989, -1.0997937685459924, -0.6054940652818988, -0.7636673590504447, -0.7425961424332339, -0.6838062407132238, -1.2818517087667167
CV score neg_mean_squared_error:  -0.6444556929376861, -0.6769075357270041, -6.323589446172102, -0.477021870979229, -1.920891190919877, -0.6588598335311567, -1.0342933297922814, -1.0504193262314534, -0.7958237343239234, -3.310004141396732
CV score neg_root_mean_squared_error:  -0.8027799779128065, -0.8227439065268147, -2.5146748191708808, -0.6906676993889529, -1.3859621895707968, -0.8117018131870575, -1.017002128705875, -1.0248996664217

In [10]:
cv_scores, reg_model = train_test("second_approach_pl_split_pocket")

(6969, 126)
(1452, 126)
Dataframe after dummy variables:
(6969, 126)
Dataframe after dummy variables:
(1452, 126)
CV score r2:  0.40352648399338975, 0.15004876661415045, -2.444870464331055, 0.6380331700206532, 0.5623689392424559, 0.5685172709419508, 0.6115883982638934, 0.5354296989110889, 0.4073982337312245, -0.8647858461177498
CV score neg_mean_absolute_error:  -0.8502530846484945, -1.0576436154949787, -1.370372453371592, -0.5199876614060256, -0.5471015781922524, -0.5056846484935444, -0.809379340028699, -0.732271736011477, -0.773889526542324, -1.857600862068964
CV score neg_mean_squared_error:  -1.0828140806886661, -1.6278896618651364, -2.714491185423242, -0.4630004360975601, -0.5415906003443327, -0.48992861985652914, -1.0805739338307108, -0.928983810215206, -1.1402829063414628, -4.987456429712637
CV score neg_root_mean_squared_error:  -1.040583528933966, -1.275887793603002, -1.6475712990408766, -0.6804413539002168, -0.7359283934897014, -0.6999490123262759, -1.0395065819083162, -0.963

In [11]:
cv_scores, reg_model = train_test("second_approach_pl_split_protein")

(6731, 126)
(1690, 126)
Dataframe after dummy variables:
(6731, 126)
Dataframe after dummy variables:
(1690, 126)
CV score r2:  0.6383132787298564, 0.45468208033275037, -2.850677814763661, -0.028875456807887945, 0.5310062580744853, -0.32825214452819984, -0.10058954299371736, 0.7626687817875597, -0.5663566461199243, -0.421139618984947
CV score neg_mean_absolute_error:  -0.6113724035608309, -0.8063693907875182, -1.6113634472511145, -0.8639069836552764, -0.7976998514115907, -1.0365878157503696, -1.4525426448736984, -0.5226445765230315, -1.1843227340267473, -1.3759200594353638
CV score neg_mean_squared_error:  -0.6422017259940646, -1.0283579224368504, -4.507481132243688, -1.2173488287072838, -1.0660522686478473, -1.706298296344722, -3.182698005349177, -0.4692192693610701, -1.9956057137592895, -3.607807263774146
CV score neg_root_mean_squared_error:  -0.801374897282205, -1.014079840267447, -2.1230829310801047, -1.1033353201576046, -1.0324980719826296, -1.306253534481236, -1.7840117727608125

In [12]:
cv_scores, reg_model = train_test("second_approach_pl_split_ligand_weight")

(6770, 126)
(1651, 126)
Dataframe after dummy variables:
(6770, 126)
Dataframe after dummy variables:
(1651, 126)
CV score r2:  0.6855807941917444, 0.6246312530696403, -6.665967378545494, 0.6842139465079209, 0.16684011424503742, 0.3260868561029593, 0.4899033799026127, 0.3740334726258313, 0.596014438947789, -0.3777304492944369
CV score neg_mean_absolute_error:  -0.6569503692762197, -0.6918035450516997, -2.104182274741505, -0.5128215657311656, -1.0044132939438692, -0.6082936484490402, -0.8124691285081248, -0.7830590841949776, -0.7016977843426887, -1.2681456425406192
CV score neg_mean_squared_error:  -0.6567048375184648, -0.9053475932053209, -5.902417712319042, -0.4665993196454923, -1.6197473901920214, -0.6545250528803557, -1.1601021648449044, -1.145204088094535, -0.8448945116100457, -3.1848393920827145
CV score neg_root_mean_squared_error:  -0.8103732704861784, -0.9514975529160971, -2.4294891875287368, -0.6830807563132578, -1.2726929677624612, -0.8090272262911525, -1.077080389221206, -1.

In [13]:
cv_scores, reg_model = train_test("second_approach_pl_split_ligand_tpsa")

(6765, 126)
(1656, 126)
Dataframe after dummy variables:
(6765, 126)
Dataframe after dummy variables:
(1656, 126)
CV score r2:  0.6812975356879218, 0.7467927998838304, -7.334141156545673, 0.7052342130368852, 0.07410219848792465, 0.3804430116477693, 0.4766870230020295, 0.4213288473385215, 0.6444480114252992, -0.3785376838635197
CV score neg_mean_absolute_error:  -0.6378679468242255, -0.612768389955687, -2.287734416543574, -0.5092428360413583, -1.0779161004431315, -0.5785775147929, -0.8368772189349118, -0.7512982248520703, -0.6663340236686386, -1.2912275147928993
CV score neg_mean_squared_error:  -0.6454260849039885, -0.5947554046085672, -6.751518974239282, -0.45474102670605493, -1.867640483663219, -0.5996446553846159, -1.2229642511834335, -1.0989329454437853, -0.7531337987573963, -3.264678152485207
CV score neg_root_mean_squared_error:  -0.8033841452904011, -0.7712038670861079, -2.598368521638007, -0.6743448870615503, -1.3666164361894741, -0.7743672613073308, -1.1058771410891146, -1.048

In [14]:
cv_scores, reg_model = train_test("second_approach_pl_split_ligand_volume")

(6770, 126)
(1651, 126)
Dataframe after dummy variables:
(6770, 126)
Dataframe after dummy variables:
(1651, 126)
CV score r2:  0.6745252651858239, 0.7467564476362836, -6.859239410299979, 0.6899926270346375, -0.046141538504187896, 0.29848590942814146, 0.4283538856921103, 0.46351061775462266, 0.5627837018443873, -0.366856481829158
CV score neg_mean_absolute_error:  -0.6640853766617436, -0.6001867060561299, -2.1970289512555374, -0.4994644017725256, -1.13690457902511, -0.6056818316100442, -0.8129926144756275, -0.7476023633677991, -0.7140974889217144, -1.2177923190546533
CV score neg_mean_squared_error:  -0.6726583615361891, -0.6116785605908418, -6.236484095420961, -0.44743190599704546, -2.04183773116691, -0.6639595485376664, -1.2116663318168366, -1.0607439987001472, -0.9075057304579074, -2.88514153601182
CV score neg_root_mean_squared_error:  -0.8201575223920031, -0.7820988176636261, -2.4972953560644284, -0.6689035102292747, -1.4289288754752316, -0.8148371300681299, -1.1007571629641284, -

In [15]:
cv_scores, reg_model = train_test("first_approach_balanced")

(317513, 270)
(79377, 270)
Dataframe after dummy variables:
(317513, 270)
Dataframe after dummy variables:
(79377, 270)
CV score r2:  0.6702865699669326, 0.7755924157290957, -0.41850719681210746, 0.7175648925625067, 0.6547247847595441, 0.4054792806711177, 0.8339498275533521, 0.45286421697296775, 0.6747178518589102, -0.28418843549044337
CV score neg_mean_absolute_error:  -0.6462290375409425, -0.5046733497102549, -1.3607678193499633, -0.6270197978016447, -0.5789030077792824, -0.514250826745615, -0.41768061478378576, -0.8720773707914714, -0.6461129980158108, -1.1827276369248207
CV score neg_mean_squared_error:  -0.6694494218594105, -0.47798553228646995, -3.1842254417876084, -0.7262072160398109, -0.6486161140890073, -0.4992692951251927, -0.31036179222071764, -1.4030917928040065, -0.7081521136279174, -2.7970863965053057
CV score neg_root_mean_squared_error:  -0.8181988889380201, -0.6913649776250385, -1.784439811758191, -0.8521779251070817, -0.8053670679193478, -0.7065899059038366, -0.557101

In [None]:
cv_scores, reg_model = train_test("first_approach_unbalanced")

(847815, 278)
(211953, 278)
Dataframe after dummy variables:
(847815, 278)
Dataframe after dummy variables:
(211953, 278)


In [57]:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 800, num = 15)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', round(train_X.shape[1]/2), round(train_X.shape[1]/3), round(train_X.shape[1]/4)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 242, 285, 328, 371, 414, 457, 500, 542, 585, 628, 671, 714, 757, 800], 'max_features': ['auto', 'sqrt', 98, 65, 49], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [58]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=123, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_X, train_Y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt', 98, 65,
                                                         49],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 242, 285, 328,
                                                         371, 414, 457, 500,
                                                         542, 585, 628, 671,
                                                         714, 757, 800]},
                   rand

In [140]:
rf_random.best_params_

{'n_estimators': 457,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 60,
 'bootstrap': False}

In [61]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_X, test_Y)

Mean Absolute Error: 0.41645679037006417
Mean Squared Error: 0.32449833562823804
R Squared Error: 0.8611018175740554
Root Mean Squared Error: 0.5696475538683881


In [129]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 60, 80, 90],
    'max_features': [round(train_X.shape[1]/2), round(train_X.shape[1]/4), round(train_X.shape[1]/3)],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [300, 400, 500, 800]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

print(grid_search)

GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [None, 60, 80, 90],
                         'max_features': [160, 80, 107],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [300, 400, 500, 800]},
             verbose=2)


In [None]:
grid_search.fit(train_X, train_Y)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


In [137]:
grid_search.best_estimator_

RandomForestRegressor(max_depth=60, max_features=107, min_samples_leaf=4,
                      min_samples_split=5, n_estimators=300)

In [156]:
best_grid = grid_search.best_estimator_
evaluate(grid_search, test_X, test_Y)

Pearson R:  (0.9820782493152663, 0.0)
Pearson R squared:  0.9644776877781384
NumPy Pearson correlation:  [[1.         0.98207825]
 [0.98207825 1.        ]]
Mean Absolute Error: 0.15549328930528236
Mean Squared Error: 0.08323586487391017
R Squared Error: 0.9644602037287737
Root Mean Squared Error: 0.28850626487809616


In [136]:
print("done")

done
