In [72]:
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
import scipy
import multiprocessing
import pickle
from sklearn import metrics

In [73]:
multiprocessing.cpu_count()

128

In [74]:
def prepareDataFrame(df, categorical_features):
    
    for ft in categorical_features:
        
        cat_ft = pd.get_dummies(df[ft], prefix='cat')
        df = pd.concat([df,cat_ft], axis=1)
        df = df.drop(ft, 1)
    
        
    print("Dataframe after dummy variables:")
    print(df.shape)

    Y = df[['ba']] 
    Y = Y.values.ravel()
    
    X = df.drop('ba', axis=1)
    
    return X, Y

In [75]:
def evaluate(model, test_features, test_labels, model_type):
    
    predictions = model.predict(test_features)
    
    pred_dict = {'observed':test_labels,'predicted':predictions}
    
    pred_df = pd.DataFrame(pred_dict)
    pred_df.to_csv("work/predictions/"+model_type+".csv", encoding="utf-8", index=False)
    
    pearson_r = scipy.stats.pearsonr(test_labels, predictions)
    print('Pearson R: ', pearson_r)
    print('Pearson R squared: ', pearson_r[0]**2)

    print('R Squared Error:', metrics.r2_score(test_labels, predictions))
    print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, predictions))
    print('Mean Squared Error:', metrics.mean_squared_error(test_labels, predictions))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test_labels, predictions)))

In [76]:
def train_test(model_type):
    
    if(model_type == "first_approach_unbalanced"):
        
        train = pd.read_csv("work/training/first-approach/train-random-unbalanced.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/first-approach/test-random-unbalanced.csv", sep=",", encoding="utf-8")
        
        categorical_features = ["secStruct", "secStructSimple", "CysteineMutation", "GlycineMutation", "ProlineMutation", "ChargeGroupChange", "HydroGroupChange", "VanDerWaalsVolumeGroupChange",
                       "PloarityGroupChange", "PolarizabilityGroupChange",  "SSWTGroupChange", "AsaGroupChange", "DominantSS", "RuleOfFiveDescriptor"]
        
    elif(model_type == "first_approach_balanced"):
        
        train = pd.read_csv("work/training/first-approach/train-random-balanced.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/first-approach/test-random-balanced.csv", sep=",", encoding="utf-8")
        
        categorical_features = ["secStruct", "secStructSimple", "CysteineMutation", "GlycineMutation", "ProlineMutation", "ChargeGroupChange", "HydroGroupChange", "VanDerWaalsVolumeGroupChange",
                       "PloarityGroupChange", "PolarizabilityGroupChange",  "SSWTGroupChange", "AsaGroupChange", "DominantSS", "RuleOfFiveDescriptor"]  
        
    elif(model_type == "second_approach_pl_split_random"):
     
        train = pd.read_csv("work/training/second-approach/train-random.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-random.csv", sep=",", encoding="utf-8")
        
        categorical_features = ["RuleOfFiveDescriptor", "DominantSS"] 

    elif(model_type == "second_approach_pl_split_protein"):
     
        train = pd.read_csv("work/training/second-approach/train-protein.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-protein.csv", sep=",", encoding="utf-8")
        
        categorical_features = ["RuleOfFiveDescriptor", "DominantSS"]

    elif(model_type == "second_approach_pl_split_pocket"):
     
        train = pd.read_csv("work/training/second-approach/train-pocket.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-pocket.csv", sep=",", encoding="utf-8")
        
        categorical_features = ["RuleOfFiveDescriptor", "DominantSS"]
        
    elif(model_type == "second_approach_pl_split_ligand_weight"):
     
        train = pd.read_csv("work/training/second-approach/train-ligand-weight.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-ligand-weight.csv", sep=",", encoding="utf-8")
        
        categorical_features = ["RuleOfFiveDescriptor", "DominantSS"]
        
    elif(model_type == "second_approach_pl_split_ligand_tpsa"):
     
        train = pd.read_csv("work/training/second-approach/train-ligand-tpsa.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-ligand-tpsa.csv", sep=",", encoding="utf-8")
        
        categorical_features = ["RuleOfFiveDescriptor", "DominantSS"]
        
    elif(model_type == "second_approach_pl_split_ligand_volume"):
     
        train = pd.read_csv("work/training/second-approach/train-ligand-volume.csv", sep=",", encoding="utf-8")
        test = pd.read_csv("work/training/second-approach/test-ligand-volume.csv", sep=",", encoding="utf-8")
        
        categorical_features = ["RuleOfFiveDescriptor", "DominantSS"]
        
        
    print(train.shape)
    
    train_X, train_Y = prepareDataFrame(train, categorical_features)
    test_X, test_Y = prepareDataFrame(test, categorical_features)
    
    num_of_features = round(train_X.shape[1]/2)
    
    regressor = RandomForestRegressor(n_jobs=-1, max_features=num_of_features, n_estimators=500, random_state = 123, criterion= 'mse') 

    cv_scores = cross_validate(regressor, train_X, train_Y, cv=10, scoring=('r2', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'explained_variance'))
    
    print("CV score r2: ", end = ' ')
    print(*cv_scores['test_r2'], sep=', ')
    
    print("CV score neg_mean_absolute_error: ", end = ' ')
    print(*cv_scores['test_neg_mean_absolute_error'], sep=', ')
    
    print("CV score neg_mean_squared_error: ", end = ' ')
    print(*cv_scores['test_neg_mean_squared_error'], sep=', ')
    
    print("CV score neg_root_mean_squared_error: ", end = ' ')
    print(*cv_scores['test_neg_root_mean_squared_error'], sep=', ')
    
    print("CV score explained_variance: ", end = ' ')
    print(*cv_scores['test_explained_variance'], sep=', ')

    

    print("CV score mean r2: ", end = ' ')
    print(cv_scores['test_r2'].mean())
    
    print("CV score mean neg_mean_absolute_error: ", end = ' ')
    print(cv_scores['test_neg_mean_absolute_error'].mean())
    
    print("CV score mean neg_mean_squared_error: ", end = ' ')
    print(cv_scores['test_neg_mean_squared_error'].mean())
    
    print("CV score mean neg_root_mean_squared_error: ", end = ' ')
    print(cv_scores['test_neg_root_mean_squared_error'].mean())
    
    print("CV score mean explained_variance: ", end = ' ')
    print(cv_scores['test_explained_variance'].mean())
    
    regressor.fit(train_X, train_Y)
    
    print("Evaluation with the test set: ")
    evaluate(regressor, test_X, test_Y, model_type)
    
    pickle.dump(regressor, open("work/models/"+model_type+".mdl", 'wb'))
    
    return cv_scores, regressor

In [77]:
cv_scores, reg_model = train_test("second_approach_pl_split_random")

(6738, 190)
Dataframe after dummy variables:
(6738, 196)
Dataframe after dummy variables:
(1683, 196)
CV score r2:  0.6931258025525306, 0.7207699516224186, -6.519387188799307, 0.6642552861186667, 0.022891502602585767, 0.35062214382489443, 0.41775111182640134, 0.46006343564151186, 0.6192721013328832, -0.38916680139209214
CV score neg_mean_absolute_error:  -0.6476587537091997, -0.6178801186943627, -2.1974195845697317, -0.5277091988130573, -1.1052548961424322, -0.6015967359050447, -0.8182083086053422, -0.7459979228486645, -0.6952493313521536, -1.2848332838038634
CV score neg_mean_squared_error:  -0.6308205537091996, -0.6823808624332351, -6.213417632997022, -0.5022154344213656, -1.930693397448069, -0.6544368383976265, -1.1963630205341258, -1.0442755706231441, -0.8112470602080226, -3.3009221132838036
CV score neg_root_mean_squared_error:  -0.7942421253680767, -0.8260634736103728, -2.4926727889951823, -0.7086715984300243, -1.3894939357363416, -0.8089727055949579, -1.0937838088644967, -1.0218

In [78]:
cv_scores, reg_model = train_test("second_approach_pl_split_pocket")

(6969, 190)
Dataframe after dummy variables:
(6969, 196)
Dataframe after dummy variables:
(1452, 194)
CV score r2:  0.36426875175193174, 0.1138727762807511, -2.61393423838692, 0.5929485830271909, 0.5447921177146389, 0.5488211256434978, 0.5723306252474132, 0.532058366471192, 0.3940538131267032, -0.8472359600929313
CV score neg_mean_absolute_error:  -0.8788837876614063, -1.0783939741750355, -1.4077569583931129, -0.5518140602582501, -0.5648849354375904, -0.5200895265423251, -0.8581724533715955, -0.7438593974175036, -0.7843753228120515, -1.8471497126436764
CV score neg_mean_squared_error:  -1.1540809921377322, -1.697176602525107, -2.8477101639598263, -0.5206692104447632, -0.5633428071162135, -0.5122926790243909, -1.189790357934007, -0.9357253372740315, -1.1659602086657104, -4.940518443735625
CV score neg_root_mean_squared_error:  -1.074281616773615, -1.30275730760764, -1.6875159744310055, -0.72157411985517, -0.7505616611020134, -0.7157462392666768, -1.0907751179477863, -0.9673289705545014,

ValueError: X has 193 features, but DecisionTreeRegressor is expecting 195 features as input.

In [None]:
cv_scores, reg_model = train_test("second_approach_pl_split_protein")

In [None]:
cv_scores, reg_model = train_test("second_approach_pl_split_ligand_weight")

In [None]:
cv_scores, reg_model = train_test("second_approach_pl_split_ligand_tpsa")

In [None]:
cv_scores, reg_model = train_test("second_approach_pl_split_ligand_volume")

In [57]:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 800, num = 15)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', round(train_X.shape[1]/2), round(train_X.shape[1]/3), round(train_X.shape[1]/4)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 242, 285, 328, 371, 414, 457, 500, 542, 585, 628, 671, 714, 757, 800], 'max_features': ['auto', 'sqrt', 98, 65, 49], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [58]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=123, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_X, train_Y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt', 98, 65,
                                                         49],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 242, 285, 328,
                                                         371, 414, 457, 500,
                                                         542, 585, 628, 671,
                                                         714, 757, 800]},
                   rand

In [140]:
rf_random.best_params_

{'n_estimators': 457,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 60,
 'bootstrap': False}

In [61]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_X, test_Y)

Mean Absolute Error: 0.41645679037006417
Mean Squared Error: 0.32449833562823804
R Squared Error: 0.8611018175740554
Root Mean Squared Error: 0.5696475538683881


In [129]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 60, 80, 90],
    'max_features': [round(train_X.shape[1]/2), round(train_X.shape[1]/4), round(train_X.shape[1]/3)],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [300, 400, 500, 800]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

print(grid_search)

GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [None, 60, 80, 90],
                         'max_features': [160, 80, 107],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [300, 400, 500, 800]},
             verbose=2)


In [None]:
grid_search.fit(train_X, train_Y)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


In [137]:
grid_search.best_estimator_

RandomForestRegressor(max_depth=60, max_features=107, min_samples_leaf=4,
                      min_samples_split=5, n_estimators=300)

In [None]:
grid_search.

In [150]:
grid_search.cv_results_["mean_fit_time"]

array([ 5549.10518901,  6378.27942157,  8798.03574951, 12620.05044993,
        4638.24831621,  6121.86386291,  8680.99602811, 13116.78823797,
        3775.53101643,  5024.54086709,  6414.86339696, 11533.12431598,
        3969.33918317,  5193.83567921,  6407.63299362, 11787.14106822,
        5225.03930863,  6999.11755784,  7356.44456697, 10446.1521647 ,
        4458.91054551,  5140.4910202 ,  7326.96681897, 10141.34237297,
        4435.03293101,  4944.5830694 ,  6048.85737824,  9817.01047166,
        3584.05753342,  5735.45569324,  6024.13991435, 11288.15585677,
        3729.40168834,  4972.93275277,  7080.05846628,  9898.68486516,
        2093.06616743,  2862.69705137,  3416.07099287,  6494.51300263,
        1998.5945584 ,  2733.98084219,  3370.05164123,  5393.83623195,
        1973.96410481,  2706.57173514,  3261.83661485,  5222.81320079,
        1962.41946634,  2631.33160694,  3365.42564527,  5328.55077195,
        1976.95033534,  2691.48537906,  3401.24896399,  5468.76827844,
      

In [156]:
best_grid = grid_search.best_estimator_
evaluate(grid_search, test_X, test_Y)

Pearson R:  (0.9820782493152663, 0.0)
Pearson R squared:  0.9644776877781384
NumPy Pearson correlation:  [[1.         0.98207825]
 [0.98207825 1.        ]]
Mean Absolute Error: 0.15549328930528236
Mean Squared Error: 0.08323586487391017
R Squared Error: 0.9644602037287737
Root Mean Squared Error: 0.28850626487809616


In [136]:
print("done")

done


In [135]:
grid_search.best_estimator_

RandomForestRegressor(max_depth=60, max_features=107, min_samples_leaf=4,
                      min_samples_split=5, n_estimators=300)