In [1]:
# Now we are going to test and implement more advanced machine learning algorithms

In [21]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
import matplotlib.pyplot as plt
import numpy as np
import os, pandas, warnings
from sklearn.neural_network import MLPRegressor

Seed = 72993
np.random.seed(Seed)
nJobs = 2

warnings.filterwarnings("ignore")

# Load our data as before

In [3]:
parameters = pandas.read_csv("Judred.csv", index_col=0)
#print(parameters)
targets = pandas.read_csv("APs.csv", index_col = 0)
#print(targets)

Forcefield = "2.1"
targets = targets[targets["FF"] == Forcefield]
targets.index = targets["pep"]

targets = targets["mean"]
X_train, X_val, y_train, y_val = train_test_split(parameters, targets, test_size=0.33, random_state=9876, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=9876, shuffle=True)

##### We will now perform hyperparameter optimization for several models and rank them against each other. Due to limit time we are keeping the number of combinations of hyperparameters relatively small but you can play around with it to make further improvements.

Due to the way Jupyter notebooks works, dont move onto the next block of code after running a hyperparameter optimization until you see the console print: "Best params from grid search: ...."

# SVM rbf

In [4]:
# Define the hyperparameters we want to test
SVRrbf_param_grid = {
        "kernel": ["rbf"],
        "gamma": ["scale", "auto"],
        "C": np.linspace(0.1, 1, 10), 
        "epsilon": np.linspace(0.1, 1, 10), 
        "max_iter": np.linspace(1000, 10000, 10).astype(np.int64),
        "tol": [0.01, 0.001, 0.0001], 
        "verbose":[0]}

In [19]:
model = SVR()
HPO_model = GridSearchCV(estimator = model, param_grid = SVRrbf_param_grid, cv = 5, n_jobs = nJobs, verbose = True, error_score=0.0)
HPO_model.fit(X_train.values, y_train.values.reshape(-1))
print("\nBest params from grid search:")
print(HPO_model.best_params_)
SVMrbf_hyperparameters = HPO_model.best_params_
    

Fitting 5 folds for each of 6000 candidates, totalling 30000 fits

Best params from grid search:
{'C': 0.6, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 100, 'tol': 0.0001, 'verbose': 0}




# Random forest

In [11]:
RF_param_grid = {'bootstrap': [True, False],
                  'criterion': ['mse', 'absolute_error'],
                  'max_depth': [None],
                  'max_features': ["sqrt", "log2", None],
                  'max_leaf_nodes': [None],
                  'min_impurity_decrease': [0.0],
                  'min_samples_leaf': [1, 2],
                  'min_samples_split': [0.5, 1.0],
                  'min_weight_fraction_leaf': [0.0, 0.01, 0.1],
                  'n_estimators': [10, 100],
                  'n_jobs': [nJobs],
                  'oob_score': [False],
                  'verbose': [False],
                  'warm_start': [False, True],
                  "random_state":[Seed]}

In [12]:
model = RandomForestRegressor()

HPO_model = GridSearchCV(estimator=model, param_grid=RF_param_grid, cv = 5, n_jobs = nJobs, verbose = True, error_score=0.0)
HPO_model.fit(X_train.values, y_train.values.reshape(-1))
print("\nBest params from grid search:")
print(HPO_model.best_params_)
RF_hyperparameters = HPO_model.best_params_


Fitting 5 folds for each of 576 candidates, totalling 2880 fits

Best params from grid search:
{'bootstrap': True, 'criterion': 'mae', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 0.5, 'min_weight_fraction_leaf': 0.1, 'n_estimators': 100, 'n_jobs': 2, 'oob_score': False, 'random_state': 72993, 'verbose': False, 'warm_start': False}


  warn(


In [13]:
print(RF_hyperparameters)

{'bootstrap': True, 'criterion': 'mae', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 0.5, 'min_weight_fraction_leaf': 0.1, 'n_estimators': 100, 'n_jobs': 2, 'oob_score': False, 'random_state': 72993, 'verbose': False, 'warm_start': False}


# Deep Neural Network

In [16]:
MLP_param_grid = {
        "activation": ["relu"],
        "alpha":[0.0001, 0.1],
        "batch_size": [20],
        "early_stopping":[True],
        #epsilon
        "hidden_layer_sizes": [(10,), (10,2), (10,3)] + [(100,), (100,2), (100,3)],
        "learning_rate":["adaptive"],
        "learning_rate_init": [0.001],
        "max_iter":[100, 1000],
        #momentum
        #"power_t":[0.25, 0.5, 0.75], 
        "random_state":[Seed], 
        "shuffle":[False], 
        "solver": ["sgd", "adam"],
        "tol": [0.01, 0.1], 
        "validation_fraction":[0.1],
        "n_iter_no_change": [5, 10],
        "verbose":[0]}

In [17]:
model = MLPRegressor()
HPO_model = GridSearchCV(estimator = model, param_grid = MLP_param_grid, cv = 5, n_jobs = nJobs, verbose = True, error_score=0.0)
HPO_model.fit(X_train.values, y_train.values.reshape(-1))

print("\nBest params from grid search:")
print(HPO_model.best_params_)
MLP_hyperparameters = HPO_model.best_params_

Fitting 5 folds for each of 192 candidates, totalling 960 fits

Best params from grid search:
{'activation': 'relu', 'alpha': 0.1, 'batch_size': 20, 'early_stopping': True, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'max_iter': 100, 'n_iter_no_change': 10, 'random_state': 72993, 'shuffle': False, 'solver': 'adam', 'tol': 0.01, 'validation_fraction': 0.1, 'verbose': 0}


# Test and compare

In [22]:
SVMmodel = SVR(**SVMrbf_hyperparameters)
RFmodel = RandomForestRegressor(**RF_hyperparameters)
DNNmodel = MLPRegressor(**MLP_hyperparameters)

SVMmodel.fit(X_train, y_train)
RFmodel.fit(X_train, y_train)
DNNmodel.fit(X_train, y_train)

SVM_predictions = SVMmodel.predict(X_val)
RF_predictions = RFmodel.predict(X_val)
DNN_predictions = DNNmodel.predict(X_val)

SVM_rmse = RMSE = mean_squared_error(y_val, predictions, squared=False)

MLPRegressor(alpha=0.1, batch_size=20, early_stopping=True,
             learning_rate='adaptive', max_iter=100, random_state=72993,
             shuffle=False, tol=0.01, verbose=0)