# Machine learning typically works better with more data to learn from

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
import matplotlib.pyplot as plt
import numpy as np
import os, pandas, warnings
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Reproducibility and parallelism settings used by the cells below.
Seed = 72993
nJobs = 2
np.random.seed(Seed)

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Amino-acid alphabets kept in matching order: entry i of letters_1 is the
# one-letter code for the three-letter code at entry i of letters_3.
# Histidine is spelled "HSE" in the three-letter table.
letters_1 = np.array(list("ACDEFGHIKLMNPQRSTVWY"))
letters_3 = np.array(
    "ALA CYS ASP GLU PHE GLY HSE ILE LYS LEU MET ASN PRO GLN ARG SER THR VAL TRP TYR".split(),
    dtype="<U3",
)


In [None]:
# Initial SVR configuration (RBF kernel) and the estimator built from it.
Hyperparameters = dict(
    C=100,
    epsilon=0.1,
    gamma="scale",
    kernel="rbf",
    max_iter=-1,
    tol=0.0001,
    verbose=0,
)
model = SVR(**Hyperparameters)

# NOTE(review): presumably the peptide length (tripeptides) - confirm.
L = 3

In [None]:
# Feature table; the first CSV column becomes the row index.
parameters = pandas.read_csv(
    "Tripeptides_Judred.csv",
    index_col=0,
)

# Target table: space-separated, no header row, first column as the index.
# Its peptides are written as 3-letter codes, so the index has to be
# translated before it can be matched against the input parameters dataset.
targets = pandas.read_csv(
    "APs_Beyond.txt",
    sep=" ",
    header=None,
    index_col=0,
)

def translate3to1(string):
    """Convert a dash-separated three-letter peptide code to one-letter form.

    Each residue is looked up in ``letters_3`` and replaced by the entry at
    the same position in ``letters_1``. ``HIS`` is treated as ``HSE``, the
    spelling used for histidine in ``letters_3``.

    Args:
        string: Peptide written as three-letter codes joined by ``-``,
            e.g. ``"ALA-ALA-ARG"``.

    Returns:
        The one-letter sequence as a plain ``str``, e.g. ``"AAR"``.

    Raises:
        ValueError: If a residue code is not present in ``letters_3``.
    """
    one_letter = []
    for residue in string.split("-"):
        if residue == "HIS":
            residue = "HSE"
        matches = np.where(letters_3 == residue)[0]
        if matches.size == 0:
            # Name the offending residue instead of failing with an opaque
            # "index 0 is out of bounds" error on the empty match array.
            raise ValueError(
                f"Unknown three-letter residue code {residue!r} "
                f"in peptide {string!r}"
            )
        one_letter.append(str(letters_1[matches[0]]))
    return "".join(one_letter)

# Sanity check of the translator on a known hexapeptide.
print("ALA-ALA-ARG-PRO-MET-GLY", "-->", translate3to1("ALA-ALA-ARG-PRO-MET-GLY"))

# Re-label the targets with one-letter sequences.
targets.index = [translate3to1(index) for index in targets.index]

Forcefield = "2.1" # These are all martini 2.1

# Filter for tripeptides. A boolean mask is used rather than reindex(),
# because reindex() raises when the index holds duplicate labels (possible
# here, since HIS and HSE both translate to "H").
is_tripeptide = np.array([len(name) == 3 for name in targets.index], dtype=bool)
targets = targets.loc[is_tripeptide]

# Make sure that parameters and their targets are indexed in the same order.
# reindex() would silently insert all-NaN feature rows for peptides that are
# absent from the parameters table, so fail early with a clear message.
missing = targets.index.difference(parameters.index)
if len(missing) > 0:
    raise ValueError(
        f"{len(missing)} target peptide(s) have no row in the parameters "
        f"table, e.g. {list(missing[:10])}"
    )
parameters = parameters.reindex(targets.index)
print(parameters)
print(targets)



In [None]:
# Hold out 33% of the peptides for validation; the fixed random_state keeps
# the shuffled split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    parameters,
    targets,
    test_size=0.33,
    shuffle=True,
    random_state=9876,
)
# Disabled second split of the training portion into train/test:
#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=9876, shuffle=True)

In [None]:
# Define the hyperparameters we want to test: an exhaustive grid over the
# RBF-kernel SVR settings below.
SVRrbf_param_grid = {
    "kernel": ["rbf"],
    "gamma": ["scale", "auto"],
    "C": [0.1, 1, 10, 100],
    "epsilon": np.linspace(0.1, 1, 10),
    "max_iter": [-1],
    "tol": [0.01, 0.001, 0.0001],
    "verbose": [0],
}

model = SVR()
# No `scoring` is given, so candidates are ranked by the estimator's own
# score (R^2 for regressors). R^2 can be negative, so scoring a failed fit as
# 0.0 would let a broken configuration outrank valid ones; NaN marks failures
# so they can never be selected as the best candidate.
HPO_model = GridSearchCV(
    estimator=model,
    param_grid=SVRrbf_param_grid,
    cv=5,
    n_jobs=nJobs,
    verbose=True,
    error_score=np.nan,
)
# Fit on plain arrays; the target is flattened to 1-D as scikit-learn expects.
HPO_model.fit(X_train.values, y_train.values.reshape(-1))
print("\nBest params from grid search:")
print(HPO_model.best_params_)
SVMrbf_hyperparameters = HPO_model.best_params_


In [None]:
# Refit an SVR with the best grid-search parameters on the training split.
# The target is flattened to 1-D, matching how the grid search was fitted.
SVMmodel = SVR(**SVMrbf_hyperparameters)
SVMmodel.fit(X_train, y_train.values.reshape(-1))
SVM_predictions = SVMmodel.predict(X_val)

# RMSE on the validation split. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
SVM_rmse = np.sqrt(mean_squared_error(y_val, SVM_predictions))
print("Support vector machine RMSE:", SVM_rmse)

# Predicted vs. true values; points on the black diagonal are perfect predictions.
plt.scatter(SVM_predictions, y_val.values.reshape(-1), label="SVM")

plt.plot([1, 2.7], [1, 2.7], lw=1, c="black")
plt.xlabel("Predicted AP")
plt.ylabel("True AP")
plt.gcf().set_dpi(100)
plt.legend()
plt.show()