In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
import matplotlib.pyplot as plt
import numpy as np
import os, pandas, warnings
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Reproducibility: seed NumPy's global random number generator once, up front.
Seed = 72993
np.random.seed(Seed)

# Parallel worker count; not used by any cell visible here.
nJobs = 2

# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Amino-acid code tables. Position i of letters_1 (1-letter code) pairs with
# position i of letters_3 (3-letter code); histidine is spelled "HSE" here.
letters_1 = np.array(list("ACDEFGHIKLMNPQRSTVWY"))
letters_3 = np.array(
    "ALA CYS ASP GLU PHE GLY HSE ILE LYS LEU MET ASN PRO GLN ARG SER THR VAL TRP TYR".split(),
    dtype='<U3',
)


#### You will need to run Judred with L = 4 first to generate the parameters file (`Tetrapeptides_Judred.csv`) used by this notebook

In [None]:
# Settings for the support-vector regressor, kept in one dict so they are easy to tweak.
Hyperparameters = dict(
    C=100,
    epsilon=0.1,
    gamma="scale",
    kernel="rbf",
    max_iter=-1,
    tol=0.0001,
    verbose=0,
)

# Peptide length used by the cells below (4 = tetrapeptides).
L = 4

# The regressor that the training loop fits repeatedly.
model = SVR(**Hyperparameters)

In [None]:
# Judred parameters: the first CSV column becomes the row index
# (presumably the 1-letter peptide sequence - confirm against the file).
parameters = pandas.read_csv("Tetrapeptides_Judred.csv", index_col=0)

# AP values: header-less, space-separated text file whose first field becomes the row index.
# This dataset stores peptides as 3-letter codes, so the index has to be translated
# before it can be matched against the parameters dataset.
targets = pandas.read_csv(
    "APs_Beyond.txt",
    sep=" ",
    header=None,
    index_col=0,
)

def translate3to1(string):
    """Convert a dash-separated 3-letter peptide string into 1-letter code.

    Each residue is looked up in the module-level ``letters_3`` table and
    replaced by the entry at the same position in ``letters_1``. ``"HIS"`` is
    accepted as an alias for ``"HSE"``, the histidine spelling used by
    ``letters_3``.

    Parameters
    ----------
    string : str
        Residues joined by "-", e.g. "ALA-ALA-ARG-PRO-MET-GLY".

    Returns
    -------
    str
        One character per residue, in the same order as the input.

    Raises
    ------
    IndexError
        If a residue (after the HIS -> HSE aliasing) is not in ``letters_3``.
        The exception type matches what the positional lookup used to raise;
        the message now names the offending residue and peptide.
    """
    # Position-wise pairing of the two code tables.
    three_to_one = dict(zip(letters_3.tolist(), letters_1.tolist()))

    one_letter = []
    for AA in string.split("-"):
        if AA == "HIS":
            AA = "HSE"
        if AA not in three_to_one:
            raise IndexError(
                "Unknown residue {!r} in peptide {!r}".format(AA, string)
            )
        one_letter.append(three_to_one[AA])
    return "".join(one_letter)

# Quick sanity check of the translation on a known hexapeptide
print("ALA-ALA-ARG-PRO-MET-GLY", "-->", translate3to1("ALA-ALA-ARG-PRO-MET-GLY"))

# Re-key the targets by 1-letter sequence so they share labels with `parameters`
targets.index = [translate3to1(index) for index in targets.index]


# Filter for peptides of length L (tetrapeptides) that also have a row in `parameters`.
# A boolean mask is used instead of reindexing on a list of labels so the selection
# does not depend on the labels being unique, and peptides without parameters are
# dropped here: reindexing `parameters` onto them would create all-NaN rows that the
# regressor cannot fit or predict on.
is_length_L = np.array([len(x) == L for x in targets.index], dtype=bool)
has_parameters = np.asarray(targets.index.isin(parameters.index), dtype=bool)
n_without_parameters = int((is_length_L & ~has_parameters).sum())
if n_without_parameters:
    print("Dropping", n_without_parameters, "peptides of length", L, "with no entry in the parameters file")
targets = targets.loc[is_length_L & has_parameters]

# Make sure that parameters and their targets are indexed in the same order.
# In this case we haven't got results for all 20^4 tetrapeptides, so we also remove
# parameters we don't have APs for.
parameters = parameters.reindex(targets.index)
assert parameters.index.equals(targets.index), "parameters and targets are not aligned"
print(parameters)
print(targets)

## Let's take a single peptide as our starting point

In [None]:
# Seed peptide for the search: the residue "W" repeated L times.
starting_point = "".join(["W"] * L)
print("Starting with dataset:", starting_point)

In [None]:

# Iterative search: fit the model on the peptides selected so far, predict the AP of
# every other peptide, then move the highest-predicted ones into the training set.
TrainingSet = [starting_point]

Iterations = 5
BatchSize = 5  # number of top-predicted peptides added to the training set per iteration

# Fail early with a clear message if a starting peptide is not in the dataset;
# reindex would otherwise insert a NaN row and the fit would fail less readably.
Missing = [peptide for peptide in TrainingSet if peptide not in targets.index]
if Missing:
    raise ValueError("Starting peptides not present in the dataset: " + ", ".join(Missing))

print("Running for", Iterations, "iterations")
for i in range(Iterations):
    print("Iteration:", i)
    # Train the model; the single-column target frame is flattened to the 1-D vector SVR expects
    model.fit(parameters.reindex(TrainingSet), targets.reindex(TrainingSet).values.ravel())
    # Predict AP scores for the rest of the dataset
    Remaining = parameters.drop(TrainingSet)
    predictions = model.predict(Remaining)
    # Measure RMSE of the model on the peptides it was not trained on.
    # sqrt(MSE) is used instead of the `squared=False` flag, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(targets.drop(TrainingSet).values.ravel(), predictions))
    print(i, "RMSE on remaining peptides:", rmse)
    # Take the top scoring peptides and move them to the training set
    predictions = pandas.DataFrame(predictions, index=Remaining.index, columns=["Predicted AP"])
    predictions = predictions.sort_values("Predicted AP")
    TrainingSet += list(predictions.iloc[-BatchSize:].index)

    # See what our best peptide is (rows are ranked by column 1 of `targets`)
    Best = targets.reindex(TrainingSet).sort_values(1).iloc[-1]
    # Read the value by column label; calling float() on a whole Series is deprecated in recent pandas
    print(i, "Best AP scoring peptide found:", Best.name, float(Best.loc[1]))
    print("Training set:", TrainingSet)

## You may notice the chemical diversity is limited with this method.
### We can force it to work hard by not allowing it access to charge neutral, insoluble peptides

In [None]:
# Filter the parameters file to only include rows where Judred_Z != 0
# Reindex the APs so it lines up with the parameters file
# Rerun the for loop

### We can also add a small Monte Carlo operation to diversify our choices

In [None]:
def MC():
    """Placeholder for the Monte Carlo step; it is not implemented yet.

    Takes no arguments and does nothing.

    Returns
    -------
    None
    """
    return None
#Define a function that randomly modifies AP scores in a way that would leave most of the top scores in their relative positions
#Apply it to the predictions and select the new training data from the new list

# Challenge
## Remake this using a different machine learning algorithm

In [None]:
#...