In [None]:
import os, sys
import tensorflow as tf
import numpy as np
import pandas as pd

# tf.data tuning sentinel. Use the stable `tf.data.AUTOTUNE` alias when this
# TensorFlow build provides it and fall back to the experimental location
# otherwise.
# (A bare HOME lookup in os.environ used to sit here; its value was discarded
# and it raised KeyError on systems where HOME is not set, so it was removed.)
try:
    AUTOTUNE = tf.data.AUTOTUNE
except AttributeError:
    AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
from tensorflow import keras 
from tensorflow.keras import layers
from tensorflow.keras import models
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score
from tensorflow.keras import optimizers
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve,average_precision_score, confusion_matrix, f1_score,matthews_corrcoef
from inspect import signature
import seaborn as sns

In [None]:
def one_hot_encoder(s):
    """One-hot encode a nucleotide sequence.

    Rows follow the fixed base order A, T, C, G (row 0 = A, 1 = T, 2 = C,
    3 = G); column ``j`` describes position ``j`` of ``s``.

    Parameters
    ----------
    s : str
        Sequence made of the bases A/T/C/G. Lower-case letters are accepted
        and treated like their upper-case counterparts.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(4, len(s))`` holding exactly one ``1`` per
        column. An empty sequence yields an array of shape ``(4, 0)``.

    Raises
    ------
    KeyError
        If ``s`` contains a character other than A, T, C or G.
    """
    base_to_row = {'A': 0, 'T': 1, 'C': 2, 'G': 3}

    rows = []
    for pos, base in enumerate(s):
        try:
            rows.append(base_to_row[base.upper()])
        except KeyError:
            raise KeyError(
                "unsupported base {!r} at position {}; expected one of A, T, C, G".format(base, pos)
            ) from None

    x = np.zeros((len(base_to_row), len(s)))
    # Explicit integer index arrays keep the assignment valid for empty input.
    x[np.asarray(rows, dtype=np.intp), np.arange(len(s))] = 1
    return x

In [None]:
# Locate the input CSV files. sys.path[0] can be an empty string in interactive
# sessions, which would turn the paths below into '/Seq_with_freq_*.csv' at the
# filesystem root, so fall back to the current working directory in that case.
folder_path = sys.path[0] or os.getcwd()
train_seq = pd.read_csv(os.path.join(folder_path, 'Seq_with_freq_train.csv'), sep = ",")
test_seq = pd.read_csv(os.path.join(folder_path, 'Seq_with_freq_test.csv'), sep = ",")

In [None]:
# Select the extremes of the training set: the 2,000 rows with the highest
# indel_freq and the 2,000 rows with the lowest.
train_top = train_seq.sort_values("indel_freq", ascending = False)
train_bot = train_seq.sort_values("indel_freq", ascending = True)
top_2000, bot_2000 = train_top.iloc[:2000], train_bot.iloc[:2000]

In [None]:
# Stack the high- and low-frequency subsets row-wise; the trailing expression
# shows the combined frame's (rows, columns) shape as the cell output.
total_4000 = pd.concat((top_2000, bot_2000))
total_4000.shape

In [None]:
# Turn every selected sequence into a one-hot matrix for the CNN and collect
# the matching indel frequencies as regression targets.
seq = total_4000['#bseq'].tolist()
seq_freq = total_4000['indel_freq'].tolist()
# one_hot_encoder returns (base, position) matrices; swapping the last two axes
# gives a (sample, position, base) array, which is then cast to float32.
X_seq = np.asarray(list(map(one_hot_encoder, seq)))
X_seq = X_seq.transpose(0, 2, 1).astype('float32')
Y_val = np.asarray(seq_freq)

In [None]:
# Report the three dimensions of the encoded input, one line per axis.
shape_labels = ('Length of data:', 'Length of nucleic acids:', 'Type of nucleic acids:')
for axis, label in enumerate(shape_labels):
    print(label, X_seq.shape[axis])

In [None]:
# Output directory for trained models: <working directory>/models/regression_30
current_path = os.getcwd()
model_path = os.path.join(current_path, 'models', 'regression_30')
os.makedirs(model_path, exist_ok = True)  # no error if the directory already exists
print(model_path)

In [None]:
# Training and architecture settings

BATCH_SIZE = 8       # passed to model.fit as batch_size
SEQ_length = 30      # first dimension of the Conv1D input shape
BASE_type = 4        # second dimension of the Conv1D input shape
CLASS_NAMES = ['enriched', 'not-enriched']

# Layer specification; the model-building code reads the entries by position.
params = [
    ['CONV', 400, 3, 1],   # filters, kernel size, stride
    ['DROP', 0.5],         # dropout rate
    ['POOL', 2, 1],        # pool size, stride
    ['FLAT'],
    ['DENSE', 50],         # units
]
activation_func = 'relu'
regularizer_params = None

In [None]:
# Run bookkeeping (version tag, history store) and the training schedule.
model_version, dict_history = 1, {}
steps_per_epoch, Epochs = 450, 50

In [None]:
import numpy as np
from sklearn.model_selection import ShuffleSplit
from keras.optimizers import Adam
%matplotlib inline
import matplotlib.pyplot as plt
split_data = ShuffleSplit(n_splits = 10, train_size = None, test_size = 0.1, random_state = 1)
n_iter =0
rmse_per_fold = []
val_rmse_per_fold = []
r2_per_fold = []

for train_idx, test_idx in split_data.split(X_seq, Y_val):
    X_train = np.array(X_seq[train_idx])
    X_test = np.array(X_seq[test_idx])
    y_train = np.array(Y_val[train_idx])
    y_test = np.array(Y_val[test_idx])
    model = models.Sequential()
    model.add(layers.Conv1D(filters = params[0][1],kernel_size = params[0][2], strides = params[0][3], activation = activation_func, input_shape = (SEQ_length, BASE_type), kernel_regularizer = regularizer_params, bias_regularizer = regularizer_params,padding = 'same'))
    model.add(layers.Dropout(rate = params[1][1]))
    model.add(layers.MaxPool1D(pool_size = params[2][1], strides = params[2][2]))
    model.add(layers.Flatten())
    model.add(layers.Dense(params[4][1], activation = activation_func, kernel_regularizer = regularizer_params, bias_regularizer = regularizer_params))
    model.add(layers.Dense(1))
    model.compile(loss = 'mean_squared_error', optimizer = optimizers.Adam(learning_rate = 0.000075), metrics = [tf.keras.metrics.RootMeanSquaredError()])
    dict_history[model_version] = model.fit(x = X_train, y = y_train, shuffle = True, steps_per_epoch = steps_per_epoch, epochs = Epochs, batch_size = BATCH_SIZE, validation_data = (X_test, y_test), verbose = 2)
    history = dict_history[model_version]
    rmse = history.history['root_mean_squared_error']
    val_rmse = history.history['val_root_mean_squared_error']
    epochs = range(1, len(rmse) + 1)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]
    pred = model.predict(X_test, verbose = 0)
    r2_y_pred = r2_score(y_test, pred)
    rmse_per_fold.append(rmse)
    val_rmse_per_fold.append(val_rmse)
    arr_rmse = np.array(rmse_per_fold)
    arr_val_rmse = np.array(val_rmse_per_fold)
    r2_per_fold.append(r2_y_pred)
    print('r2_score: {} for fold {}'.format(r2_y_pred, n_iter+1))
    globals()['model_fname_{}'.format(n_iter+1)] = f'BBBphagedisplay_TEST_{n_iter+1}_{model_version:03}.h5'
    model_spath = os.path.join(model_path,f'model_fname_{n_iter+1}')
    model.save(model_spath)
    fig = plt.figure(figsize=(10.5,3.5))
    plt.plot(epochs, rmse, 'bo', label='Training RMSE')
    plt.plot(epochs, val_rmse, 'r', alpha=0.7, label='Validation RMSE')
    plt.title(f'Training and validation RMSE of fold {n_iter+1}')
    plt.ylabel('RMSE')
    plt.legend()
    plt.show()
    plt.cla
    plt.clf
    n_iter += 1

In [None]:
# Mean R^2 across all recorded folds (`result` holds the plain sum).
result = sum(r2_per_fold)
fold_count = len(r2_per_fold)
print(f"average : {result / fold_count}")

In [None]:
# File name and location for the cross-validated model. This cell only builds
# and prints the path; nothing is written to disk here.
model_fname = 'Cas9Predictor_TEST_{:03}.cv.h5'.format(model_version)
model_spath = os.path.join(model_path, model_fname)
print(model_spath)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from cycler import cycler
import matplotlib.lines as mlines

# Overlay the learning curves of every fold in one figure:
# markers = training RMSE, solid lines = validation RMSE, one colour per fold.
n_iter = 0
# Never request more curves than were recorded; a fixed count of 10 raised an
# IndexError whenever fewer folds had finished.
num_plots = min(10, len(arr_rmse), len(arr_val_rmse))

fig, ax = plt.subplots(figsize=(10.5,5.5))
# One colour per fold, shared by that fold's training and validation curve.
fold_colors = [plt.cm.jet(i) for i in np.linspace(0, 1, num_plots)]
b_o = mlines.Line2D([], [], color='black', marker='o', linestyle='None', markersize=8)
b_line = mlines.Line2D([], [], color='black', linestyle='solid')

# Training curves first, validation curves second, so the lines are drawn on
# top of all markers.
for i in range(n_iter, num_plots):
    ax.plot(epochs, arr_rmse[i], 'o', markersize = 5.5, alpha= 0.8, color = fold_colors[i])
for i in range(n_iter, num_plots):
    ax.plot(epochs, arr_val_rmse[i], alpha= 0.8, color = fold_colors[i], label= f'Fold {i+1}')

ax.set_xlabel('Epoch')
ax.set_ylabel('RMSE')

# Per-fold legend outside the axes. It is re-added as a plain artist because
# the next legend() call replaces the axes' own legend.
legend = ax.legend(loc='upper right', bbox_to_anchor=(1.15, 1), frameon = True, fontsize = 11)
art_legend = ax.add_artist(legend)
# Marker/line key. It already is the axes' legend, so it is not added a second
# time (that would draw it twice).
legend_2 = ax.legend(handles = [b_o, b_line], labels = ['RMSE', 'Validation RMSE'], loc='upper right', frameon = True, fontsize = 11)
art_legend_2 = legend_2

In [None]:
# Predictions of the current `model` for the current `X_test` split.
pred = model.predict(X_test)

In [None]:
#Test the model (Endogenous dataset : 542 sequences)
# Rank the test set by measured indel frequency and keep the 271 highest and
# the 271 lowest rows. Both slices come from the *sorted* frame; slicing the
# unsorted `test_seq` would only return its first/last rows in file order.
select_test = test_seq.sort_values(by = ["indel_freq"], axis = 0, ascending = False)
select_test_top = select_test.head(271)
select_test_bot = select_test.tail(271)

In [None]:
# Combine the two selected test subsets row-wise.
test_set = pd.concat((select_test_top, select_test_bot))

In [None]:
# Encode the test sequences the same way as the training inputs:
# (sample, position, base) layout, cast to float32.
val_pep = test_set['#bseq'].tolist()
val_class = test_set['indel_freq'].tolist()  # measured indel frequencies
val_encode = [one_hot_encoder(x) for x in val_pep]
val_encode = np.transpose(np.asarray(val_encode), (0, 2, 1))
val_encode = val_encode.astype('float32')

In [None]:
# Load one of the models saved by the cross-validation loop and score the test set.
# Set `best_fold` to the number (1-10) of the split whose model should be used;
# the default keeps the original placeholder path "<model_path>/model_fname_num".
best_fold = "num"
model_fit = keras.models.load_model("{}/model_fname_{}".format(model_path, best_fold))
val_score = model_fit.predict(val_encode)
val_score_flat = val_score.flatten()
# Pair measured and predicted values in the table the final scatter plot reads.
# The model is deliberately NOT re-fitted here: training it on the test inputs
# against its own predictions would alter the model being evaluated.
test_table = pd.DataFrame({"Indel frequency": val_class, "Predicted frequency": val_score_flat})
test_table.head()

In [None]:
# Score the loaded model on the test set against the measured indel frequencies;
# the call returns the model's compiled loss and metric values.
# (This cell used to call fit() on the test inputs with the model's own
# predictions as targets, which modified the model instead of evaluating it.)
model_fit.evaluate(val_encode, np.asarray(val_class, dtype = 'float32'), verbose = 0)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["font.size"] = 16

# Measured vs. predicted indel frequency: individual sequences as a scatter,
# overlaid with a coarse 5x5 two-dimensional histogram and the identity line.
# NOTE(review): `test_table` must provide the columns "Indel frequency" and
# "Predicted frequency"; it is not created in this cell.
fig, ax = plt.subplots(figsize = (10, 8), dpi = 80)
ax.scatter(test_table["Indel frequency"], test_table["Predicted frequency"], color = 'green', alpha =  0.5, s = 20)
hist, xbins, ybins, im = ax.hist2d(test_table["Indel frequency"], test_table["Predicted frequency"], (5, 5), range = [[0, 100], [0, 100]], alpha = 0.3, cmap = 'Blues')
ax.set_xlabel('Validated indel frequency')
ax.set_ylabel('Predicted score')
# Ticks every 20 units with the end point 100 included. (The former
# MultipleLocator(5) calls were dropped: the explicit tick positions set right
# after them replaced those locators.)
axis_ticks = np.arange(0, 101, 20)
ax.set_xticks(axis_ticks)
ax.set_yticks(axis_ticks)
ax.tick_params(axis = 'both', labelsize = 16)
ax.set_xlim(0, 100)
ax.set_ylim(0, 100)
ax.plot([0, 100], [0, 100], 'k--')  # identity line: prediction == measurement
ax.grid(True, color = 'black', alpha = 0.35, linestyle = '--')
# Attach the colour bar and its limits to the histogram explicitly instead of
# relying on pyplot's implicit "current image".
fig.colorbar(im, ax = ax)
im.set_clim(0, 200)
plt.show()