In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
from tensorflow.keras.models import Sequential

In [7]:
def one_hot_encode_sequence(sequence):
    """One-hot encode a nucleotide sequence.

    Each base is mapped to a row of a 4x4 identity matrix using the column
    order A, C, T, G. Lower-case bases are accepted and treated like their
    upper-case counterparts.

    Args:
        sequence: Iterable of single-character bases, typically a ``str``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sequence), 4)`` holding 0.0/1.0
        values; an empty sequence yields an array of shape ``(0, 4)``.

    Raises:
        KeyError: If the sequence contains a symbol other than A, C, T or G.
    """
    base_to_idx = {'A': 0, 'C': 1, 'T': 2, 'G': 3}

    integer_encoded = []
    for position, base in enumerate(sequence):
        # Normalise case so 'acgt' and 'ACGT' encode identically.
        key = base.upper() if isinstance(base, str) else base
        if key not in base_to_idx:
            # Keep KeyError (what a plain dict lookup raised) but say where
            # the offending symbol is.
            raise KeyError(
                f"unsupported base {base!r} at position {position}; "
                f"expected one of {sorted(base_to_idx)}"
            )
        integer_encoded.append(base_to_idx[key])

    # An explicit integer dtype keeps the index array valid when it is empty.
    indices = np.asarray(integer_encoded, dtype=np.intp)
    one_hot_encoded = np.eye(len(base_to_idx))[indices]

    return one_hot_encoded


def prepare_data(input_file_path, sol):
    """Load one solution's sequences and FRET values from a CSV file.

    Rows missing either the ``{sol}_seq`` or the ``{sol}_FRET`` column value
    are dropped before every remaining sequence is one-hot encoded.

    Args:
        input_file_path: Path of the CSV file to read.
        sol: Column prefix selecting the ``{sol}_seq`` / ``{sol}_FRET`` pair.

    Returns:
        Tuple ``(X, y)``. ``y`` is the ``{sol}_FRET`` column as a pandas
        Series. ``X`` is a 3-D array ``(n_samples, length, 4)`` when all
        sequences have the same length; otherwise it is a 1-D object array
        whose elements are the individual ``(length_i, 4)`` encodings, ready
        to be iterated and padded by the caller.
    """
    seq_column = f'{sol}_seq'
    fret_column = f'{sol}_FRET'

    df = pd.read_csv(input_file_path, low_memory=False)
    df = df.dropna(subset=[seq_column, fret_column])
    y = df[fret_column]

    one_hot_encoded_X = [one_hot_encode_sequence(seq) for seq in df[seq_column]]

    # Equal-length (or empty) input stacks cleanly into a regular array.
    if len({len(encoded) for encoded in one_hot_encoded_X}) <= 1:
        return np.array(one_hot_encoded_X), y

    # Ragged input: np.array() on arrays of differing length raises ValueError
    # on recent NumPy releases, so fill an object array element by element.
    ragged_X = np.empty(len(one_hot_encoded_X), dtype=object)
    for i, encoded in enumerate(one_hot_encoded_X):
        ragged_X[i] = encoded

    return ragged_X, y


def compile_model_RNN(X_train, activation='sigmoid', optimizer='adam', loss='mae', metrics=['mae'], units=32):
    """Build and compile a single-layer SimpleRNN regression model.

    The network is one ``SimpleRNN`` layer followed by a one-unit linear
    ``Dense`` output.

    Args:
        X_train: 3-D array ``(samples, time_steps, features)``; only its
            shape is read, to size the model input.
        activation: Activation of the recurrent layer.
        optimizer: Optimizer passed to ``model.compile``.
        loss: Loss passed to ``model.compile``.
        metrics: Metrics passed to ``model.compile``.
        units: Number of units in the recurrent layer (default 32, the
            previously hard-coded width).

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``X_train.shape`` does not have exactly three entries.
    """
    # The sample count is irrelevant to the architecture; unpacking three
    # values still rejects input that is not 3-D.
    _, time_steps, features = X_train.shape

    model = Sequential()
    model.add(SimpleRNN(units, activation=activation, input_shape=(time_steps, features)))
    model.add(Dense(1, activation='linear'))

    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    return model


def train_model(model, X_train, y_train, val_size, batch_size=64, epochs=30):
    """Fit ``model`` on a train/validation split and plot both loss curves.

    Args:
        model: Model exposing ``fit``; it is trained in place.
        X_train: Training inputs, split here into fit and validation parts.
        y_train: Training targets matching ``X_train``.
        val_size: ``test_size`` handed to ``train_test_split`` for the
            validation part.
        batch_size: Batch size passed to ``model.fit``.
        epochs: Number of epochs passed to ``model.fit``.

    Returns:
        Tuple ``(model, fig)`` with the fitted model and the loss figure.
    """
    # Fixed random_state keeps the validation split identical between runs.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=42
    )
    fit_log = model.fit(
        X_fit,
        y_fit,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
    )

    fig, ax = plt.subplots()
    curves = (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))
    for history_key, curve_label in curves:
        ax.plot(fit_log.history[history_key], label=curve_label)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('Training and Validation Loss')
    ax.legend(loc='upper right')
    ax.grid(True)

    plt.show()

    return model, fig


def test_model(model, X_test, y_test):
    """Evaluate ``model`` on held-out data and plot per-sample absolute error.

    Prints the mean absolute error as ``test_loss:<value>``.

    Args:
        model: Model exposing ``predict``.
        X_test: Test inputs passed to ``model.predict``.
        y_test: True targets (array-like or pandas Series), one per sample.

    Returns:
        The matplotlib figure containing the ``|test-pred|`` curve.
    """
    y_pred = np.asarray(model.predict(X_test)).ravel()

    # Work on plain arrays: if y_test is a pandas Series, plotting it would
    # use its (shuffled) index labels as x-coordinates and scramble the
    # curve. Arrays are plotted against sample position 0..n-1.
    y_true = np.asarray(y_test).ravel()
    y_abs = np.abs(y_true - y_pred)

    fig, ax = plt.subplots()
    ax.plot(y_abs, label='|test-pred|')
    ax.set_xlabel('Test sample')
    ax.set_ylabel('Absolute error')
    ax.set_title('Absolute Test Error per Sample')
    ax.legend()

    test_loss = mean_absolute_error(y_true, y_pred)

    print(f"test_loss:{test_loss:.4f}")
    return fig


def predict_FRET(model, new_seq):
    """Predict the FRET value of a single nucleotide sequence.

    The sequence is one-hot encoded, zero-padded at the end to the number of
    time steps the model reports through ``model.input_shape`` (when it
    reports a fixed one), and wrapped in a batch of size one before being
    passed to ``model.predict``.

    Args:
        model: Trained model exposing ``predict`` and, ideally,
            ``input_shape`` as ``(batch, time_steps, features)``.
        new_seq: Sequence of A/C/T/G bases.

    Returns:
        Whatever ``model.predict`` returns for the one-sample batch.

    Raises:
        ValueError: If the sequence is longer than the model's time steps.
    """
    new_seq_encoded = one_hot_encode_sequence(new_seq)

    # Discover the sequence length the model was built for, if it is fixed.
    input_shape = getattr(model, 'input_shape', None)
    time_steps = None
    if isinstance(input_shape, (tuple, list)) and len(input_shape) == 3:
        time_steps = input_shape[1]

    if time_steps is None:
        # Unknown or variable length: only add the leading batch axis.
        batch = new_seq_encoded[np.newaxis, ...]
    else:
        if len(new_seq_encoded) > time_steps:
            raise ValueError(
                f"sequence has {len(new_seq_encoded)} bases but the model "
                f"accepts at most {time_steps}"
            )
        # pad_sequences returns (1, time_steps, features) for a one-item list,
        # which is exactly the batch the model expects.
        batch = pad_sequences([new_seq_encoded], time_steps)

    predicted_FRET = model.predict(batch)

    return predicted_FRET


def pad_sequences(sequences, max_length=None):
    """Zero-pad 2-D encoded sequences at the end to a common length.

    Args:
        sequences: Iterable of 2-D arrays shaped ``(length_i, features)``.
        max_length: Target number of rows per sequence. When ``None`` the
            length of the longest sequence is used.

    Returns:
        ``numpy.ndarray`` of shape ``(n_sequences, max_length, features)``.

    Raises:
        ValueError: If any sequence is longer than ``max_length``.
    """
    sequences = list(sequences)
    if max_length is None:
        max_length = max((len(seq) for seq in sequences), default=0)

    padded_sequences = []
    for index, seq in enumerate(sequences):
        seq = np.asarray(seq)
        missing_rows = max_length - len(seq)
        if missing_rows < 0:
            # Fail with a readable message instead of NumPy's
            # "negative dimensions are not allowed".
            raise ValueError(
                f"sequence {index} has length {len(seq)}, which exceeds "
                f"max_length={max_length}"
            )
        padding = np.zeros((missing_rows, seq.shape[1]))  # rows of zeros to append
        padded_seq = np.vstack([seq, padding])  # original rows followed by padding
        padded_sequences.append(padded_seq)
    return np.array(padded_sequences)

In [None]:
# Train, evaluate and save one SimpleRNN model per solution condition.
solution = ['N5', 'N50', 'N500', 'N5M10', 'N5M100']

# NOTE: `units` is only used to label the output files; compile_model_RNN is
# called without it below, so keep it equal to the layer width used there.
units = 32
batch_size = 128
epochs = 300
max_length = 9  # every encoded sequence is zero-padded to this many positions

# NOTE(review): the output root uses '2024_BNEM' while the input path uses
# '2024BNEM' — confirm that both directories are intended.
output_directory_path = 'C:\\Users\\chw10\\2024_BNEM\\aug_08\\result'
input_file_path = 'C:\\Users\\chw10\\2024BNEM\\data\\sorted.csv'

# Create the output folders up front so a missing directory cannot make the
# save step fail after a full training run.
for subdirectory in ('model', 'fig_train', 'fig_test'):
    os.makedirs(f'{output_directory_path}\\rnn\\{subdirectory}', exist_ok=True)


for sol in solution:

    print(sol)

    output_file_path_model = f'{output_directory_path}\\rnn\\model\\{sol}_un{units}_ep{epochs}_bs{batch_size}_RNN_padded.keras'
    output_file_path_fig_train=f'{output_directory_path}\\rnn\\fig_train\\{sol}_un{units}_ep{epochs}_bs{batch_size}_train_RNN_padded.png'
    output_file_path_fig_test=f'{output_directory_path}\\rnn\\fig_test\\{sol}_un{units}_ep{epochs}_bs{batch_size}_test_RNN_padded.png'

    # prepare X, y
    X, y = prepare_data(input_file_path, sol)

    # Zero-pad every encoded sequence to max_length positions
    X = pad_sequences(X, max_length)
    print(X[0])

    # Split into train and test dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # compile model
    model = compile_model_RNN(X_train)

    # train
    model, fig_train = train_model(model, X_train, y_train, val_size=0.15, batch_size=batch_size, epochs=epochs)

    # test
    fig_test = test_model(model, X_test, y_test)

    # save model and fig
    model.save(output_file_path_model)
    fig_train.savefig(output_file_path_fig_train)
    fig_test.savefig(output_file_path_fig_test)

    # Release the figures so they do not accumulate across iterations.
    plt.close(fig_train)
    plt.close(fig_test)

    