In [17]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, roc_auc_score

In [4]:
%cd /content/drive/MyDrive/Colab\ Notebooks/DSA4266
%ls

/content/drive/MyDrive/Colab Notebooks/DSA4266
'Copy of Introduction_Genomics_1_GoogleColab.ipynb'   label_df_train.csv
'Copy of Introduction_Genomics_2_GoogleColab.ipynb'   label_df_valid.csv


In [20]:
# Load the labelled training table and the held-out table.
# Note: the held-out frame is named test_df although it is read from label_df_valid.csv.
train_df = pd.read_csv('label_df_train.csv')
test_df = pd.read_csv('label_df_valid.csv')

In [6]:
# Remove the columns that are not fed to the model.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# have already been dropped would otherwise raise a KeyError.
train_df = train_df.drop(
    columns=['transcript_id', 'transcript_position', 'five_mer'],
    errors='ignore',
)

In [7]:
# Keep the gene identifiers separately; `data` holds every other column
# (the model features plus 'label').
gene_ids = train_df['gene_id']
data = train_df[[column for column in train_df.columns if column != 'gene_id']]

In [8]:
# Shuffle the distinct gene ids once so that fold membership is random.
# A fixed seed makes the cross-validation folds reproducible across kernel
# restarts (the previous unseeded np.random.shuffle gave different folds on
# every run, so CV scores were not comparable between runs).
unique_ids = np.random.default_rng(42).permutation(gene_ids.unique())

In [9]:
# Number of distinct genes available for the gene-level cross-validation split.
len(unique_ids)

3081

In [None]:
from sklearn.metrics import precision_recall_curve,auc, roc_auc_score, average_precision_score
class ScoreCallback(tf.keras.callbacks.Callback):
    """Keras callback that scores the model on a validation set after each epoch.

    The score is the mean of average precision and ROC AUC. It is printed and
    appended to ``self.scores`` (one entry per completed epoch).

    Args:
        validation_data: indexable pair ``(features, labels)``; element 0 is
            passed to ``model.predict`` and element 1 is used as ground truth.
    """

    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data
        self.scores = []

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the current weights and record the combined score."""
        predictions = self.model.predict(self.validation_data[0], verbose=0)
        labels = self.validation_data[1]
        avg_precision = average_precision_score(labels, predictions)
        roc_auc = roc_auc_score(labels, predictions)
        combined = 0.5 * (avg_precision + roc_auc)
        print(combined)
        self.scores.append(combined)

# Create an instance of the custom callback


In [None]:
def kfoldpipeline(n_fold=5):
  """Run gene-grouped k-fold cross-validation of the feed-forward classifier.

  The shuffled gene ids in ``unique_ids`` are partitioned into ``n_fold``
  groups. Each group in turn supplies the validation rows while the rows of
  all remaining genes are used for training, so no gene contributes rows to
  both sides of a split. For every fold the features are standardised (scaler
  fitted on the training rows only), optionally oversampled with SMOTE, and a
  two-hidden-layer network is trained for ``epochs`` epochs while a
  ``ScoreCallback`` records 0.5 * (average precision + ROC AUC) on the
  validation rows after each epoch.

  Reads these notebook globals: ``data``, ``gene_ids``, ``unique_ids``,
  ``sampling_strategy``, ``class_weight``, ``initial_learning_rate``,
  ``num_nodes``, ``hidden_activation``, ``l2_reg_strength``,
  ``dropout_ratio``, ``batch_size`` and ``epochs``.

  Args:
    n_fold: number of folds; must be between 2 and the number of genes.

  Returns:
    Tuple ``(cv_score, suggested_epochs)``: the per-fold best score and the
    1-based epoch at which it was reached, each averaged over the folds. Both
    values are also printed.

  Raises:
    ValueError: if ``n_fold`` cannot produce non-empty train/validation splits.
  """
  if not 2 <= n_fold <= len(unique_ids):
    raise ValueError(
        f"n_fold must be between 2 and the number of genes ({len(unique_ids)}), got {n_fold}"
    )

  feature_cols = [col for col in data.columns if col != 'label']
  total_score = 0
  total_epochs = 0

  # np.array_split puts every gene into exactly one fold, even when the gene
  # count is not divisible by n_fold. The previous np.arange-based boundaries
  # left the remainder genes out of every validation fold.
  for fold_genes in np.array_split(unique_ids, n_fold):
    in_fold = gene_ids.isin(fold_genes)
    data_train, data_valid = data[~in_fold], data[in_fold]
    X_train, y_train = data_train[feature_cols], data_train['label']
    X_valid, y_valid = data_valid[feature_cols], data_valid['label']

    # Fit the scaler on the training rows only to avoid leaking validation statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_valid_scaled = scaler.transform(X_valid)

    # A falsy sampling_strategy (e.g. 0) disables SMOTE oversampling.
    if sampling_strategy:
      smote = SMOTE(sampling_strategy=sampling_strategy)
      X_train_scaled_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)
    else:
      X_train_scaled_res, y_train_res = X_train_scaled, y_train

    score_callback = ScoreCallback((X_valid_scaled, y_valid))
    adam_optimizer = Adam(learning_rate=initial_learning_rate)
    model = keras.Sequential([
        keras.layers.Input(shape=(X_train_scaled_res.shape[1],)),  # one input per feature column
        keras.layers.Dense(num_nodes, activation=hidden_activation, kernel_regularizer=l2(l2_reg_strength)),
        keras.layers.Dropout(dropout_ratio),
        keras.layers.Dense(num_nodes, activation=hidden_activation, kernel_regularizer=l2(l2_reg_strength)),
        keras.layers.Dropout(dropout_ratio),
        keras.layers.Dense(1, activation='sigmoid')  # single sigmoid unit for binary classification
    ])
    model.compile(optimizer=adam_optimizer, loss='binary_crossentropy')

    if class_weight:
      # Inverse-frequency weights, scaled so both classes contribute equally to the loss.
      label_counts = y_train_res.value_counts()
      neg, pos = label_counts[0], label_counts[1]
      total = neg + pos
      class_weights = {0: (1 / neg) * (total / 2.0), 1: (1 / pos) * (total / 2.0)}
    else:
      class_weights = {0: 1, 1: 1}

    model.fit(X_train_scaled_res, y_train_res, batch_size=batch_size, epochs=epochs,
              shuffle=True, callbacks=[score_callback], verbose=0, class_weight=class_weights)

    # Best epoch of this fold; argmax is 0-based, epochs are reported 1-based.
    total_score += max(score_callback.scores)
    total_epochs += (np.argmax(score_callback.scores) + 1)

  cv_score = total_score / n_fold
  suggested_epochs = total_epochs / n_fold
  print(f"CV Score : {cv_score}")
  print(f"Suggested Number of Epochs :{suggested_epochs}")
  return cv_score, suggested_epochs


In [14]:
# Best hyperparameters found so far. These are read as globals by
# kfoldpipeline and final_fitting.
sampling_strategy = 0  # falsy value: kfoldpipeline skips SMOTE oversampling
class_weight = True  # True: kfoldpipeline trains with inverse-frequency class weights
initial_learning_rate = 0.001  # Adam learning rate
l2_reg_strength = 0.0001  # L2 penalty on the hidden-layer kernels
batch_size = 32 #Best is actually 32 but is too slow. NOTE(review): the value is already 32, so this remark may be stale - confirm
hidden_activation = "relu"
num_nodes = 64  # units in each hidden layer
dropout_ratio = 0  # passed to the Dropout layers in kfoldpipeline; 0 drops nothing
# TBA: number of epochs trained per CV fold; kfoldpipeline reports the best epoch it saw
epochs = 20
# 2 hidden layers gave the best CV score

In [None]:
# CV score by number of hidden layers (2 layers scored highest):
# 3 layers = 0.6782452356597547
# 1 layer  = 0.6785021020145741
# 2 layers = 0.6816

The CV score is around 0.67; the suggested number of epochs is 5 to 6.

In [10]:
class EarlyStoppingByAP(keras.callbacks.Callback):
    """Stop training when the combined validation score stops improving.

    After every epoch the model is scored on ``validation_data`` with
    0.5 * (average precision + ROC AUC). Training is stopped once the score
    has failed to improve for ``patience`` consecutive epochs.

    Args:
        validation_data: pair ``(X_val, y_val)`` used for scoring.
        patience: number of consecutive non-improving epochs tolerated before
            training is stopped.
        restore_best_weights: if True, the weights of the best-scoring epoch
            are put back on the model when training ends, whether it ended by
            early stopping or by reaching the epoch limit.
    """

    def __init__(self, validation_data, patience=10, restore_best_weights=True):
        super().__init__()
        self.validation_data = validation_data
        self.patience = patience
        self.restore_best_weights = restore_best_weights
        self.best_score = -1
        self.best_weights = None
        self.wait = 0

    def on_train_begin(self, logs=None):
        """Reset the tracking state so the same instance can be reused across fits."""
        self.best_score = -1
        self.best_weights = None
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        """Score the current weights and update the patience counter."""
        X_val, y_val = self.validation_data
        # verbose=0 avoids printing a prediction progress bar after every epoch.
        y_pred = self.model.predict(X_val, verbose=0)
        ap = average_precision_score(y_val, y_pred)
        roc = roc_auc_score(y_val, y_pred)
        score = 0.5 * (ap + roc)

        if score > self.best_score:
            self.best_score = score
            self.wait = 0
            if self.restore_best_weights:
                self.best_weights = self.model.get_weights()
        else:
            self.wait += 1
            if self.wait >= self.patience:
                print(f"Early stopping due to no improvement in the combined AP/ROC AUC score for {self.patience} epochs.")
                self.model.stop_training = True

    def on_train_end(self, logs=None):
        """Restore the best weights seen, if requested and available."""
        # Restoring here rather than only at the early-stop point also covers
        # runs that reach the epoch limit after their best epoch. The None
        # check covers the case where no epoch ever produced a usable score.
        if self.restore_best_weights and self.best_weights is not None:
            print("Restoring best weights.")
            self.model.set_weights(self.best_weights)

In [12]:
def final_fitting(num_epochs):
  """Train the final network on all training rows and score it on ``test_df``.

  Features are standardised with a scaler fitted on the training rows, the
  classes are weighted inversely to their frequency, and the same
  two-hidden-layer architecture as ``kfoldpipeline`` is trained with the
  ``EarlyStoppingByAP`` callback (patience 10). ROC AUC and average precision
  on ``test_df`` are printed, in that order.

  Reads these notebook globals: ``data``, ``test_df``,
  ``initial_learning_rate``, ``num_nodes``, ``hidden_activation``,
  ``l2_reg_strength``, ``dropout_ratio`` and ``batch_size``.

  Args:
    num_epochs: maximum number of training epochs; early stopping may end
      training sooner.

  Returns:
    The fitted ``keras.Sequential`` model, so it can be reused for prediction
    instead of being discarded.
  """
  feature_cols = [col for col in data.columns if col != 'label']
  X_train, y_train = data[feature_cols], data['label']
  X_test, y_test = test_df[feature_cols], test_df['label']

  # Inverse-frequency weights, scaled so both classes contribute equally to the loss.
  label_counts = y_train.value_counts()
  neg, pos = label_counts[0], label_counts[1]
  total = neg + pos
  class_weights = {0: (1 / neg) * (total / 2.0), 1: (1 / pos) * (total / 2.0)}

  # Fit the scaler on the training rows only, then apply it to the test rows.
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  # NOTE(review): early stopping monitors the same split whose metrics are
  # printed below, so those metrics are optimistically biased. Consider
  # monitoring a separate validation split instead.
  early_stopping = EarlyStoppingByAP(validation_data=(X_test_scaled, y_test), patience=10)
  adam_optimizer = Adam(learning_rate=initial_learning_rate)
  model = keras.Sequential([
      keras.layers.Input(shape=(X_train_scaled.shape[1],)),  # one input per feature column
      keras.layers.Dense(num_nodes, activation=hidden_activation, kernel_regularizer=l2(l2_reg_strength)),  # first hidden layer
      keras.layers.Dropout(dropout_ratio),  # kept so the architecture matches the one cross-validated in kfoldpipeline
      keras.layers.Dense(num_nodes, activation=hidden_activation, kernel_regularizer=l2(l2_reg_strength)),  # second hidden layer
      keras.layers.Dropout(dropout_ratio),
      keras.layers.Dense(1, activation='sigmoid')  # single sigmoid unit for binary classification
  ])
  model.compile(optimizer=adam_optimizer, loss='binary_crossentropy')
  model.fit(X_train_scaled, y_train, batch_size=batch_size, epochs=num_epochs, shuffle=True,
            class_weight=class_weights, callbacks=[early_stopping])

  y_pred = model.predict(X_test_scaled)
  print(roc_auc_score(y_test, y_pred))
  print(average_precision_score(y_test, y_pred))
  return model

In [18]:
# Fit the final model for at most 40 epochs (the early-stopping callback may end
# training sooner) and print the ROC AUC and average precision on test_df.
final_fitting(40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Early stopping due to no improvement in Average Precision for 10 epochs.
Restoring best weights.
0.9046375149783967
0.4490274048658146


In [19]:
# Combined score: mean of the ROC AUC and average precision printed by the
# final_fitting run above (the two values are copied by hand from that output).
0.5*(0.9046375149783967+0.4490274048658146)

0.6768324599221056

In [None]:
# Print the name of every column currently in train_df, one per line.
for i in train_df.columns:
  print(i)

weighted_mean_neg1
weighted_mean_0
weighted_mean_1
weighted_sd_neg1
weighted_sd_0
weighted_sd_1
mean_25_neg1
mean_25_0
mean_25_1
mean_50_neg1
mean_50_0
mean_50_1
mean_75_neg1
mean_75_0
mean_75_1
sd_25_neg1
sd_25_0
sd_25_1
sd_50_neg1
sd_50_0
sd_50_1
sd_75_neg1
sd_75_0
sd_75_1
5-mer-0_A
5-mer-0_C
5-mer-0_G
5-mer-0_T
5-mer-1_A
5-mer-1_G
5-mer-1_T
5-mer-2_A
5-mer-2_G
5-mer-5_A
5-mer-5_C
5-mer-5_T
5-mer-6_A
5-mer-6_C
5-mer-6_G
5-mer-6_T
5-mer_window-1_AAAAC
5-mer_window-1_AAGAC
5-mer_window-1_AGAAC
5-mer_window-1_AGGAC
5-mer_window-1_ATAAC
5-mer_window-1_ATGAC
5-mer_window-1_CAAAC
5-mer_window-1_CAGAC
5-mer_window-1_CGAAC
5-mer_window-1_CGGAC
5-mer_window-1_CTAAC
5-mer_window-1_CTGAC
5-mer_window-1_GAAAC
5-mer_window-1_GAGAC
5-mer_window-1_GGAAC
5-mer_window-1_GGGAC
5-mer_window-1_GTAAC
5-mer_window-1_GTGAC
5-mer_window-1_TAAAC
5-mer_window-1_TAGAC
5-mer_window-1_TGAAC
5-mer_window-1_TGGAC
5-mer_window-1_TTAAC
5-mer_window-1_TTGAC
5-mer_window0_AAACA
5-mer_window0_AAACC
5-mer_window0_AAACT
