In [52]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [53]:
%cd 'gdrive/MyDrive/Colab Notebooks/CHL/semester-project'
%ls

[Errno 2] No such file or directory: 'gdrive/MyDrive/Colab Notebooks/CHL/semester-project'
/content/gdrive/MyDrive/Colab Notebooks/CHL/semester-project
ANN-merged.ipynb  ANN-mut-complete.ipynb  [0m[01;34mdataset[0m/


## Libraries

In [54]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, f1_score
import numpy as np
import pandas as pd


In [55]:
np.random.seed(42)

## Data Processing

In [56]:
df = pd.read_csv("dataset/merged.csv", index_col=False)
df = df.drop("Unnamed: 0", axis=1)
# print(df.head())

# exit()

df = df.loc[:, ~df.columns.isin(['AccessionNumber', '1stpfs event', 'dpfs', 'dos'])]
# df = df[df["safety"]==1] #select safety analysis
# df = df.drop('PatientCode',axis=1) #drop one row with Nan value
# df = df[~df.isin([-99]).any(axis=1)] #drop any rows with -99 value
# # For now, remove all rows with Nan values
# # remove rows with NaN values, it's actually only one row
# df = df.dropna()

print(len(df))

296


In [57]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
# train, validation = train_test_split(train, teast_size=0.1, random_state=42)

In [58]:
# Separate the target variable from the features
y_train = train['os event']
X_train = train.drop('os event', axis=1)
# y_val = validation["os event"]
# X_val = validation.drop("os event", axis=1)
y_test = test['os event']
X_test = test.drop('os event', axis=1)

## Normalizing Data

In [59]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

## ANN Model

In [60]:
early_stopping_monitor = EarlyStopping(
    monitor="val_loss",
    verbose=1,
    patience=5,
    restore_best_weights=True
)

In [61]:
num_features = X_train.shape[1]

def create_model():
  model = tf.keras.Sequential()

  model.add(tf.keras.Input((num_features,)))
  model.add(tf.keras.layers.Dense(8))
  model.add(tf.keras.layers.Activation("relu"))
  model.add(tf.keras.layers.Dense(16))
  model.add(tf.keras.layers.Activation("relu"))
  model.add(tf.keras.layers.Dense(16))
  model.add(tf.keras.layers.Activation("relu"))
  model.add(tf.keras.layers.Dense(8))
  model.add(tf.keras.layers.Activation("relu"))
  model.add(tf.keras.layers.Dense(1))
  model.add(tf.keras.layers.Activation("sigmoid"))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

In [62]:
model = KerasClassifier(build_fn=create_model, verbose=1)
# define the grid search parameters
batch_size = [4, 8, 16, 32, 64]
epochs = [5, 7, 10, 12, 15, 17, 20]
param_grid = dict(batch_size=batch_size, epochs=epochs)


# model_history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_val, y_val), callbacks=[early_stopping_monitor])


grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  model = KerasClassifier(build_fn=create_model, verbose=1)


Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
Best: 0.745754 using {'batch_size': 4, 'epochs': 17}
0.669209 (0.059408) with: {'batch_size': 4, 'epochs': 5}
0.715947 (0.057751) with: {'batch_size': 4, 'epochs': 7}
0.690306 (0.074555) with: {'batch_size': 4, 'epochs': 10}
0.686195 (0.070091) with: {'batch_size': 4, 'epochs': 12}
0.716001 (0.049435) with: {'batch_size': 4, 'epochs': 15}
0.745754 (0.062031) with: {'batch_size': 4, 'epochs': 17}
0.711782 (0.063523) with: {'batch_size': 4, 'epochs': 20}
0.669209 (0.047408) with: {'batch_size': 8, 'epochs': 5}
0.673429 (0.054205) with: {'batch_size': 8, 'epochs': 7}
0.656443 (0.076369) with: {'batch_size': 8, 'epochs': 10}
0.690306 (0.074555) with: {'batch_size': 8, 'epochs': 12}
0.694796 (0.055373) with: {'batch_size': 8, 'epochs': 15}
0.694580 (0.069606) with: {'batch_size': 8, 'epochs': 17}
0.

In [63]:
# model.summary()
print("Tuned ANN Parameters: {}".format(grid_result.best_params_))
print("Best score on validation is {}".format(grid_result.best_score_))
train_grid_predictions = grid_result.predict(X_train)
grid_predictions = grid_result.predict(X_test)

Tuned ANN Parameters: {'batch_size': 4, 'epochs': 17}
Best score on validation is 0.745753546555837


In [64]:
acc = accuracy_score(y_train, train_grid_predictions)
fpr, tpr, threshold = roc_curve(y_train, train_grid_predictions)
auc_score = auc(fpr, tpr)
f1_score_train = f1_score(y_train, train_grid_predictions, zero_division=1)

print("Metrics on Training Set")
print("Accuracy: ", acc)
print("AUC score: ", auc_score)
print("F1 score: ", f1_score_train)

Metrics on Training Set
Accuracy:  0.7669491525423728
AUC score:  0.7165318957771787
F1 score:  0.8328267477203647


In [65]:
accuracy_score(y_test, grid_predictions)

0.7833333333333333

In [66]:
fpr, tpr, threshold = roc_curve(y_test, grid_predictions)

In [67]:
auc(fpr, tpr)

0.688782489740082

In [68]:
f1_score_test = f1_score(y_test, grid_predictions, zero_division=1)
f1_score_test

0.8571428571428572