In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd 'gdrive/MyDrive/Colab Notebooks/CHL/semester-project'
%ls

/content/gdrive/MyDrive/Colab Notebooks/CHL/semester-project
ANN-merged.ipynb  ANN-mut-complete.ipynb  [0m[01;34mdataset[0m/


## Libraries

In [3]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, f1_score
import numpy as np
import pandas as pd


In [4]:
np.random.seed(42)

## Data Processing

In [5]:
df = pd.read_csv("dataset/merged.csv", index_col=False)
df = df.drop("Unnamed: 0", axis=1)
# print(df.head())

# exit()

df = df.loc[:, ~df.columns.isin(['AccessionNumber', '1stpfs event', 'dpfs', 'dos'])]
# df = df[df["safety"]==1] #select safety analysis
# df = df.drop('PatientCode',axis=1) #drop one row with Nan value
# df = df[~df.isin([-99]).any(axis=1)] #drop any rows with -99 value
# # For now, remove all rows with Nan values
# # remove rows with NaN values, it's actually only one row
# df = df.dropna()

print(len(df))

296


In [6]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
# train, validation = train_test_split(train, teast_size=0.1, random_state=42)

In [7]:
# Separate the target variable from the features
y_train = train['os event']
X_train = train.drop('os event', axis=1)
# y_val = validation["os event"]
# X_val = validation.drop("os event", axis=1)
y_test = test['os event']
X_test = test.drop('os event', axis=1)

## Normalizing Data

In [8]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

## ANN Model

In [9]:
early_stopping_monitor = EarlyStopping(
    monitor="val_loss",
    verbose=1,
    patience=5,
    restore_best_weights=True
)

In [10]:
num_features = X_train.shape[1]

def create_model():
  model = tf.keras.Sequential()

  model.add(tf.keras.Input((num_features,)))
  model.add(tf.keras.layers.Dense(8))
  model.add(tf.keras.layers.Activation("relu"))
  model.add(tf.keras.layers.Dense(16))
  model.add(tf.keras.layers.Activation("relu"))
  model.add(tf.keras.layers.Dense(16))
  model.add(tf.keras.layers.Activation("relu"))
  model.add(tf.keras.layers.Dense(8))
  model.add(tf.keras.layers.Activation("relu"))
  model.add(tf.keras.layers.Dense(1))
  model.add(tf.keras.layers.Activation("sigmoid"))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

In [11]:
model = KerasClassifier(build_fn=create_model, verbose=1)
# define the grid search parameters
batch_size = [4, 8, 16, 32, 64]
epochs = [5, 7, 10, 12, 15, 17, 20]
param_grid = dict(batch_size=batch_size, epochs=epochs)


# model_history = model.fit(X_train, y_train, batch_size=32, epochs=20, validation_data=(X_val, y_val), callbacks=[early_stopping_monitor])


grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  model = KerasClassifier(build_fn=create_model, verbose=1)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Best: 0.720383 using {'batch_size': 4, 'epochs': 15}
0.660770 (0.041481) with: {'batch_size': 4, 'epochs': 5}
0.673429 (0.054205) with: {'batch_size': 4, 'epochs': 7}
0.664936 (0.059115) with: {'batch_size': 4, 'epochs': 10}
0.690414 (0.084671) with: {'batch_size': 4, 'epochs': 12}
0.720383 (0.047086) with: {'batch_size': 4, 'epochs': 15}
0.703235 (0.057785) with: {'batch_size': 4, 'epochs': 17}
0.703127 (0.067814) with: {'batch_size': 4, 'epochs': 20}
0.673429 (0.054205) with: {'batch_size': 8, 'epochs': 5}
0.677702 (0.048673) with: {'batch_size': 8, 'epochs': 7}
0.703343 (0.031973) with: {'batch_size': 8, 'epochs': 10}
0.690360 (0.064181) with: {'batch_size': 8, 'epochs': 12}
0.686141 (0.058861) with: {'batch_size': 8, 'epochs': 15}
0.715785 (0.070410) with: {'batch_size': 8, 'epochs': 17}
0.720167 (0.054686) with: 

In [12]:
# model.summary()
print("Tuned ANN Parameters: {}".format(grid_result.best_params_))
print("Best score on validation is {}".format(grid_result.best_score_))
grid_predictions = grid_result.predict(X_test)

Tuned ANN Parameters: {'batch_size': 4, 'epochs': 15}
Best score on validation is 0.7203829884529114


In [13]:
accuracy_score(y_test, grid_predictions)

0.7833333333333333

In [14]:
fpr, tpr, threshold = roc_curve(y_test, grid_predictions)

In [15]:
auc(fpr, tpr)

0.6709986320109438

In [16]:
f1_score(y_test, grid_predictions, zero_division=1)

0.8602150537634408