In [1]:
from google.colab import drive

# Attach Google Drive to the Colab VM so the project files are reachable.
GDRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [2]:
%cd 'gdrive/MyDrive/Colab Notebooks/CHL/semester-project'
%ls

/content/gdrive/MyDrive/Colab Notebooks/CHL/semester-project
ANN.ipynb  dataset/


## Libraries


In [3]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd


In [4]:
# Seed both NumPy and TensorFlow: Keras draws its initial weights from
# TensorFlow's generator, so seeding NumPy alone does not make training
# repeatable.
# NOTE(review): fits dispatched to worker processes (GridSearchCV with
# n_jobs=-1) may not inherit these seeds — confirm if exact CV scores matter.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Data Processing

In [5]:
# Columns that are removed from the frame before modelling.
EXCLUDED_COLUMNS = ['AccessionNumber', '1stpfs event', 'dpfs', 'dos']

# Read the CSV and discard the leftover "Unnamed: 0" column.
mutations_raw = pd.read_csv("dataset/mutationsComplete.csv", index_col=False)
mutations_no_index = mutations_raw.drop(columns="Unnamed: 0")

# Keep every column that is not in the exclusion list.
keep_mask = ~mutations_no_index.columns.isin(EXCLUDED_COLUMNS)
df = mutations_no_index.loc[:, keep_mask]

# Earlier drafts also restricted rows to safety == 1, dropped 'PatientCode',
# removed rows containing -99 and dropped rows with NaN values; those steps
# are currently disabled.

print(len(df))

296


In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical between runs. No separate validation split is made.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [7]:
# For each partition, pull out the 'os event' label column and keep the
# remaining columns as the feature matrix.
X_train, y_train = train.drop(columns='os event'), train['os event']
X_test, y_test = test.drop(columns='os event'), test['os event']

## Normalizing Data

In [8]:
# Learn the per-feature min/max from the training data only, then apply the
# same scaling to both partitions so no test statistics leak into the fit.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## ANN Model

In [9]:
# Callback that watches "val_loss" with a patience of 5 epochs and restores
# the best weights seen.
# NOTE(review): this callback is not passed to any active fit call in the
# visible cells — confirm whether it is still needed.
early_stopping_monitor = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [10]:
# Width of the input layer, taken from the scaled training matrix.
num_features = X_train.shape[1]


def create_model(hidden_units=(8, 16, 16, 8), optimizer='adam'):
    """Build and compile the binary-classification network.

    The network is a stack of Dense + ReLU blocks followed by a single
    sigmoid output unit, compiled with binary cross-entropy loss and an
    accuracy metric. Calling it with no arguments yields the same
    8-16-16-8 architecture and 'adam' optimizer as before.

    Args:
        hidden_units: Iterable with the number of units of each hidden
            Dense layer, in order from input to output.
        optimizer: Optimizer name or instance forwarded to ``compile``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.Input((num_features,)))

    # One Dense + ReLU pair per requested hidden layer.
    for units in hidden_units:
        model.add(tf.keras.layers.Dense(units))
        model.add(tf.keras.layers.Activation("relu"))

    # Single sigmoid unit for the binary output.
    model.add(tf.keras.layers.Dense(1))
    model.add(tf.keras.layers.Activation("sigmoid"))

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

In [11]:
# Wrap the Keras builder so scikit-learn can drive it as an estimator.
model = KerasClassifier(build_fn=create_model, verbose=1)

# Hyperparameter grid: every batch size is combined with every epoch budget.
batch_size = [4, 8, 16, 32, 64]
epochs = [5, 7, 10, 12, 15, 17, 20]
param_grid = {"batch_size": batch_size, "epochs": epochs}

# Exhaustive search over the grid with cv=3 and n_jobs=-1. No validation
# split exists in this notebook, so the fits run for the full epoch budget.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# Report the winning combination, then the mean (std) score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

  model = KerasClassifier(build_fn=create_model, verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Best: 0.677648 using {'batch_size': 32, 'epochs': 5}
0.656551 (0.056271) with: {'batch_size': 4, 'epochs': 5}
0.580493 (0.062064) with: {'batch_size': 4, 'epochs': 7}
0.618793 (0.052702) with: {'batch_size': 4, 'epochs': 10}
0.588986 (0.036178) with: {'batch_size': 4, 'epochs': 12}
0.618522 (0.022824) with: {'batch_size': 4, 'epochs': 15}
0.601861 (0.034558) with: {'batch_size': 4, 'epochs': 17}
0.593476 (0.045416) with: {'batch_size': 4, 'epochs': 20}
0.669209 (0.049610) with: {'batch_size': 8, 'epochs': 5}
0.664990 (0.045335) with: {'batch_size': 8, 'epochs': 7}
0.618630 (0.020798) with: {'batch_size': 8, 'epochs': 10}
0.639619 (0.038524) with: {'batch_size': 8, 'epochs': 12}
0.601807 (0.024265) with: {'batch_size': 8, 'epochs': 15}
0.601699 (0.056848) with: {'batch_size': 8, 'epochs': 17}
0.631234 (0.020273) with: {'batch_size': 8, 'epochs': 20}
0.673429 (0.054205) with: {'batch_size': 16, 'epochs': 5}
0.656551 (0.039549) with: {'bat

In [12]:
# Report the hyperparameters chosen by the grid search and their CV score.
best_params = grid_result.best_params_
best_score = grid_result.best_score_
print("Tuned ANN Parameters: {}".format(best_params))
print("Best score on validation is {}".format(best_score))

# Predict labels for the held-out test features via the fitted search object.
grid_predictions = grid_result.predict(X_test)

Tuned ANN Parameters: {'batch_size': 32, 'epochs': 5}
Best score on validation is 0.6776479482650757


In [13]:
# Fraction of held-out samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=grid_predictions)

0.7166666666666667