<a href="https://colab.research.google.com/github/alexander-harmaty/Breast-Cancer-Prognosis-Prediction/blob/main/Clinical_Data_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup

In [31]:
# Import tools and libraries
import torch
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Mount dataset folder from Google Drive to environment
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My\ Drive/Breast_Cancer_Datasets/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Breast_Cancer_Datasets


# Data Preprocessing
**data encoding is currently incomplete**

In [32]:
# Function to merge multi-index headers to single index headers
def merge_headers(col_tuple):

    # Unpack the tuple: first-level and second-level names
    first, second = col_tuple

    # If newline characters exist, remove and replace with space
    if isinstance(first, str):
        first = first.replace('\n', ' ').strip()
    if isinstance(second, str):
        second = second.replace('\n', ' ').strip()

    # If blank second-headers exist, return first-header only
    if not second or 'Unnamed' in second:
        return first
    # Otherwise, return merged header
    else:
        return f"{first} - {second}"

In [33]:
# Load clinical dataset with a multi-index header
file_path = 'Clinical_and_Other_Features.xlsx'
clinical_df = pd.read_excel(file_path, header=[1, 2])

# Drop the column at index 67. (col BP in .xlsx is a blank grouping divider)
col_to_drop = clinical_df.columns[67]
clinical_df = clinical_df.drop(columns=[col_to_drop])

# Merge multi-index headers for all columns
new_columns = [merge_headers(col) for col in clinical_df.columns]
clinical_df.columns = new_columns

# Test to check columns
print("\nList of columns with their indices:")
for idx, col in enumerate(clinical_df.columns.tolist()):
    print(idx, col)


List of columns with their indices:
0 Patient ID
1 Days to MRI (From the Date of Diagnosis)
2 Manufacturer - GE MEDICAL SYSTEMS=0, MPTronic software=1, SIEMENS=2
3 Manufacturer Model Name - Avanto=0, Optima MR450w=1, SIGNA EXCITE=2, SIGNA HDx=3, Signa HDxt=4, Skyra=5, Trio=6, TrioTim=7
4 Scan Options - FAST_GEMS\SAT_GEMS\ACC_GEMS\PFP\FS=0,FAST_GEMS\SAT_GEMS\MP_GEMS\ACC_GEMS\PFP\FS=1,FAST_GEMS\SAT_GEMS\MP_GEMS\PFP\FS=2,FAST_GEMS\SAT_GEMS\PFP\FS=3,FS=4,PFP\FS=5,PFP\SFS=6,SAT_GEMS\PFP\FS=7,SFS=8
5 Field Strength (Tesla) - 1.494=0,1.5=1,2.8936=2,3=3
6 Patient Position During MRI - FFP=0,HFP=1
7 Image Position of Patient
8 Contrast Agent - GADAVIST=0,MAGNEVIST=1,MMAGNEVIST=2,MULTIHANCE=3,Name of agent not stated(but ContrastBolusAgent tag was present)=4, ContrastBolusAgent Tag Absent = 5
9 Contrast Bolus Volume (mL) - 6=0,7=1,8=2,9=3,10=4,11=5,11.88=6,12=7,13=8,13.6=9,14=10,14.5=11,15=12,16=13,17=14,18=15,19=16,20=17,25=18
10 TR (Repetition Time)
11 TE (Echo Time)
12 Acquisition Matrix - 3

In [39]:
from sklearn.model_selection import train_test_split

# Specify the target variable "Recurrence event(s)"
target_col = clinical_df.columns[60]

# Separate features (X) and target (y)
X = clinical_df.drop(columns=[target_col])
y = clinical_df[target_col]

# First split: 70% training, 30% temporary (for validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)

# Second split: Split the temporary set into validation and test (each 15% of the total data)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

print("Target column name:", target_col)
print("Training set size:", X_train.shape, y_train.shape)
print("Validation set size:", X_val.shape, y_val.shape)
print("Test set size:", X_test.shape, y_test.shape)

Target column name: Recurrence event(s) - {0 = no, 1 = yes}
Training set size: (645, 96) (645,)
Validation set size: (138, 96) (138,)
Test set size: (139, 96) (139,)


# RNN Model Training Pipeline
**must complete encoding before training**

In [41]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam

X_train_seq = np.expand_dims(X_train.values, axis=1)  # shape: (645, 1, 96)
X_val_seq   = np.expand_dims(X_val.values, axis=1)      # shape: (138, 1, 96)
X_test_seq  = np.expand_dims(X_test.values, axis=1)     # shape: (139, 1, 96)

# Build a simple RNN model
model = Sequential()

# Add a SimpleRNN layer with 32 units
model.add(SimpleRNN(32, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2]), activation='relu'))
model.add(Dropout(0.2))

# Final Dense layer for binary classification (Recurrence event: 0 or 1)
model.add(Dense(1, activation='sigmoid'))

# Compile the model with binary crossentropy loss and the Adam optimizer.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary to verify the architecture
model.summary()

# Train the model
history = model.fit(
    X_train_seq,
    y_train,
    validation_data=(X_val_seq, y_val),
    epochs=20,
    batch_size=32
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_seq, y_test)
print("Test accuracy:", test_accuracy)


ValueError: Invalid dtype: object