In [10]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pickle
from sklearn.impute import KNNImputer
import imblearn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, balanced_accuracy_score
from scipy.stats import skew
from sklearn.manifold import TSNE # TSNE module
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import train_test_split


from utils.general_utils import get_outlier_info, get_outlier_val_counts, clean_outliers
from utils.saver_utils import save_normalizer, load_normalizer
from utils.saver_utils import save_dataset, load_dataset 



In [11]:
og_df = pd.read_csv('dataset/TrainDataset2023.csv')
og_df.rename(columns={'pCR (outcome)': 'pcr', 'RelapseFreeSurvival (outcome)': 'rfs'}, inplace=True)

In [4]:
int_col_mask = og_df.drop(columns=['ID']).dtypes == 'int64'
categorical_features = list(og_df.drop(columns=['ID']).columns[int_col_mask])

# Ignoring 'original_shape_VoxelVolume' because even though it is a int column it is not a categorical feature

categorical_features.remove('original_shape_VoxelVolume')
categorical_features

['pcr',
 'ER',
 'PgR',
 'HER2',
 'TrippleNegative',
 'ChemoGrade',
 'Proliferation',
 'HistologyType',
 'LNStatus',
 'TumourStage']

In [5]:
int_col_index = [i for i, x in enumerate(np.array(int_col_mask)) if x]
int_col_index

[0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 25]

In [6]:
missing_vals = np.sum(og_df == 999)
missing_cols_valuecounts = missing_vals[missing_vals > 0]
missing_cols_valuecounts

pcr                5
PgR                1
HER2               1
TrippleNegative    1
ChemoGrade         3
Proliferation      2
HistologyType      3
LNStatus           1
dtype: int64

In [7]:
impute_knn = KNNImputer(n_neighbors=3, missing_values=999)
imputed_arr = impute_knn.fit_transform(og_df.drop(columns=['ID']))


In [8]:
for idx in int_col_index:
    imputed_arr[:, idx] = np.rint(imputed_arr[:, idx])

imputed_df = pd.DataFrame.from_records(imputed_arr, columns=og_df.columns[1:])

# skew_processing_cols = imputed_df.drop(columns = set(categorical_features).union({'pcr', 'rfs'})).columns


In [9]:
multiclass_categorical_features = []
others = []

for cat_col_name in categorical_features:
    temp_len = len(imputed_df[cat_col_name].value_counts())
    if temp_len > 2:
        multiclass_categorical_features.append(cat_col_name)
    else:
        others.append(temp_len)

multiclass_categorical_features, others

(['ChemoGrade', 'Proliferation', 'TumourStage'], [2, 2, 2, 2, 2, 2, 2])

In [10]:
imputed_df = pd.get_dummies(imputed_df, columns=multiclass_categorical_features)

# Converting bool one hot encoding to integer one hot encoding

In [11]:
multi_category_variables = ['ChemoGrade_1.0', 'ChemoGrade_2.0', 'ChemoGrade_3.0', 'Proliferation_1.0', 'Proliferation_2.0',
       'Proliferation_3.0', 'TumourStage_1.0', 'TumourStage_2.0', 'TumourStage_3.0', 'TumourStage_4.0']

for column in multi_category_variables:
    imputed_df[column] = imputed_df[column].astype(np.float32)

In [12]:
nan_cols = imputed_df.columns[imputed_df.isnull().sum() > 0]

print(f'NaN Columns : {nan_cols}')
if len(nan_cols) > 0:
    imputed_df.drop(columns=nan_cols, inplace=True)

NaN Columns : Index([], dtype='object')


In [13]:
categorical_features_labels = categorical_features + multi_category_variables
temp_categorical_features_labels = categorical_features_labels

# Removing multiclass_categorical_features because these features are 
#converted to one-hot encoding

for col_name in multiclass_categorical_features:
    temp_categorical_features_labels.remove(col_name)

temp_categorical_features_labels.remove('pcr')

continous_features_labels = imputed_df.drop(columns=temp_categorical_features_labels+['pcr', 'rfs']).columns

In [14]:
train, test = train_test_split(imputed_df, test_size=0.20)

In [16]:
train_continous_feature_array = train[continous_features_labels].to_numpy()

test_continous_feature_array = test[continous_features_labels].to_numpy()

scalerTrainX = StandardScaler()
scalerTrainY = StandardScaler()
# scalerTrainX = RobustScaler()
# scalerTrainY = RobustScaler()

# scalerTrainX = MinMaxScaler()
# scalerTrainY = MinMaxScaler()


train_cont_X = scalerTrainX.fit_transform(train_continous_feature_array)
train_cat_X = train[temp_categorical_features_labels].to_numpy()
trainX = np.hstack((train_cat_X, train_cont_X))
                   
trainY = scalerTrainY.fit_transform(train['rfs'].to_numpy().reshape(-1, 1))
# trainY = train['rfs'].to_numpy().reshape(-1, 1)


                   
test_cont_X = scalerTrainX.transform(test_continous_feature_array)
test_cat_X = test[temp_categorical_features_labels].to_numpy()
testX = np.hstack((test_cat_X, test_cont_X))
                   
testY = scalerTrainY.transform(test['rfs'].to_numpy().reshape(-1, 1))
# testY = test['rfs'].to_numpy().reshape(-1, 1)

In [17]:

# Standardize the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(trainX, dtype=torch.float32)
y_train_tensor = torch.tensor(trainY, dtype=torch.float32).view(-1, 1)  # Ensure y is a column vector

X_test_tensor = torch.tensor(testX, dtype=torch.float32)
y_test_tensor = torch.tensor(testY, dtype=torch.float32).view(-1, 1)


In [7]:

# Define the neural network model
class SimpleANN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.3):
        super(SimpleANN, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.relu1 = nn.ReLU()
        
        self.fc2 = nn.Linear(32, 64)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        
        # self.fc3 = nn.Linear(64, 128)
        # self.relu3 = nn.ReLU()
        # self.dropout3 = nn.Dropout(dropout_rate)
        
        self.fc3 = nn.Linear(64, 32)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(dropout_rate)
        
        self.fc4 = nn.Linear(32, output_size)

    def forward(self, x):
        
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        
        x = self.fc4(x)
        
        return x



In [19]:
# Initialize the model
input_size = trainX.shape[1]
hidden_size = 64  # You can adjust this as needed
output_size = 1
model = SimpleANN(input_size, hidden_size, output_size, dropout_rate=0.5)

# Define L2 regularization strength
l2_lambda = 0.001
# l2_lambda = 0


# Define the loss function with L2 regularization
criterion = nn.MSELoss()
# criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=l2_lambda)

# Create DataLoader for batch training
batch_size = 32
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
num_epochs = 200
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # L2 regularization
        l2_reg = sum(torch.norm(param) for param in model.parameters())
        loss += l2_lambda * l2_reg

        loss.backward()
        optimizer.step()

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor)
    test_r2 = r2_score(y_test_tensor.numpy(), test_predictions.numpy())
    test_mae = mean_absolute_error(y_test_tensor.numpy(), test_predictions.numpy())

print("Test R2:", test_r2)
print("Test MAE:", test_mae)

Test R2: -0.1484821265388756
Test MAE: 0.8794056


In [20]:
# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_predictions = model(X_train_tensor)
    test_r2 = r2_score(y_train_tensor.numpy(), test_predictions.numpy())
    test_mae = mean_absolute_error(y_train_tensor.numpy(), test_predictions.numpy())

print("Train R2:", test_r2)
print("Train MAE:", test_mae)

Train R2: 0.690174789518419
Train MAE: 0.43967095


# With Dataset V2

In [2]:
datasetv2_save_path = "dataset/dataset_v2"
imputed_df = pd.read_csv(f'{datasetv2_save_path}/imputed_df.csv', index_col=0)
imputed_df.head()

Unnamed: 0,pcr,rfs,Age,ER,PgR,LNStatus,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,...,ChemoGrade_1.0,ChemoGrade_2.0,ChemoGrade_3.0,Proliferation_1.0,Proliferation_2.0,Proliferation_3.0,TumourStage_1.0,TumourStage_2.0,TumourStage_3.0,TumourStage_4.0
0,1.0,144.0,41.0,0.0,0.0,1.0,0.813912,0.72408,23.781937,32.84437,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,142.0,39.0,1.0,1.0,1.0,0.666118,0.476173,20.715461,43.504095,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,135.0,31.0,0.0,0.0,0.0,0.645083,0.59447,21.659822,36.435505,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,12.0,35.0,0.0,0.0,1.0,0.770842,0.501228,26.590504,53.050724,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,109.0,61.0,1.0,0.0,0.0,0.861035,0.750267,20.456571,27.265716,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [3]:
train, test = train_test_split(imputed_df, test_size=0.20)

multi_category_variables = ['ChemoGrade_1.0', 'ChemoGrade_2.0', 'ChemoGrade_3.0', 'Proliferation_1.0', 'Proliferation_2.0',
       'Proliferation_3.0', 'TumourStage_1.0', 'TumourStage_2.0', 'TumourStage_3.0', 'TumourStage_4.0']

categorical_features_labels = ['ER', 'PgR', 'LNStatus'] + multi_category_variables

continous_features_labels = imputed_df.drop(columns=categorical_features_labels+['pcr', 'rfs']).columns

In [4]:
train_continous_feature_array = train[continous_features_labels].to_numpy()

test_continous_feature_array = test[continous_features_labels].to_numpy()

In [5]:
# scalerTrainX = StandardScaler()
# scalerTrainY = StandardScaler()
# scalerTrainX = RobustScaler()
# scalerTrainY = RobustScaler()

scalerTrainX = MinMaxScaler()
scalerTrainY = MinMaxScaler()


train_cont_X = scalerTrainX.fit_transform(train_continous_feature_array)
train_cat_X = train[categorical_features_labels].to_numpy()
trainX = np.hstack((train_cat_X, train_cont_X))
                   
trainY = scalerTrainY.fit_transform(train['rfs'].to_numpy().reshape(-1, 1))

                   
test_cont_X = scalerTrainX.transform(test_continous_feature_array)
test_cat_X = test[categorical_features_labels].to_numpy()
testX = np.hstack((test_cat_X, test_cont_X))
                   
testY = scalerTrainY.transform(test['rfs'].to_numpy().reshape(-1, 1))


In [6]:

# Standardize the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.tensor(trainX, dtype=torch.float32)
y_train_tensor = torch.tensor(trainY, dtype=torch.float32).view(-1, 1)  # Ensure y is a column vector

X_test_tensor = torch.tensor(testX, dtype=torch.float32)
y_test_tensor = torch.tensor(testY, dtype=torch.float32).view(-1, 1)


In [None]:
trainX.shape, trainY.shape, testX.shape, testY.shape

In [8]:
# Initialize the model
input_size = trainX.shape[1]
hidden_size = 64  # You can adjust this as needed
output_size = 1
model = SimpleANN(input_size, hidden_size, output_size, dropout_rate=0.5)

# Define L2 regularization strength
l2_lambda = 0.001
# l2_lambda = 0


# Define the loss function with L2 regularization
criterion = nn.MSELoss()
# criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=l2_lambda)

# Create DataLoader for batch training
batch_size = 32
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
num_epochs = 200
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # L2 regularization
        l2_reg = sum(torch.norm(param) for param in model.parameters())
        loss += l2_lambda * l2_reg

        loss.backward()
        optimizer.step()

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor)
    test_r2 = r2_score(y_test_tensor.numpy(), test_predictions.numpy())
    test_mae = mean_absolute_error(y_test_tensor.numpy(), test_predictions.numpy())

print("Test R2:", test_r2)
print("Test MAE:", test_mae)

Test R2: 0.03278682452449455
Test MAE: 0.14895019


In [9]:
# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_predictions = model(X_train_tensor)
    test_r2 = r2_score(y_train_tensor.numpy(), test_predictions.numpy())
    test_mae = mean_absolute_error(y_train_tensor.numpy(), test_predictions.numpy())

print("Train R2:", test_r2)
print("Train MAE:", test_mae)

Train R2: 0.19732078520909635
Train MAE: 0.13220277
