In [7]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, balanced_accuracy_score
import numpy as np
import pickle
from sklearn.impute import KNNImputer
import imblearn

from utils.general_utils import get_outlier_info, get_outlier_val_counts

## TODO 
1. Stratified split
2. Handle class imbalance
3. Handle outliers

In [8]:
og_df = pd.read_csv('dataset/TrainDataset2023.csv')
og_df.rename(columns={'pCR (outcome)': 'pcr', 'RelapseFreeSurvival (outcome)': 'rfs'}, inplace=True)

In [14]:
columns_drop_path = 'dataset/dataset_v1/columns_dropped.pkl'

with open(columns_drop_path, 'rb') as f:
    to_drop_90 = pickle.load(f)

In [16]:
fs_df_v1 = og_df.drop(to_drop_90, axis=1)

In [17]:
impute_knn = KNNImputer(n_neighbors=3, missing_values=999)
imputed_arr = impute_knn.fit_transform(fs_df_v1.drop(columns=['ID']))
imputed_df = pd.DataFrame.from_records(imputed_arr, columns=fs_df_v1.columns[1:])

In [26]:
# imputed_df.to_csv('dataset/dataset_v1/imputed_df.csv')

In [24]:
imputed_df.head()

Unnamed: 0,pcr,rfs,Age,ER,PgR,ChemoGrade,Proliferation,LNStatus,TumourStage,original_shape_Elongation,...,original_glszm_HighGrayLevelZoneEmphasis,original_glszm_LargeAreaEmphasis,original_glszm_LargeAreaHighGrayLevelEmphasis,original_glszm_LargeAreaLowGrayLevelEmphasis,original_glszm_SizeZoneNonUniformityNormalized,original_glszm_SmallAreaEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_ngtdm_Busyness,original_ngtdm_Strength
0,1.0,144.0,41.0,0.0,0.0,3.0,3.0,1.0,2.0,0.813912,...,1.272727,4067578.818,15655220.0,1170667.0,0.1809,0.403535,3.325332,0.002314,473.464852,0.000758
1,0.0,142.0,39.0,1.0,1.0,3.0,3.0,1.0,2.0,0.666118,...,1.0375,2403756.075,9614769.0,601002.7,0.198125,0.444391,3.032144,0.005612,59.45971,0.003685
2,1.0,135.0,31.0,0.0,0.0,2.0,1.0,0.0,2.0,0.645083,...,1.040541,1561963.432,6247801.0,390504.0,0.275749,0.534549,2.485848,0.006752,33.935384,0.006447
3,0.0,12.0,35.0,0.0,0.0,3.0,3.0,1.0,3.0,0.770842,...,1.029703,7007670.723,28030630.0,1751932.0,0.253014,0.506185,2.606255,0.003755,46.859265,0.004543
4,0.0,109.0,61.0,1.0,0.0,2.0,1.0,0.0,2.0,0.861035,...,1.051724,1288913.69,5155465.0,322276.0,0.216409,0.462282,2.809279,0.006521,39.621023,0.005626


## Identified that below three columns had float values when the feature itself is categorical. So we round it to the closest integer
1. ChemoGrade
2. Proliferation
3. LNStatus

In [39]:
imputed_df['ChemoGrade'] = imputed_df['ChemoGrade'].round()
imputed_df['Proliferation'] = imputed_df['Proliferation'].round()
imputed_df['LNStatus'] = imputed_df['LNStatus'].round()

In [35]:
imputed_df['TumourStage'].value_counts()

TumourStage
2.0    183
3.0    104
4.0     84
1.0     29
Name: count, dtype: int64

In [42]:
imputed_df['ChemoGrade'].value_counts(), imputed_df['Proliferation'].value_counts(), imputed_df['TumourStage'].value_counts()

(ChemoGrade
 2.0    236
 3.0    162
 1.0      2
 Name: count, dtype: int64,
 Proliferation
 1.0    239
 2.0     93
 3.0     68
 Name: count, dtype: int64,
 TumourStage
 2.0    183
 3.0    104
 4.0     84
 1.0     29
 Name: count, dtype: int64)

# Converting categorical value to one hot encoding 
### Following are the categorical features which has more than 2 categories
1. ChemoGrade
2. Proliferation
3. TumourStage

In [46]:
imputed_df = pd.get_dummies(imputed_df, columns=['ChemoGrade', 'Proliferation', 'TumourStage'])

In [55]:
imputed_df.head()

Unnamed: 0,pcr,rfs,Age,ER,PgR,LNStatus,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,...,ChemoGrade_1.0,ChemoGrade_2.0,ChemoGrade_3.0,Proliferation_1.0,Proliferation_2.0,Proliferation_3.0,TumourStage_1.0,TumourStage_2.0,TumourStage_3.0,TumourStage_4.0
0,1.0,144.0,41.0,0.0,0.0,1.0,0.813912,0.72408,23.781937,32.84437,...,False,False,True,False,False,True,False,True,False,False
1,0.0,142.0,39.0,1.0,1.0,1.0,0.666118,0.476173,20.715461,43.504095,...,False,False,True,False,False,True,False,True,False,False
2,1.0,135.0,31.0,0.0,0.0,0.0,0.645083,0.59447,21.659822,36.435505,...,False,True,False,True,False,False,False,True,False,False
3,0.0,12.0,35.0,0.0,0.0,1.0,0.770842,0.501228,26.590504,53.050724,...,False,False,True,False,False,True,False,False,True,False
4,0.0,109.0,61.0,1.0,0.0,0.0,0.861035,0.750267,20.456571,27.265716,...,False,True,False,True,False,False,False,True,False,False


### Converting bool one hot encoding to integer one hot encoding

In [85]:
multi_category_variables = ['ChemoGrade_1.0', 'ChemoGrade_2.0', 'ChemoGrade_3.0', 'Proliferation_1.0', 'Proliferation_2.0',
       'Proliferation_3.0', 'TumourStage_1.0', 'TumourStage_2.0', 'TumourStage_3.0', 'TumourStage_4.0']

In [86]:
for column in multi_category_variables:
    imputed_df[column] = imputed_df[column].astype(np.float32)

In [87]:
imputed_df[multi_category_variables]

Unnamed: 0,ChemoGrade_1.0,ChemoGrade_2.0,ChemoGrade_3.0,Proliferation_1.0,Proliferation_2.0,Proliferation_3.0,TumourStage_1.0,TumourStage_2.0,TumourStage_3.0,TumourStage_4.0
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
395,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
396,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
397,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
398,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [88]:
train, test = train_test_split(imputed_df, test_size=0.20)


In [89]:
train.shape, test.shape

((320, 62), (80, 62))

## Separating categorical and continous data separately to process

In [90]:
multi_category_variables = ['ChemoGrade_1.0', 'ChemoGrade_2.0', 'ChemoGrade_3.0', 'Proliferation_1.0', 'Proliferation_2.0',
       'Proliferation_3.0', 'TumourStage_1.0', 'TumourStage_2.0', 'TumourStage_3.0', 'TumourStage_4.0']

categorical_features_labels = ['ER', 'PgR', 'LNStatus'] + multi_category_variables

continous_features_labels = imputed_df.drop(columns=categorical_features_labels+['pcr', 'rfs']).columns

In [91]:
len(categorical_features_labels), len(continous_features_labels)

(13, 47)

In [92]:
train_continous_feature_array = train[continous_features_labels].to_numpy()

test_continous_feature_array = test[continous_features_labels].to_numpy()

In [94]:
scalerTrainX = StandardScaler()

train_cont_X = scalerTrainX.fit_transform(train_continous_feature_array)
train_cat_X = train[categorical_features_labels].to_numpy()
trainX = np.hstack((train_cat_X, train_cont_X))
                   
trainY = train['pcr'].to_numpy().reshape(-1, 1)

                   
test_cont_X = scalerTrainX.transform(test_continous_feature_array)
test_cat_X = test[categorical_features_labels].to_numpy()
testX = np.hstack((test_cat_X, test_cont_X))
                   
testY = test['pcr'].to_numpy().reshape(-1, 1)


# START FROM HERE!

In [95]:
trainX.shape, trainY.shape, testX.shape, testY.shape

((320, 60), (320, 1), (80, 60), (80, 1))

In [97]:
pd.DataFrame.from_records(trainX)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,-0.603619,-0.090074,-0.115905,-0.069010,-0.783980,-0.258971,1.262702,0.489896,-0.022666,-0.249233
1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-0.189301,-0.092728,-0.122429,-0.069710,0.595605,1.101637,-0.882756,-0.177296,-0.160194,-0.037246
2,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,-0.307927,-0.093039,-0.123193,-0.069792,-0.558338,-0.399425,0.387735,0.078239,-0.149321,-0.168009
3,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.472081,-0.094169,-0.125973,-0.070090,-0.006815,0.651282,0.107351,1.602250,-0.134432,-0.202850
4,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,-0.575254,-0.079998,-0.091133,-0.066353,-0.870117,-0.373255,1.466050,-0.065036,0.064807,-0.254138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,-0.517921,-0.094526,-0.126848,-0.070184,-0.348418,0.251027,0.342948,3.212540,-0.136759,-0.202828
316,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,-0.421809,-0.094008,-0.125578,-0.070046,0.049025,0.427039,-0.435389,1.039791,-0.141803,-0.184616
317,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.284626,-0.092878,-0.122798,-0.069749,0.187853,0.795548,-0.007072,0.319211,-0.128887,-0.216517
318,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,3.298315,5.766145,14.282019,1.474688,1.815430,0.667098,-2.146073,-1.350039,-0.167051,8.702830


In [14]:
classification_y = imputed_arr[:,0]
regression_y = imputed_arr[:,1]
x_data =  imputed_arr[2:]

In [15]:
imputed_arr.shape, x_data.shape, regression_y.shape, classification_y.shape

((400, 55), (398, 55), (400,), (400,))

In [67]:
with open(f'dataset/dataset_v1/data.npy', 'wb') as f:
    np.save(f, imputed_arr)

In [16]:
train, test = train_test_split(imputed_arr, test_size=0.20)
train.shape, test.shape

((320, 55), (80, 55))

In [27]:
train_classification_y = train[:,0]
train_regression_y = train[:,1]

test_classification_y = test[:,0]
test_regression_y = test[:,1]

xTrain_data =  train[:,2:]
xTest_data =  test[:, 2:]

In [28]:
train_classification_y.shape, train_regression_y.shape, test_classification_y.shape, test_regression_y.shape, xTrain_data.shape, xTest_data.shape

((320,), (320,), (80,), (80,), (320, 53), (80, 53))

# Regression Task split and z-score normalization

In [29]:
scalerTrainX = StandardScaler()
scalerTrainY = StandardScaler()

trainX = scalerTrainX.fit_transform(xTrain_data)
trainY = scalerTrainY.fit_transform(train_regression_y.ravel())

testX = scalerTrainX.transform(xTest_data)
testY = scalerTrainY.transform(test_regression_y.ravel())

In [20]:
mean_absolute_error
balanced_accuracy_score

<function sklearn.metrics._classification.balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False)>

# Model training - dry run
### Random Forest

In [54]:
from sklearn.ensemble import RandomForestRegressor
from utils.general_utils import save_model

In [44]:
regressor = RandomForestRegressor(random_state=0, verbose=1)


In [45]:
regressor.fit(trainX, trainY.ravel())

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.5s


RandomForestRegressor(random_state=0, verbose=1)

In [46]:
predicted = regressor.predict(testX)
test_mse_error = mean_squared_error(predicted, testY)
test_mae_error = mean_absolute_error(predicted, testY)
test_r2_error = r2_score(predicted, testY)

print(f'MSE error: {test_mse_error}')
print(f'MAE error: {test_mae_error}')
print(f'R2 error: {test_r2_error}')

MSE error: 0.9765851073017249
MAE error: 0.7654654184592893
R2 error: -5.57654164850365


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [59]:
def save_normalizer(path, Xnormalizer, Ynormalizer):
    
    print('Caution, saving normalizer in order!.. Check arguments')
    
    normalilzer_list = [Xnormalizer, Ynormalizer]
    name_list = ['scalerTrainX', 'scalerTrainY']
    
    for nlz, name in zip(normalilzer_list, name_list):
        with open(f'{path}/{name}.pkl','wb') as f:
            pickle.dump(nlz, f)
        
def load_normalizer(path):
    
    print('Caution, loading normalizer in order!.. Check arguments')
    
    name_list = ['scalerTrainX', 'scalerTrainY']
    normalilzer_list = []
    
    for name in name_list:
        with open(f'{path}/{name}.pkl','rb') as f:
            normalilzer_list.append(pickle.load(f))
            
    return normalilzer_list[0], normalilzer_list[1]
        
def save_dataset(xTrain, yTrain, xTest, yTest, path):
    print('Caution, saving dataset in order!.. Check arguments')
    data_list = [xTrain, yTrain, xTest, yTest]
    data_name = ['xTrain', 'yTrain', 'xTest', 'yTest']
    
    for arr, name in zip(data_list, data_name):
        with open(f'{path}/{name}.npy', 'wb') as f:
            np.save(f,arr)

def load_dataset(path):
    
    print('Caution, loading dataset in order!.. Check arguments')
    
    data_name = ['xTrain', 'yTrain', 'xTest', 'yTest']      
    data_list = []
    for name in data_name:
        with open(f'{path}/{name}.npy', 'rb') as f:
            data_list.append(np.load(f))
            
    return data_list[0], data_list[1], data_list[2], data_list[3]
            

In [61]:
save_normalizer('baseline_model/normalizer/', scalerTrainX, scalerTrainY)
save_dataset(xTrain=trainX, yTrain=trainY, xTest=testX, yTest=testY, path='baseline_model/dataset/')
save_model(model=regressor, save_path='baseline_model/model/', name="randomforest")

Caution, saving things in order!.. Check arguments


In [None]:
# Make sure not changing the oridinal feature, binary feature 