##### Data Understanding

In [68]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [47]:
seed = 42

In [48]:
treatment = pd.read_csv('treatment.csv')
outcome = pd.read_csv('outcome.csv')

In [49]:
outcome.head()

Unnamed: 0,SUBJECTID,DataExtractDt,sstat,survDtD2 (tx),RFS,rfs_ind,PCR,RCBClass
0,1001,3/9/2009,8,1264,751,1,0.0,2.0
1,1002,3/9/2009,8,1155,1043,1,0.0,3.0
2,1003,3/9/2009,7,2387,2387,0,0.0,3.0
3,1004,3/9/2009,7,2436,2436,0,0.0,
4,1005,3/9/2009,7,2220,2520,0,0.0,


In [50]:
treatment.head()

Unnamed: 0,SUBJECTID,DataExtractDt,age,race_id,ERpos,PgRpos,HR Pos,Her2MostPos,HR_HER2_CATEGORY,HR_HER2_STATUS,BilateralCa,Laterality,MRI LD Baseline,MRI LD 1-3dAC,MRI LD InterReg,MRI LD PreSurg
0,1001,3/9/2009,3873,1,1.0,0.0,1.0,0.0,1.0,HRposHER2neg,0,1,88.0,78.0,30.0,14.0
1,1002,3/9/2009,3779,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,2,29.0,26.0,66.0,16.0
2,1003,3/9/2009,4983,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,1,50.0,64.0,54.0,46.0
3,1004,3/9/2009,4828,1,0.0,0.0,0.0,0.0,3.0,TripleNeg,0,1,91.0,90.0,99.0,43.0
4,1005,3/9/2009,4580,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,1,98.0,109.0,60.0,42.0


In [51]:
treatment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SUBJECTID         221 non-null    int64  
 1   DataExtractDt     221 non-null    object 
 2   age               221 non-null    object 
 3   race_id           221 non-null    int64  
 4   ERpos             219 non-null    float64
 5   PgRpos            219 non-null    float64
 6   HR Pos            219 non-null    float64
 7   Her2MostPos       216 non-null    float64
 8   HR_HER2_CATEGORY  216 non-null    float64
 9   HR_HER2_STATUS    216 non-null    object 
 10  BilateralCa       221 non-null    int64  
 11  Laterality        221 non-null    int64  
 12  MRI LD Baseline   219 non-null    float64
 13  MRI LD 1-3dAC     210 non-null    float64
 14  MRI LD InterReg   202 non-null    float64
 15  MRI LD PreSurg    208 non-null    float64
dtypes: float64(9), int64(4), object(3)
memory us

In [52]:
outcome.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SUBJECTID      221 non-null    int64  
 1   DataExtractDt  221 non-null    object 
 2   sstat          221 non-null    int64  
 3   survDtD2 (tx)  221 non-null    int64  
 4   RFS            221 non-null    int64  
 5   rfs_ind        221 non-null    int64  
 6   PCR            215 non-null    float64
 7   RCBClass       201 non-null    float64
dtypes: float64(2), int64(5), object(1)
memory usage: 13.9+ KB


##### Data Manipulation and Cleansing

In [53]:
# treatment = pd.read_excel('treatment_data.xlsx', sheet_name='TCIA Patient Clinical Subset')
# outcome = pd.read_excel('treatment_data.xlsx', sheet_name='TCIA Outcomes Subset')

In [54]:
completedf = pd.merge(outcome, treatment, on='SUBJECTID', how='outer')

In [55]:
completedf.shape

(221, 23)

In [56]:
completedf.isnull().sum().sort_values(ascending=False)

RCBClass            20
MRI LD InterReg     19
MRI LD PreSurg      13
MRI LD 1-3dAC       11
PCR                  6
HR_HER2_STATUS       5
HR_HER2_CATEGORY     5
Her2MostPos          5
PgRpos               2
MRI LD Baseline      2
HR Pos               2
ERpos                2
DataExtractDt_x      0
race_id              0
age                  0
DataExtractDt_y      0
rfs_ind              0
BilateralCa          0
Laterality           0
RFS                  0
survDtD2 (tx)        0
sstat                0
SUBJECTID            0
dtype: int64

In [57]:
# completedf = completedf.dropna()
completedf.drop(columns=['DataExtractDt_x','DataExtractDt_y','SUBJECTID'], inplace=True)

completedf

Unnamed: 0,sstat,survDtD2 (tx),RFS,rfs_ind,PCR,RCBClass,age,race_id,ERpos,PgRpos,HR Pos,Her2MostPos,HR_HER2_CATEGORY,HR_HER2_STATUS,BilateralCa,Laterality,MRI LD Baseline,MRI LD 1-3dAC,MRI LD InterReg,MRI LD PreSurg
0,8,1264,751,1,0.0,2.0,3873,1,1.0,0.0,1.0,0.0,1.0,HRposHER2neg,0,1,88.0,78.0,30.0,14.0
1,8,1155,1043,1,0.0,3.0,3779,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,2,29.0,26.0,66.0,16.0
2,7,2387,2387,0,0.0,3.0,4983,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,1,50.0,64.0,54.0,46.0
3,7,2436,2436,0,0.0,,4828,1,0.0,0.0,0.0,0.0,3.0,TripleNeg,0,1,91.0,90.0,99.0,43.0
4,7,2220,2520,0,0.0,,4580,1,1.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,1,98.0,109.0,60.0,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,7,1026,1026,0,1.0,0.0,6406,1,0.0,0.0,0.0,1.0,2.0,HER2pos,0,2,52.0,50.0,28.0,0.0
217,8,832,510,1,0.0,3.0,3861,3,0.0,1.0,1.0,0.0,1.0,HRposHER2neg,0,2,51.0,52.0,35.0,23.0
218,7,1031,1031,0,0.0,2.0,4646,1,1.0,1.0,1.0,1.0,2.0,HER2pos,0,2,60.0,60.0,37.0,36.0
219,7,1248,1248,0,0.0,,5905,3,1.0,1.0,1.0,1.0,2.0,HER2pos,0,2,48.0,,,


In [58]:
completedf.isna().sum()
completedf.dropna(inplace=True)

In [59]:
completedf['age'] = completedf['age'].str.replace(',', '.')
completedf['age'] = completedf['age'].astype('float64')

##### Pre-Processing

In [60]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline


In [61]:
X = completedf.drop(columns='sstat',axis=1)
Y = completedf['sstat']


In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.8, train_size=0.2, random_state=42)

In [63]:
#numerical columns processing pipeline
numeric_processor = Pipeline(steps = [('imputation_mean', SimpleImputer(missing_values=np.nan, strategy='mean')),('scaler', StandardScaler())])

#categorical columns processing pipeline
categorical_processor = Pipeline(steps = [('imputation_constant', SimpleImputer(fill_value='missing', strategy='constant')),('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [64]:
#combine processing techniques
numeric_columns = X.select_dtypes(include=['float','int']).columns.tolist()
categoric_columns = completedf.select_dtypes(include='object').columns.tolist()

preprocessor = ColumnTransformer(
    [('categorical', categorical_processor, categoric_columns),('numerical', numeric_processor, numeric_columns)]
)


#### Modeling and Evaluation

In [65]:

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,roc_auc_score

In [66]:
models ={
    # "LogisticRegression":LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNN":KNeighborsClassifier(),
    "SVM":SVC()
}


In [67]:
for name, clf in models.items():

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', clf)
    ])

    pipeline.fit(X_train, y_train)

    #prediction
    y_pred = pipeline.predict(X_test)

    #set performance
    model_accuracy = accuracy_score(y_test, y_pred)
    model_f1= f1_score(y_test, y_pred, average='weighted')
    model_precision = precision_score(y_test, y_pred,average='weighted', labels=[7, 8])
    model_recall = recall_score(y_test, y_pred, average='weighted',labels=[7, 8])
    # model_rocauc_score = roc_auc_score(y_test, y_pred)

    print(f"\n- {name} Performance")
    print("Accuracy:", model_accuracy)
    print("F1:", model_f1)
    print("Precision:", model_precision)
    print("Recall:", model_recall)


    # print(y_pred)
    # print(np.array(y_test))




- Decision Tree Performance
Accuracy: 0.9259259259259259
F1: 0.9177631065536671
Precision: 0.9243870545930043
Recall: 0.9398496240601504

- Random Forest Performance
Accuracy: 0.9333333333333333
F1: 0.9272141706924316
Precision: 0.9354636591478696
Recall: 0.9473684210526315

- KNN Performance
Accuracy: 0.9037037037037037
F1: 0.892544901860768
Precision: 0.9000817097579441
Recall: 0.9172932330827067

- SVM Performance
Accuracy: 0.9111111111111111
F1: 0.9016200470374748
Precision: 0.9076184108183343
Recall: 0.924812030075188
