In [5]:
# Library import
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Seed NumPy's legacy global random generator so that code drawing from it
# produces the same numbers on every fresh run of the notebook.
np.random.seed(seed=123)

In [23]:
# Load data
# Read the compiled table and remove the listed columns in one chained step,
# then report the resulting shape and preview the first five rows.
df = pd.read_csv("compiled_metabric_data.csv").drop(
    columns=[
        "Study_ID",
        "Patient_ID",
        "SAMPLE_ID",
        "Number_of_Samples_Per_Patient",
        "X3_Gene_classifier_subtype",
        "Sample_Type",
        "STUDY_ID.y.y",
        "STUDY_ID.x.x",
        "STUDY_ID.y",
        "Integrative_Cluster",
        "Cancer_Type",
        "Cohort",
        "Sex",
        "Patient_s_Vital_Status",
        "STUDY_ID.x",
        "Overall_Survival__Months_",
    ]
)
print(df.shape)
df.head()

(1980, 1115)


Unnamed: 0,Age_at_Diagnosis,Type_of_Breast_Surgery,Cancer_Type_Detailed,Cellularity,Chemotherapy,Pam50___Claudin_low_subtype,ER_status_measured_by_IHC,ER_Status,Neoplasm_Histologic_Grade,HER2_status_measured_by_SNP6,...,ZMYM3,ZNF217,ZNF24,ZNF331,ZNF384,ZNF521,ZNF703,ZNF750,ZNRF3,ZRSR2
0,75.65,MASTECTOMY,Breast Invasive Ductal Carcinoma,,NO,claudin-low,Positve,Positive,3.0,NEUTRAL,...,-1.2353,-0.9108,0.8264,1.4323,0.2993,0.257,-1.1906,0.9601,0.0602,-0.8611
1,43.19,BREAST CONSERVING,Breast Invasive Ductal Carcinoma,High,NO,LumA,Positve,Positive,3.0,NEUTRAL,...,0.4498,1.9404,-1.0392,1.3811,-0.7523,3.2136,2.3105,-1.2409,0.5651,0.0359
2,48.87,MASTECTOMY,Breast Invasive Ductal Carcinoma,High,YES,LumB,Positve,Positive,2.0,NEUTRAL,...,-1.4384,1.904,0.8654,1.9693,-1.8649,2.5206,1.7521,-1.4089,-0.9516,-1.9698
3,47.68,MASTECTOMY,Breast Mixed Ductal and Lobular Carcinoma,Moderate,YES,LumB,Positve,Positive,2.0,NEUTRAL,...,-1.3139,5.0115,0.9851,0.6683,-1.9425,3.2689,2.5035,0.0868,-1.6895,-1.0688
4,76.97,MASTECTOMY,Breast Mixed Ductal and Lobular Carcinoma,High,YES,LumB,Positve,Positive,3.0,NEUTRAL,...,-1.0842,1.0616,-0.753,0.2302,-1.8535,-0.5395,0.995,-0.576,0.7988,-0.4626


In [7]:
# Transforming binary data
# Every two-level clinical column is recoded to 0/1 from ITS OWN raw labels.
# Values not listed in a mapping (including NaN) are left unchanged by replace().
# "Positve" (sic) is the spelling that appears in ER_status_measured_by_IHC.
binary_encodings = {
    "Overall_Survival_Status": {"0:LIVING": 0, "1:DECEASED": 1},
    "Chemotherapy": {"NO": 0, "YES": 1},
    "ER_status_measured_by_IHC": {"Negative": 0, "Positve": 1},
    "ER_Status": {"Negative": 0, "Positive": 1},
    "HER2_Status": {"Negative": 0, "Positive": 1},
    "PR_Status": {"Negative": 0, "Positive": 1},
    "Hormone_Therapy": {"NO": 0, "YES": 1},
    # Fix: Radio_Therapy was previously overwritten with the recoded
    # Hormone_Therapy values; it is now recoded from its own column. Row counts
    # after the later NA filtering may therefore differ from earlier saved outputs.
    "Radio_Therapy": {"NO": 0, "YES": 1},
    "Inferred_Menopausal_State": {"Pre": 0, "Post": 1},
    "Relapse_Free_Status": {"0:Not Recurred": 0, "1:Recurred": 1},
}

# Work on a copy so the loaded frame `df` stays untouched and this cell can be re-run.
clean_df = df.copy()
for column_name, encoding in binary_encodings.items():
    clean_df[column_name] = clean_df[column_name].replace(encoding)

# NOTE(review): a missing Mutation_Count is treated as zero mutations —
# confirm that "missing" really means "none" for this dataset.
clean_df["Mutation_Count"] = clean_df["Mutation_Count"].fillna(0)

clean_df.iloc[:, :10].head(3)

Unnamed: 0,Age_at_Diagnosis,Type_of_Breast_Surgery,Cancer_Type_Detailed,Cellularity,Chemotherapy,Pam50___Claudin_low_subtype,ER_status_measured_by_IHC,ER_Status,Neoplasm_Histologic_Grade,HER2_status_measured_by_SNP6
0,75.65,MASTECTOMY,Breast Invasive Ductal Carcinoma,,0,claudin-low,1.0,1,3.0,NEUTRAL
1,43.19,BREAST CONSERVING,Breast Invasive Ductal Carcinoma,High,0,LumA,1.0,1,3.0,NEUTRAL
2,48.87,MASTECTOMY,Breast Invasive Ductal Carcinoma,High,1,LumB,1.0,1,2.0,NEUTRAL


In [8]:
# Preview the first three rows of columns 11 through 19 to check the recoded values.
clean_df.iloc[:3, 11:20]

Unnamed: 0,Tumor_Other_Histologic_Subtype,Hormone_Therapy,Inferred_Menopausal_State,Primary_Tumor_Laterality,Lymph_nodes_examined_positive,Mutation_Count,Nottingham_prognostic_index,Oncotree_Code,Overall_Survival_Status
0,Ductal/NST,1,1,Right,10.0,0.0,6.044,IDC,0
1,Ductal/NST,1,0,Right,0.0,2.0,4.02,IDC,0
2,Ductal/NST,1,0,Right,1.0,2.0,4.03,IDC,1


In [9]:
# Preview the first three rows of columns 21 through 29 to check the recoded values.
clean_df.iloc[:3, 21:30]

Unnamed: 0,Radio_Therapy,Relapse_Free_Status__Months_,Relapse_Free_Status,TMB__nonsynonymous_,Tumor_Size,Tumor_Stage,ABI1,ABL1,ABL2
0,1,138.65,0.0,0.0,22.0,2.0,-0.6831,0.4551,6.2356
1,1,83.52,0.0,2.615035,10.0,1.0,0.4743,0.3933,1.6754
2,1,151.28,1.0,2.615035,15.0,2.0,-0.0267,-0.9879,-0.2011


In [10]:
# Dropping individuals with NA in the clinical features
clean_df2 = clean_df.dropna(subset = clean_df.columns.values[0:27])
clean_df2.shape

(1263, 1115)

In [11]:
# Find genes that are entirely missing
removed_genes = list(clean_df2.isna().sum()[clean_df2.isna().sum() == 1263].index)
removed_genes

['CRTC3',
 'H2AC16',
 'H3C14',
 'H3C6',
 'H3P6',
 'IGH',
 'IGK',
 'IGL',
 'MDS2',
 'MEF2B',
 'MTCP1',
 'MYBL1',
 'NUTM2B',
 'PCLO',
 'PDCD11',
 'RNF217.AS1',
 'SDHAF2',
 'TMSB4XP8',
 'TRA',
 'TRB',
 'TRD',
 'TRG']

In [12]:
# Remove the gene columns that contain no data at all.
# errors="ignore" keeps this cell safe to re-run once the columns are already gone
# (a plain drop would raise KeyError on the second execution).
clean_df2 = clean_df2.drop(columns=removed_genes, errors="ignore")
clean_df2.shape
# In the recorded run the cleaned dataset has 1263 samples and 1,093 columns:
# the Overall_Survival_Status target plus 1,092 predictors (before one-hot encoding).

(1263, 1093)

In [13]:
# Get categorical columns
cols = clean_df2.columns
num_cols = clean_df2._get_numeric_data().columns
cat_cols = list(set(cols) - set(num_cols))
cat_cols

['Pam50___Claudin_low_subtype',
 'Cancer_Type_Detailed',
 'Oncotree_Code',
 'Cellularity',
 'Primary_Tumor_Laterality',
 'Type_of_Breast_Surgery',
 'Tumor_Other_Histologic_Subtype',
 'HER2_status_measured_by_SNP6']

In [14]:
# Dummy variables for categorical columns
clean_df3 = pd.get_dummies(clean_df2, columns= cat_cols)
clean_df3

Unnamed: 0,Age_at_Diagnosis,Chemotherapy,ER_status_measured_by_IHC,ER_Status,Neoplasm_Histologic_Grade,HER2_Status,Hormone_Therapy,Inferred_Menopausal_State,Lymph_nodes_examined_positive,Mutation_Count,...,Tumor_Other_Histologic_Subtype_Lobular,Tumor_Other_Histologic_Subtype_Medullary,Tumor_Other_Histologic_Subtype_Mixed,Tumor_Other_Histologic_Subtype_Mucinous,Tumor_Other_Histologic_Subtype_Other,Tumor_Other_Histologic_Subtype_Tubular/ cribriform,HER2_status_measured_by_SNP6_GAIN,HER2_status_measured_by_SNP6_LOSS,HER2_status_measured_by_SNP6_NEUTRAL,HER2_status_measured_by_SNP6_UNDEF
1,43.19,0,1.0,1,3.0,0,1,0,0.0,2.0,...,0,0,0,0,0,0,0,0,1,0
2,48.87,1,1.0,1,2.0,0,1,0,1.0,2.0,...,0,0,0,0,0,0,0,0,1,0
3,47.68,1,1.0,1,2.0,0,1,0,3.0,1.0,...,0,0,1,0,0,0,0,0,1,0
4,76.97,1,1.0,1,3.0,0,1,1,8.0,2.0,...,0,0,1,0,0,0,0,0,1,0
5,78.77,0,1.0,1,3.0,0,1,1,0.0,4.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1693,70.65,0,1.0,1,1.0,0,0,1,0.0,9.0,...,0,0,1,0,0,0,0,0,1,0
1695,75.62,0,0.0,0,3.0,0,0,1,0.0,4.0,...,0,0,0,0,0,0,1,0,0,0
1697,52.84,1,1.0,1,2.0,0,0,1,6.0,5.0,...,0,0,1,0,0,0,0,0,1,0
1702,48.59,1,0.0,0,3.0,0,0,0,1.0,6.0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
# Feature matrix: every column of the encoded frame except the survival target.
# NOTE(review): Relapse_Free_Status and Relapse_Free_Status__Months_ remain in X —
# confirm they are meant to be used as predictors of survival status.
X = clean_df3.drop(columns="Overall_Survival_Status")
X.shape

(1263, 1119)

In [16]:
# Target vector: the 0/1 overall survival status column.
Y = clean_df3.loc[:, "Overall_Survival_Status"]
Y.shape

(1263,)

In [17]:
# Training/Testing split
X_train, X_rem, y_train, y_rem = train_test_split(X,Y, train_size=0.8)

# Training/Validation split
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

In [18]:
# Report the shape of each split (features first, then target).
# Printing in a loop avoids building a tuple of print() results, which the
# notebook would otherwise echo as a stray "(None, None)" output.
for features, target in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test)):
    print(features.shape)
    print(target.shape)

(1010, 1119)
(1010,)
(126, 1119)
(126,)
(127, 1119)
(127,)


(None, None)

In [22]:
# Persist the cleaned frame as it is before one-hot encoding (clean_df2, not
# clean_df3); the row index is not written.
clean_df2.to_csv(path_or_buf="cleaned_data.csv", index=False)

In [36]:
# Write every split to ./Split_Data/<name>.csv without the row index.
# The folder is created first so the cell also works on a fresh checkout
# where it does not exist yet.
split_dir = Path("./Split_Data")
split_dir.mkdir(parents=True, exist_ok=True)

splits_to_save = {
    "X_train": X_train,
    "y_train": y_train,
    "X_valid": X_valid,
    "y_valid": y_valid,
    "X_test": X_test,
    "y_test": y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(split_dir / f"{split_name}.csv", index=False)