# Data Preprocessing
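1. Load the raw data exported by curatedTCGAData.R.
2. Select clinical and biospecimen variables and combine them with RNA expression based on patient ID.
3. Remove columns in which the fraction of NA values exceeds 0.2.
4. Drop rows whose "status" or "time" value is NA, then drop constant columns.
5. Impute the remaining missing values with SimpleImputer (mean for numerical features, most frequent value for categorical features).
6. Encode categorical variables (dummy encoding).
7. Perform Min-Max normalization on the numerical features.
8. Output the data.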

In [50]:
# Mount Google Drive at /content/drive (Colab only); the CSV inputs read
# below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
import pandas as pd

# Locations of the two raw input tables on the mounted Google Drive.
COLDATA_CSV = '/content/drive/My Drive/3799/tumor_colData.csv'
RNA_CSV = '/content/drive/My Drive/3799/tumor_BRCA_RNASeq2GeneNorm-20160128.csv'

# Clinical / biospecimen table. low_memory=False makes pandas parse the file
# in a single pass instead of in chunks.
colData = pd.read_csv(COLDATA_CSV, low_memory=False)

# RNASeq2GeneNorm gene-expression table.
RNA = pd.read_csv(RNA_CSV)

In [52]:
# Remove the leftover "Unnamed: 0" column from the CSV.
# errors="ignore" keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
colData = colData.drop(columns="Unnamed: 0", errors="ignore")
colData.head(2)

Unnamed: 0,patientID,years_to_birth,vital_status,days_to_death,days_to_last_followup,tumor_tissue_site,pathologic_stage,pathology_T_stage,pathology_N_stage,pathology_M_stage,...,SigClust.Unsupervised.mRNA,SigClust.Intrinsic.mRNA,miRNA.Clusters,methylation.Clusters,RPPA.Clusters,CN.Clusters,Integrated.Clusters..with.PAM50.,Integrated.Clusters..no.exp.,Integrated.Clusters..unsup.exp.,X60.Gene.classifier.Class.Assignment
0,TCGA-A1-A0SB,70.0,0,,259.0,breast,stage i,t1c,n0,m0,...,,,3.0,5.0,,1.0,,,,
1,TCGA-A1-A0SD,59.0,0,,437.0,breast,stage iia,t2,n0,m0,...,-3.0,-9.0,6.0,1.0,,2.0,,,,


In [53]:
# Build the survival time column.
import numpy as np

# Rows with vital_status == 0 take their time from days_to_last_followup;
# every other row takes it from days_to_death.
# NOTE(review): presumably 0 means alive/censored -- confirm against the data dictionary.
uses_followup_time = colData['vital_status'] == 0
colData['time'] = np.where(
    uses_followup_time,
    colData['days_to_last_followup'],
    colData['days_to_death'],
)

In [54]:
# Select the relevant clinical and biospecimen columns.
# The columns are declared group by group and then concatenated, so the final
# order of `selected_cols` is: patient ID, survival, staging, demographics,
# lymph nodes, ER, PR, HER2, other status, treatment/diagnosis, tumor.

survival_cols = ['vital_status', 'time']

staging_cols = [
    'pathologic_stage',
    'pathology_T_stage',
    'pathology_N_stage',
    'pathology_M_stage',
]

demographic_cols = ['years_to_birth', 'gender', 'race', 'ethnicity']

lymph_node_cols = [
    'number_of_lymph_nodes',
    'patient.lymph_node_examined_count',
    'patient.axillary_lymph_node_stage_method_type',
    'patient.number_of_lymphnodes_positive_by_he',
]

er_status_cols = [
    'patient.breast_carcinoma_estrogen_receptor_status',
    'patient.er_level_cell_percentage_category',
]

pr_status_cols = [
    'patient.breast_carcinoma_progesterone_receptor_status',
    'patient.progesterone_receptor_level_cell_percent_category',
]

her_status_cols = [
    'patient.lab_proc_her2_neu_immunohistochemistry_receptor_status',
    'patient.her2_erbb_pos_finding_cell_percent_category',
]

other_status_cols = ['patient.margin_status', 'patient.menopause_status']

treatment_diagnosis_cols = [
    'radiation_therapy',
    'histological_type',
    'patient.anatomic_neoplasm_subdivisions.anatomic_neoplasm_subdivision',
    'patient.breast_carcinoma_surgical_procedure_name',
    'patient.history_of_neoadjuvant_treatment',
    'patient.initial_pathologic_diagnosis_method',
]

tumor_cols = [
    'patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_necrosis_percent',
    'patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_nuclei_percent',
    'patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_weight',
]

selected_cols = [
    'patientID',
    *survival_cols,
    *staging_cols,
    *demographic_cols,
    *lymph_node_cols,
    *er_status_cols,
    *pr_status_cols,
    *her_status_cols,
    *other_status_cols,
    *treatment_diagnosis_cols,
    *tumor_cols,
]

selected_data = colData[selected_cols]

In [55]:
# Rename the columns

def rename_columns(col):
    """Return the part of a column name after its last '.'.

    Names that contain no '.' are returned unchanged, e.g.
    'patient.margin_status' -> 'margin_status' and 'time' -> 'time'.
    """
    _prefix, _dot, last_part = col.rpartition('.')
    return last_part

# Explicit renames applied after the dotted prefixes have been stripped.
column_mapping = {'vital_status': 'status', 'years_to_birth': 'age'}

# Step 1: keep only the last dotted component of every column name.
# Step 2: apply the explicit renames from column_mapping.
selected_data = (
    selected_data
    .rename(columns=rename_columns)
    .rename(columns=column_mapping)
)

In [56]:
# Inspect the column names after renaming.
selected_data.columns

Index(['patientID', 'status', 'time', 'pathologic_stage', 'pathology_T_stage',
       'pathology_N_stage', 'pathology_M_stage', 'age', 'gender', 'race',
       'ethnicity', 'number_of_lymph_nodes', 'lymph_node_examined_count',
       'axillary_lymph_node_stage_method_type',
       'number_of_lymphnodes_positive_by_he',
       'breast_carcinoma_estrogen_receptor_status',
       'er_level_cell_percentage_category',
       'breast_carcinoma_progesterone_receptor_status',
       'progesterone_receptor_level_cell_percent_category',
       'lab_proc_her2_neu_immunohistochemistry_receptor_status',
       'her2_erbb_pos_finding_cell_percent_category', 'margin_status',
       'menopause_status', 'radiation_therapy', 'histological_type',
       'anatomic_neoplasm_subdivision',
       'breast_carcinoma_surgical_procedure_name',
       'history_of_neoadjuvant_treatment',
       'initial_pathologic_diagnosis_method', 'tumor_necrosis_percent',
       'tumor_nuclei_percent', 'tumor_weight'],
      dt

In [57]:
# (rows, columns) of the selected clinical table before NA filtering.
selected_data.shape

(1093, 32)

In [58]:
# Remove columns whose fraction of NA values exceeds the threshold.
import pandas as pd

# Maximum tolerated fraction of missing values per column.
threshold = 0.2

# Fraction of missing values in each column (mean of the boolean NA mask).
na_percentage = selected_data.isna().mean()

# Labels of the columns that are too sparse to keep.
columns_to_drop = na_percentage.index[na_percentage > threshold]

# New frame without the sparse columns; selected_data itself is left untouched.
selected_data_filtered = selected_data.drop(columns=columns_to_drop)

In [59]:
# (rows, columns) after dropping the sparse columns.
selected_data_filtered.shape

(1093, 29)

In [60]:
# Number of missing values remaining in each kept column.
selected_data_filtered.isnull().sum()

patientID                                                   0
status                                                      0
time                                                        1
pathologic_stage                                            8
pathology_T_stage                                           0
pathology_N_stage                                           0
pathology_M_stage                                           0
age                                                        15
gender                                                      0
race                                                       95
ethnicity                                                 174
number_of_lymph_nodes                                     168
lymph_node_examined_count                                 126
axillary_lymph_node_stage_method_type                     216
number_of_lymphnodes_positive_by_he                       168
breast_carcinoma_estrogen_receptor_status                  48
breast_c

In [61]:
# Reshape the expression table from genes x samples to samples x genes.
#
# The first CSV column (gene names) is moved into the index BEFORE transposing.
# Transposing with that string column still in the frame would give a
# mixed-dtype transpose, which pandas returns as object dtype for every
# column; this way the expression columns keep their original dtype.
#
# The guard keeps the cell re-runnable: once 'patientID' exists, RNA has
# already been reshaped and must not be transposed a second time.
if 'patientID' not in RNA.columns:
    gene_name_col = RNA.columns[0]

    # Remaining column labels are the sample identifiers (one per sample).
    col = RNA.columns[1:]

    # Genes become the columns; rows get a plain 0..n-1 index.
    RNA = RNA.set_index(gene_name_col).T.reset_index(drop=True)

    # Full sample identifier, plus its first 12 characters as the patient ID
    # used to join with the clinical table.
    RNA.insert(0, 'Hybridization REF', col)
    RNA.insert(1, 'patientID', RNA['Hybridization REF'].str[:12])

In [62]:
# Preview the reshaped expression table (one row per sample).
RNA.head(2)

Unnamed: 0,Hybridization REF,patientID,A1BG,A1CF,A2BP1,A2LD1,A2ML1,A2M,A4GALT,A4GNT,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR
0,TCGA-3C-AAAU-01A-11R-A41B-07,TCGA-3C-AAAU,197.0897,0.0,0.0,102.9634,1.3786,5798.3746,68.2424,8.6165,...,129.5917,1007.7824,1658.4983,258.4941,1208.3738,3507.2482,1894.9342,1180.4565,1.7233,0.0
1,TCGA-3C-AALI-01A-11R-A41B-07,TCGA-3C-AALI,237.3844,0.0,0.0,70.8646,4.3502,7571.9793,157.6944,0.5438,...,59.8151,448.6134,1343.1213,198.4774,603.5889,5504.6221,1318.6514,406.7428,926.5905,0.0


In [63]:
# Report which RNA columns, if any, contain missing values.
rna_na_mask = RNA.isna().any()
columns_with_na = RNA.columns[rna_na_mask]

if columns_with_na.empty:
    print("No columns with NA values.")
else:
    print("Columns with NA values:")
    print(columns_with_na)

No columns with NA values.


In [64]:
# Rows whose patientID already appeared in an earlier row (first occurrence is
# not flagged). Only patientID is compared, not the whole row.
is_repeat_patient = RNA.duplicated(subset='patientID')
duplicate_rows = RNA[is_repeat_patient]

if not duplicate_rows.empty:
    print("Duplicate values found in the DataFrame:")
    print(duplicate_rows)
else:
    print("No duplicate values found in the DataFrame.")

No duplicate values found in the DataFrame.


In [65]:
# Shapes of the clinical table and the reshaped expression table, in that order.
for frame in (colData, RNA):
    print(frame.shape)

(1093, 2685)
(1093, 20503)


In [66]:
# Merge the clinical table with the RNA (gene expression) table on patient ID.
# Only patients present in both tables are kept (inner join, the pandas default,
# now spelled out).
# validate='one_to_one' makes pandas raise MergeError if patientID is repeated
# on either side, instead of silently multiplying rows for that patient.
merged_df = pd.merge(
    selected_data_filtered,
    RNA,
    on='patientID',
    how='inner',
    validate='one_to_one',
)

In [67]:
# (rows, columns) of the merged clinical + expression table.
print(merged_df.shape)

(1093, 20531)


In [68]:
# Move 'Hybridization REF' to the front; all other columns keep their order.
cols = merged_df.columns.tolist()
ref_position = cols.index('Hybridization REF')
cols = [cols[ref_position]] + cols[:ref_position] + cols[ref_position + 1:]
merged_df = merged_df.loc[:, cols]

In [69]:
# Preview the merged table after moving 'Hybridization REF' to the front.
merged_df.head(2)

Unnamed: 0,Hybridization REF,patientID,status,time,pathologic_stage,pathology_T_stage,pathology_N_stage,pathology_M_stage,age,gender,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR
0,TCGA-A1-A0SB-01A-11R-A144-07,TCGA-A1-A0SB,0,259.0,stage i,t1c,n0,m0,70.0,female,...,95.9568,519.4279,1415.9252,19.3716,1364.5681,6186.7327,1931.2986,1436.1978,552.3144,0.0
1,TCGA-A1-A0SD-01A-11R-A115-07,TCGA-A1-A0SD,0,437.0,stage iia,t2,n0,m0,59.0,female,...,96.27,578.2814,1225.7051,33.0825,868.0837,3559.6725,1278.9678,1195.6,86.0144,0.0


In [70]:
# Shape check: reordering the columns must not change (rows, columns).
merged_df.shape

(1093, 20531)

In [71]:
# RNA[RNA['Hybridization REF'].str.contains('TCGA-A7-A0CE')]

# In the data previously downloaded from Firebrowse, this patient had 2 samples:
# tcga-a7-a0ce  TCGA-A7-A0CE-11A-21R-A089-07  normalized_count
# tcga-a7-a0ce  TCGA-A7-A0CE-01A-11R-A00Z-07  normalized_count

# The duplicate no longer occurs because this dataset was filtered to sample
# code 01 (Primary Solid Tumor) only; code 11 refers to Solid Tissue Normal.

In [72]:
# Keep only rows where both survival fields are present: a row is dropped as
# soon as either "status" or "time" is NA.
survival_fields = ['status', 'time']
merged_df_filtered = merged_df.dropna(axis=0, how='any', subset=survival_fields)

In [73]:
# Shapes before and after dropping rows with missing survival data.
for frame in (merged_df, merged_df_filtered):
    print(frame.shape)

(1093, 20531)
(1092, 20531)


In [74]:
# Drop columns that carry no information: at most one distinct value.
# nunique() ignores NA by default, so a column holding a single value plus
# NAs is also treated as constant.
distinct_counts = merged_df_filtered.nunique()
constant_cols = distinct_counts.index[distinct_counts <= 1]

# .copy() gives an independent frame for the following steps.
new_merged_df_filtered = merged_df_filtered.drop(columns=constant_cols).copy()

In [75]:
# Shapes before and after dropping the constant columns.
for frame in (merged_df_filtered, new_merged_df_filtered):
    print(frame.shape)

(1092, 20531)
(1092, 20247)


In [76]:
# As RNA data has no missing value, only data from colData need imputation.
#
# Column positions are looked up by NAME instead of being hard-coded. Fixed
# positions silently point at the wrong columns as soon as the NA threshold or
# the constant-column drop changes the layout. With the current layout the
# lookups give the same positions as before
# (categorical: 4-7, 9-11, 14, 16-26; numerical: 8, 12, 13, 15, 27-29).
categorical_feature_names = [
    'pathologic_stage', 'pathology_T_stage', 'pathology_N_stage',
    'pathology_M_stage', 'gender', 'race', 'ethnicity',
    'axillary_lymph_node_stage_method_type',
    'breast_carcinoma_estrogen_receptor_status',
    'breast_carcinoma_progesterone_receptor_status',
    'lab_proc_her2_neu_immunohistochemistry_receptor_status',
    'margin_status', 'menopause_status', 'radiation_therapy',
    'histological_type', 'anatomic_neoplasm_subdivision',
    'breast_carcinoma_surgical_procedure_name',
    'history_of_neoadjuvant_treatment',
    'initial_pathologic_diagnosis_method',
]
numerical_feature_names = [
    'age', 'number_of_lymph_nodes', 'lymph_node_examined_count',
    'number_of_lymphnodes_positive_by_he', 'tumor_necrosis_percent',
    'tumor_nuclei_percent', 'tumor_weight',
]

# Names that were removed by an earlier filtering step are skipped.
available_columns = new_merged_df_filtered.columns
categorical_features_idx = [
    available_columns.get_loc(name)
    for name in categorical_feature_names
    if name in available_columns
]
numerical_features_idx = [
    available_columns.get_loc(name)
    for name in numerical_feature_names
    if name in available_columns
]

In [77]:
# Imputation

from sklearn.impute import SimpleImputer

# Work on an independent copy. A plain assignment would only alias
# new_merged_df_filtered, so the imputation below would also overwrite the
# pre-imputation frame.
merged_df_imputed = new_merged_df_filtered.copy()

# Numerical clinical features: fill missing values with the column mean.
imputer = SimpleImputer(strategy='mean')
merged_df_imputed.iloc[:, numerical_features_idx] = imputer.fit_transform(
    merged_df_imputed.iloc[:, numerical_features_idx]
)

# Categorical clinical features: fill missing values with the most frequent value.
categorical_imputer = SimpleImputer(strategy='most_frequent')
merged_df_imputed.iloc[:, categorical_features_idx] = categorical_imputer.fit_transform(
    merged_df_imputed.iloc[:, categorical_features_idx]
)

In [78]:
# Confirm that imputation left no missing values in any column.
imputed_na_mask = merged_df_imputed.isna().any()
columns_with_na = merged_df_imputed.columns[imputed_na_mask]

if columns_with_na.empty:
    print("No columns with NA values.")
else:
    print("Columns with NA values:")
    print(columns_with_na)

No columns with NA values.


In [79]:
# (rows, columns) after imputation.
merged_df_imputed.shape

(1092, 20247)

In [80]:
# Update the numerical feature index: every non-categorical column position,
# minus the first four of them.
# NOTE(review): the [4:] presumably skips 'Hybridization REF', 'patientID',
# 'status' and 'time' at positions 0-3 -- confirm if the column layout changes.
categorical_positions = set(categorical_features_idx)
non_categorical_idx = [
    position
    for position in range(merged_df_imputed.shape[1])
    if position not in categorical_positions
]
numerical_features_idx = non_categorical_idx[4:]

In [81]:
# Reorder the frame into three contiguous groups:
# survival columns (positions 2-3), numerical features, categorical features.
y = merged_df_imputed.iloc[:, 2:4]
numerical_df = merged_df_imputed.iloc[:, numerical_features_idx]
categorical_df = merged_df_imputed.iloc[:, categorical_features_idx]

ordered_parts = [y, numerical_df, categorical_df]
reordered_df = pd.concat(ordered_parts, axis=1)

In [82]:
# (rows, columns) of the reordered frame.
reordered_df.shape

(1092, 20245)

In [83]:
# Position of 'pathologic_stage' in reordered_df; the cells below slice
# [2:categorical_start] as numerical and [categorical_start:] as categorical.
categorical_start = reordered_df.columns.get_loc('pathologic_stage')

In [84]:
# Survival data: the first two columns of reordered_df.
reordered_df.iloc[:,:2].columns

Index(['status', 'time'], dtype='object')

In [85]:
# Numerical features: columns from position 2 up to (not including) categorical_start.
reordered_df.iloc[:,2:categorical_start].columns

Index(['age', 'number_of_lymph_nodes', 'lymph_node_examined_count',
       'number_of_lymphnodes_positive_by_he', 'tumor_necrosis_percent',
       'tumor_nuclei_percent', 'tumor_weight', 'A1BG', 'A1CF', 'A2BP1',
       ...
       'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX', 'ZZEF1', 'ZZZ3',
       'psiTPTE22', 'tAKR'],
      dtype='object', length=20224)

In [86]:
# Categorical features: columns from categorical_start to the end.
reordered_df.iloc[:,categorical_start:].columns

Index(['pathologic_stage', 'pathology_T_stage', 'pathology_N_stage',
       'pathology_M_stage', 'gender', 'race', 'ethnicity',
       'axillary_lymph_node_stage_method_type',
       'breast_carcinoma_estrogen_receptor_status',
       'breast_carcinoma_progesterone_receptor_status',
       'lab_proc_her2_neu_immunohistochemistry_receptor_status',
       'margin_status', 'menopause_status', 'radiation_therapy',
       'histological_type', 'anatomic_neoplasm_subdivision',
       'breast_carcinoma_surgical_procedure_name',
       'history_of_neoadjuvant_treatment',
       'initial_pathologic_diagnosis_method'],
      dtype='object')

In [87]:
# Run once if the sksurv package is not installed:
# pip install scikit-survival

In [88]:
from sksurv.column import encode_categorical

# Dummy encoding, similar to one-hot encoding, but drops one category to avoid
# multicollinearity. Only the columns from categorical_start onwards are encoded.
categorical_columns = reordered_df.columns[categorical_start:]
encoded = encode_categorical(reordered_df, columns=categorical_columns)

In [89]:
# Preview the frame after dummy encoding.
encoded.head()

Unnamed: 0,status,time,age,number_of_lymph_nodes,lymph_node_examined_count,number_of_lymphnodes_positive_by_he,tumor_necrosis_percent,tumor_nuclei_percent,tumor_weight,A1BG,...,breast_carcinoma_surgical_procedure_name=modified radical mastectomy,breast_carcinoma_surgical_procedure_name=other,breast_carcinoma_surgical_procedure_name=simple mastectomy,history_of_neoadjuvant_treatment=yes,initial_pathologic_diagnosis_method=cytology (e.g. peritoneal or pleural fluid),initial_pathologic_diagnosis_method=excisional biopsy,initial_pathologic_diagnosis_method=fine needle aspiration biopsy,initial_pathologic_diagnosis_method=incisional biopsy,"initial_pathologic_diagnosis_method=other method, specify:",initial_pathologic_diagnosis_method=tumor resection
0,0,259.0,70.0,0.0,2.0,0.0,0.0,85.0,500.0,49.1992,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,437.0,59.0,0.0,3.0,0.0,0.0,70.0,500.0,142.2976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1321.0,56.0,0.0,8.0,0.0,0.0,90.0,500.0,192.8194,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1463.0,54.0,0.0,2.0,0.0,0.0,75.0,500.0,326.0194,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,434.0,61.0,1.0,11.0,1.0,0.0,90.0,500.0,168.8309,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
# (rows, columns) after dummy encoding.
encoded.shape

(1092, 20315)

In [91]:
# Perform Min-Max normalization on the numerical features.
from sklearn.preprocessing import MinMaxScaler

# Numerical block of `encoded`: everything between the two survival columns
# and the first categorical column.
# NOTE(review): categorical_start was computed on reordered_df; reusing it here
# assumes encode_categorical leaves the leading columns in place -- confirm.
numerical_features = encoded.iloc[:, 2:categorical_start]

scaler = MinMaxScaler()
numerical_features_scaled = scaler.fit_transform(numerical_features)

# Wrap the scaled array in a DataFrame that keeps BOTH the column names and the
# row index of `encoded`. Without index=..., the frame would get a fresh
# 0..n-1 index that no longer matches `encoded` (a row was dropped earlier),
# and a column-wise concat would misalign rows.
numerical_features_scaled_df = pd.DataFrame(
    numerical_features_scaled,
    columns=numerical_features.columns,
    index=numerical_features.index,
)

In [92]:
# Survival columns and the dummy-encoded categorical block, taken unchanged
# from `encoded`.
y = encoded.iloc[:, :2]
new_categorical_df = encoded.iloc[:, categorical_start:]

# Give the scaled numerical block the same row index as the other parts so
# the column-wise concat lines rows up correctly.
numerical_features_scaled_df = numerical_features_scaled_df.set_index(y.index)

# Final layout: survival | scaled numerical | encoded categorical.
scaled_parts = [y, numerical_features_scaled_df, new_categorical_df]
scaled_df = pd.concat(scaled_parts, axis=1)

In [93]:
# Preview the final frame with scaled numerical features.
scaled_df.head()

Unnamed: 0,status,time,age,number_of_lymph_nodes,lymph_node_examined_count,number_of_lymphnodes_positive_by_he,tumor_necrosis_percent,tumor_nuclei_percent,tumor_weight,A1BG,...,breast_carcinoma_surgical_procedure_name=modified radical mastectomy,breast_carcinoma_surgical_procedure_name=other,breast_carcinoma_surgical_procedure_name=simple mastectomy,history_of_neoadjuvant_treatment=yes,initial_pathologic_diagnosis_method=cytology (e.g. peritoneal or pleural fluid),initial_pathologic_diagnosis_method=excisional biopsy,initial_pathologic_diagnosis_method=fine needle aspiration biopsy,initial_pathologic_diagnosis_method=incisional biopsy,"initial_pathologic_diagnosis_method=other method, specify:",initial_pathologic_diagnosis_method=tumor resection
0,0,259.0,0.6875,0.0,0.045455,0.0,0.0,0.8125,0.158499,0.015338,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,437.0,0.515625,0.0,0.068182,0.0,0.0,0.625,0.158499,0.049253,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1321.0,0.46875,0.0,0.181818,0.0,0.0,0.875,0.158499,0.067657,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1463.0,0.4375,0.0,0.045455,0.0,0.0,0.6875,0.158499,0.11618,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,434.0,0.546875,0.028571,0.25,0.028571,0.0,0.875,0.158499,0.058918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
# Shape check: the three parts together must have as many columns as
# `encoded`, and so must the final frame.
for frame in (encoded, y, numerical_features_scaled_df, new_categorical_df, scaled_df):
    print(frame.shape)

(1092, 20315)
(1092, 2)
(1092, 20224)
(1092, 89)
(1092, 20315)


In [95]:
# Save the DataFrame to a CSV file.
# The export is currently disabled; uncomment the line below to write the output.
# scaled_df.to_csv('/content/drive/My Drive/3799/brca.csv', index=False)