# Data Preprocessing
1. Load raw data obtained using curatedTCGAData.R.
2. Remove cases with incomplete survival data
3. Select clinical and biospecimen data, combine with RNA expression based on patient ID.
4. Remove columns with NA values > 0.2.
5. Remove columns with constant values.
6. Imputation using KNN Imputer.
7. Encode categorical variables.
8. Perform Min-Max normalization.
9. Output data.

## Load raw data

In [63]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [64]:
import pandas as pd

# Read the colData file
colData = pd.read_csv('/content/drive/My Drive/3799/tumor_colData.csv',low_memory=False)

# Read the RNASeq2GeneNorm-2 file
RNA = pd.read_csv('/content/drive/My Drive/3799/tumor_BRCA_RNASeq2GeneNorm-20160128.csv')

In [65]:
colData = colData.drop("Unnamed: 0", axis=1)
colData.head(2)

Unnamed: 0,patientID,years_to_birth,vital_status,days_to_death,days_to_last_followup,tumor_tissue_site,pathologic_stage,pathology_T_stage,pathology_N_stage,pathology_M_stage,...,SigClust.Unsupervised.mRNA,SigClust.Intrinsic.mRNA,miRNA.Clusters,methylation.Clusters,RPPA.Clusters,CN.Clusters,Integrated.Clusters..with.PAM50.,Integrated.Clusters..no.exp.,Integrated.Clusters..unsup.exp.,X60.Gene.classifier.Class.Assignment
0,TCGA-A1-A0SB,70.0,0,,259.0,breast,stage i,t1c,n0,m0,...,,,3.0,5.0,,1.0,,,,
1,TCGA-A1-A0SD,59.0,0,,437.0,breast,stage iia,t2,n0,m0,...,-3.0,-9.0,6.0,1.0,,2.0,,,,


In [66]:
# prepare survival data
import numpy as np
colData['time'] = np.where(colData['vital_status'] == 0, colData['days_to_last_followup'], colData['days_to_death'])

In [67]:
# remove cases with incomplete survival data
colData_new = colData[colData['time'] > 0]

In [68]:
print(colData.shape)
print(colData_new.shape)

(1093, 2685)
(1079, 2685)


In [69]:
# select relevant clinical and biospecimen data

selected_cols = [  'patientID',
            # Survival data
            'vital_status', 'time',

            # staging parameters
            'pathologic_stage', 'pathology_T_stage', 'pathology_N_stage', 'pathology_M_stage',

            # Demographical variables
            'years_to_birth', 'gender', 'race', 'ethnicity',

            # Lymph_node related
            'number_of_lymph_nodes', 'patient.lymph_node_examined_count',
            'patient.axillary_lymph_node_stage_method_type',
            'patient.number_of_lymphnodes_positive_by_he',

            # ER_Status
            'patient.breast_carcinoma_estrogen_receptor_status',
            'patient.er_level_cell_percentage_category',

            # PR_Status
            'patient.breast_carcinoma_progesterone_receptor_status',
            'patient.progesterone_receptor_level_cell_percent_category',

            # HER2_Status
            'patient.lab_proc_her2_neu_immunohistochemistry_receptor_status',
            'patient.her2_erbb_pos_finding_cell_percent_category',

            # Other_status
            'patient.margin_status', 'patient.menopause_status',

            # Variables related to treatment and diagnosis
            'radiation_therapy', 'histological_type',
            'patient.anatomic_neoplasm_subdivisions.anatomic_neoplasm_subdivision',
            'patient.breast_carcinoma_surgical_procedure_name',
            'patient.history_of_neoadjuvant_treatment',
            'patient.initial_pathologic_diagnosis_method',

            # Tumor related
            'patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_necrosis_percent',
            'patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_nuclei_percent',
            'patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_weight',

            #'PAM50.mRNA'
            ]

selected_data = colData_new[selected_cols]

In [70]:
# Rename the columns

def rename_columns(col):
    return col.split('.')[-1]

selected_data.columns = selected_data.columns.map(rename_columns)

column_mapping = {
    'vital_status': 'status',
    'years_to_birth':'age',
    'breast_carcinoma_estrogen_receptor_status':'er_status',
    'breast_carcinoma_progesterone_receptor_status':'pr_status',
    'lab_proc_her2_neu_immunohistochemistry_receptor_status':'her2_status',
    'breast_carcinoma_surgical_procedure_name': 'surgical_procedure',
    'initial_pathologic_diagnosis_method': 'diagnosis_method',
    #'mRNA':'pam50'
}

selected_data = selected_data.rename(columns=column_mapping)

In [71]:
selected_data.columns

Index(['patientID', 'status', 'time', 'pathologic_stage', 'pathology_T_stage',
       'pathology_N_stage', 'pathology_M_stage', 'age', 'gender', 'race',
       'ethnicity', 'number_of_lymph_nodes', 'lymph_node_examined_count',
       'axillary_lymph_node_stage_method_type',
       'number_of_lymphnodes_positive_by_he', 'er_status',
       'er_level_cell_percentage_category', 'pr_status',
       'progesterone_receptor_level_cell_percent_category', 'her2_status',
       'her2_erbb_pos_finding_cell_percent_category', 'margin_status',
       'menopause_status', 'radiation_therapy', 'histological_type',
       'anatomic_neoplasm_subdivision', 'surgical_procedure',
       'history_of_neoadjuvant_treatment', 'diagnosis_method',
       'tumor_necrosis_percent', 'tumor_nuclei_percent', 'tumor_weight'],
      dtype='object')

In [72]:
print(colData_new.shape)
print(selected_data.shape)

(1079, 2685)
(1079, 32)


In [73]:
# Select rows where 'er_status', 'pr_status', and 'her2_status' are all negative
#triple_negative_br = selected_data[(selected_data['er_status'] == 'negative') &
#                  (selected_data['pr_status'] == 'negative') &
#                  (selected_data['her2_status'] == 'negative')]

#print(triple_negative_br.shape)

In [74]:
# Drop rows with 'NA' values in the 'pathologic_stage' column
#selected_data_cleaned = selected_data.dropna(subset=['pathologic_stage'])
# stage i-iii
#stage_1_3 = selected_data_cleaned[selected_data_cleaned['pathologic_stage'].str.startswith(('stage i', 'stage ii', 'stage iii')) & ~selected_data_cleaned['pathologic_stage'].str.startswith('stage iv')]
# stage i-ii
#stage_1_2 = selected_data_cleaned[selected_data_cleaned['pathologic_stage'].str.startswith(('stage i', 'stage ii')) & ~selected_data_cleaned['pathologic_stage'].str.startswith(('stage iii','stage iv'))]
# stage ii-iii
#stage_2_3 = selected_data_cleaned[selected_data_cleaned['pathologic_stage'].str.startswith(('stage ii'))]

In [75]:
# Remove columns with NA values > 0.2

# Set threshold for the percentage of NA values
threshold = 0.2

# Calculate the percentage of missing values in each column
na_percentage = selected_data.isnull().mean()

# Get the column names where the percentage of missing values exceeds the threshold
columns_to_drop = na_percentage[na_percentage > threshold].index

# Drop columns with too many missing values
selected_data_filtered = selected_data.drop(columns=columns_to_drop)

In [76]:
selected_data_filtered.shape

(1079, 29)

In [77]:
selected_data_filtered.isnull().sum()

patientID                                  0
status                                     0
time                                       0
pathologic_stage                           8
pathology_T_stage                          0
pathology_N_stage                          0
pathology_M_stage                          0
age                                       14
gender                                     0
race                                      86
ethnicity                                165
number_of_lymph_nodes                    157
lymph_node_examined_count                116
axillary_lymph_node_stage_method_type    207
number_of_lymphnodes_positive_by_he      157
er_status                                 45
pr_status                                 46
her2_status                              174
margin_status                             67
menopause_status                          88
radiation_therapy                         83
histological_type                          1
anatomic_n

In [78]:
col = RNA.columns
RNA = RNA.T  # Transpose the DataFrame
# Set the values in the first row as column names
RNA.columns = RNA.iloc[0]

# Drop the first row from the DataFrame
RNA = RNA[1:]

RNA = RNA.reset_index(drop=True)
col = col[1:]
RNA.insert(0, 'Hybridization REF', col)
RNA.insert(1, 'patientID', RNA['Hybridization REF'].str[:12])

In [79]:
RNA.head(2)

Unnamed: 0,Hybridization REF,patientID,A1BG,A1CF,A2BP1,A2LD1,A2ML1,A2M,A4GALT,A4GNT,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR
0,TCGA-3C-AAAU-01A-11R-A41B-07,TCGA-3C-AAAU,197.0897,0.0,0.0,102.9634,1.3786,5798.3746,68.2424,8.6165,...,129.5917,1007.7824,1658.4983,258.4941,1208.3738,3507.2482,1894.9342,1180.4565,1.7233,0.0
1,TCGA-3C-AALI-01A-11R-A41B-07,TCGA-3C-AALI,237.3844,0.0,0.0,70.8646,4.3502,7571.9793,157.6944,0.5438,...,59.8151,448.6134,1343.1213,198.4774,603.5889,5504.6221,1318.6514,406.7428,926.5905,0.0


In [80]:
# Check for NA values in columns
columns_with_na = RNA.columns[RNA.isnull().any()]

if len(columns_with_na) > 0:
    print("Columns with NA values:")
    print(columns_with_na)
else:
    print("No columns with NA values.")

No columns with NA values.


In [81]:
duplicate_rows = RNA[RNA.duplicated(subset='patientID')]

if duplicate_rows.empty:
    print("No duplicate values found in the DataFrame.")
else:
    print("Duplicate values found in the DataFrame:")
    print(duplicate_rows)

No duplicate values found in the DataFrame.


In [82]:
print(colData.shape)
print(selected_data_filtered.shape)
print(RNA.shape)

(1093, 2685)
(1079, 29)
(1093, 20503)


## Merge with RNA

In [83]:
# Merge Dataframes based on patient ID
# only consider RNA (gene expression)
merged_df = pd.merge(selected_data_filtered, RNA, on='patientID')

In [84]:
print(merged_df.shape)

(1079, 20531)


In [85]:
cols = list(merged_df.columns)
cols.insert(0, cols.pop(cols.index('Hybridization REF')))
merged_df = merged_df.loc[:, cols]

In [86]:
merged_df.head(2)

Unnamed: 0,Hybridization REF,patientID,status,time,pathologic_stage,pathology_T_stage,pathology_N_stage,pathology_M_stage,age,gender,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR
0,TCGA-A1-A0SB-01A-11R-A144-07,TCGA-A1-A0SB,0,259.0,stage i,t1c,n0,m0,70.0,female,...,95.9568,519.4279,1415.9252,19.3716,1364.5681,6186.7327,1931.2986,1436.1978,552.3144,0.0
1,TCGA-A1-A0SD-01A-11R-A115-07,TCGA-A1-A0SD,0,437.0,stage iia,t2,n0,m0,59.0,female,...,96.27,578.2814,1225.7051,33.0825,868.0837,3559.6725,1278.9678,1195.6,86.0144,0.0


In [87]:
merged_df.shape

(1079, 20531)

## Feature Engineering 1

In [88]:
# Assuming df is your DataFrame containing age data
bins = [0, 18, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
merged_df['age_group'] = pd.cut(merged_df['age'], bins=bins, labels=labels, right=False)

In [89]:
merged_df['lymph_node_ratio'] = merged_df['number_of_lymphnodes_positive_by_he'] / merged_df['lymph_node_examined_count']

In [90]:
# Assuming 'er_status', 'pr_status', and 'her2_status' are categorical columns with values 'positive' and 'negative'

merged_df['is_tnbc'] = (merged_df['er_status'] == 'negative') & \
                           (merged_df['pr_status'] == 'negative') & \
                           (merged_df['her2_status'] == 'negative')

# Convert boolean values to integers (True -> 1, False -> 0)
merged_df['is_tnbc'] = merged_df['is_tnbc'].astype(int)

In [91]:
tnbc_count = merged_df[merged_df['is_tnbc'] == 1].shape[0]
print(tnbc_count)

115


In [92]:
# RNA[RNA['Hybridization REF'].str.contains('TCGA-A7-A0CE')]

# but in firebrowse (previous data), 2 samples
# tcga-a7-a0ce  TCGA-A7-A0CE-11A-21R-A089-07  normalized_count
# tcga-a7-a0ce  TCGA-A7-A0CE-01A-11R-A00Z-07  normalized_count

# Why is the problem resolved? As this data filtered only code 01 Primary Solid Tumor
# 11 refers to Solid Tissue Normal

In [93]:
# Find and drop columns with constant values
constant_cols = merged_df.columns[merged_df.nunique() <= 1]
new_merged_df = merged_df.drop(columns=constant_cols).copy()

In [94]:
print(merged_df.shape)
print(new_merged_df.shape)

(1079, 20534)
(1079, 20250)


## Imputation

In [95]:
# As RNA data has no missing value, only data from colData need imputation
categorical_features_idx = [4, 5, 6, 7, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 20247, 20249]
numerical_features_idx = [8, 12, 13, 15, 27, 28, 29, 20248]
#numerical_features_idx += list(range(30, 20247))

In [96]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder

# Identify categorical features
categorical_features = new_merged_df.columns[categorical_features_idx]

# Encode categorical features
encoder = OrdinalEncoder()
merged_df_encoded = new_merged_df.copy()  # Create a copy of the dataframe
merged_df_encoded[categorical_features] = encoder.fit_transform(merged_df_encoded[categorical_features])

# Initialize KNN imputer for both numerical and encoded categorical features
imputer = KNNImputer(n_neighbors=5)

# Impute missing values in both numerical and encoded categorical features
merged_df_encoded.iloc[:,numerical_features_idx] = imputer.fit_transform(merged_df_encoded.iloc[:,numerical_features_idx])
merged_df_encoded.iloc[:,categorical_features_idx] = imputer.fit_transform(merged_df_encoded.iloc[:,categorical_features_idx])

# Inverse transform the encoded categorical features back to their original values
merged_df_encoded[categorical_features] = encoder.inverse_transform(merged_df_encoded[categorical_features])

In [97]:
merged_df_imputed = merged_df_encoded

In [98]:
# Check for NA values in columns
columns_with_na = merged_df_imputed.columns[merged_df_imputed.isnull().any()]

if len(columns_with_na) > 0:
    print("Columns with NA values:")
    print(columns_with_na)
else:
    print("No columns with NA values.")

No columns with NA values.


In [99]:
merged_df_imputed.shape

(1079, 20250)

In [100]:
# Export non-encoded & non-sacled brca dataset
# Save the DataFrame to a CSV file
# merged_df_imputed.to_csv('/content/drive/My Drive/3799_new/brca_demo.csv', index=False)

In [101]:
merged_df_imputed.iloc[:,2:30]

Unnamed: 0,status,time,pathologic_stage,pathology_T_stage,pathology_N_stage,pathology_M_stage,age,gender,race,ethnicity,...,menopause_status,radiation_therapy,histological_type,anatomic_neoplasm_subdivision,surgical_procedure,history_of_neoadjuvant_treatment,diagnosis_method,tumor_necrosis_percent,tumor_nuclei_percent,tumor_weight
0,0,259.0,stage i,t1c,n0,m0,70.0,female,white,not hispanic or latino,...,post (prior bilateral ovariectomy or >12 mo si...,no,"other, specify",left,lumpectomy,no,fine needle aspiration biopsy,0.0,85.0,500.0
1,0,437.0,stage iia,t2,n0,m0,59.0,female,white,not hispanic or latino,...,post (prior bilateral ovariectomy or >12 mo si...,no,infiltrating ductal carcinoma,left,lumpectomy,no,core needle biopsy,0.0,70.0,500.0
2,0,1321.0,stage i,t1c,n0 (i-),m0,56.0,female,white,not hispanic or latino,...,pre (<6 months since lmp and no prior bilatera...,no,mixed histology (please specify),left upper outer quadrant,modified radical mastectomy,no,core needle biopsy,0.0,90.0,500.0
3,0,1463.0,stage iia,t2,n0,m0,54.0,female,white,not hispanic or latino,...,pre (<6 months since lmp and no prior bilatera...,no,infiltrating ductal carcinoma,left,modified radical mastectomy,no,fine needle aspiration biopsy,0.0,75.0,500.0
4,0,434.0,stage iib,t2,n1a,m0,61.0,female,white,not hispanic or latino,...,post (prior bilateral ovariectomy or >12 mo si...,no,"other, specify",right,lumpectomy,no,core needle biopsy,0.0,90.0,500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1074,0,428.0,stage iia,t2,n0,mx,75.0,female,black or african american,not hispanic or latino,...,post (prior bilateral ovariectomy or >12 mo si...,no,infiltrating ductal carcinoma,right,simple mastectomy,no,core needle biopsy,10.0,40.0,189.0
1075,0,215.0,stage iiia,t2,n2a,m0,43.0,female,black or african american,not hispanic or latino,...,post (prior bilateral ovariectomy or >12 mo si...,yes,infiltrating ductal carcinoma,left upper outer quadrant,lumpectomy,no,tumor resection,0.0,70.0,200.0
1076,0,562.0,stage iib,t2,n1,m0,71.0,female,black or african american,not hispanic or latino,...,post (prior bilateral ovariectomy or >12 mo si...,yes,mixed histology (please specify),left,lumpectomy,no,excisional biopsy,3.0,60.0,400.0
1077,0,1062.0,stage ia,t1c,n0,m0,48.0,female,black or african american,not hispanic or latino,...,peri (6-12 months since last menstrual period),yes,infiltrating ductal carcinoma,right upper outer quadrant,other,no,core needle biopsy,0.0,80.0,103.0


## Feature Engineering 2

In [102]:
# As RNA data has no missing value, only data from colData need imputation
categorical_features_idx = [4, 5, 6, 7, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 20247, 20249]
numerical_features_idx = [8, 12, 13, 15, 27, 28, 29, 20248]
numerical_features_idx += list(range(30, 20247))

In [103]:
merged_df_imputed.shape

(1079, 20250)

In [104]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

# Feature selection to identify relevant numerical features
# 200 and 300 has same result for deepsurv
selector = SelectKBest(score_func=f_regression, k=200)  # Adjust k based on feature importance analysis
selected_features = selector.fit(merged_df_imputed.iloc[:, numerical_features_idx], merged_df_imputed.iloc[:,2])
selected_feature_indices = selector.get_support(indices=True)
selected_features = selector.transform(merged_df_imputed.iloc[:, numerical_features_idx])

In [105]:
from sklearn.preprocessing import SplineTransformer

In [106]:
# Interaction features
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
interaction_features = poly.fit_transform(selected_features)
interaction_df = pd.DataFrame(interaction_features, columns=['inter_' + str(i) for i in range(1, interaction_features.shape[1] + 1)])

# Spline features
spline_transformer = SplineTransformer(degree=2, n_knots=3)
spline_features = spline_transformer.fit_transform(selected_features)
spline_df = pd.DataFrame(spline_features, columns=['spline_' + str(i) for i in range(1, spline_features.shape[1] + 1)])

# Polynomial features
degree = 2  # Set the degree of polynomial features
poly = PolynomialFeatures(degree=degree, include_bias=False)
poly_features = poly.fit_transform(selected_features)
poly_df = pd.DataFrame(poly_features, columns=['poly_' + str(i) for i in range(1, poly_features.shape[1] + 1)])

In [107]:
combined_df = pd.concat([merged_df_imputed, interaction_df, spline_df, poly_df], axis=1)
# combined_df = pd.concat([merged_df_imputed], axis=1)

In [108]:
# update numerical features index
numerical_features_idx = [idx for idx in range(combined_df.shape[1]) if idx not in categorical_features_idx][4:]

In [109]:
# Reorder data frame
y = combined_df.iloc[:,2:4]
categorical_df = combined_df.iloc[:, categorical_features_idx]
numerical_df = combined_df.iloc[:, numerical_features_idx]

reordered_df = pd.concat([y, numerical_df, categorical_df], axis=1)

In [110]:
reordered_df.shape

(1079, 61448)

In [111]:
categorical_start = reordered_df.columns.get_loc('pathologic_stage')

In [112]:
# survival data
reordered_df.iloc[:,:2].columns

Index(['status', 'time'], dtype='object')

In [113]:
# numerical feature
reordered_df.iloc[:,2:categorical_start].columns

Index(['age', 'number_of_lymph_nodes', 'lymph_node_examined_count',
       'number_of_lymphnodes_positive_by_he', 'tumor_necrosis_percent',
       'tumor_nuclei_percent', 'tumor_weight', 'A1BG', 'A1CF', 'A2BP1',
       ...
       'poly_20291', 'poly_20292', 'poly_20293', 'poly_20294', 'poly_20295',
       'poly_20296', 'poly_20297', 'poly_20298', 'poly_20299', 'poly_20300'],
      dtype='object', length=61425)

In [114]:
# categorical feature
reordered_df.iloc[:,categorical_start:].columns

Index(['pathologic_stage', 'pathology_T_stage', 'pathology_N_stage',
       'pathology_M_stage', 'gender', 'race', 'ethnicity',
       'axillary_lymph_node_stage_method_type', 'er_status', 'pr_status',
       'her2_status', 'margin_status', 'menopause_status', 'radiation_therapy',
       'histological_type', 'anatomic_neoplasm_subdivision',
       'surgical_procedure', 'history_of_neoadjuvant_treatment',
       'diagnosis_method', 'age_group', 'is_tnbc'],
      dtype='object')

## Encode Categorical

In [115]:
#pip install scikit-survival

In [116]:
from sksurv.column import encode_categorical
# encode_categorical: encode categorical columns with M categories into M-1 columns according to the one-hot scheme.
# Drops one category to avoid multicollinearity
encoded = encode_categorical(reordered_df, columns=reordered_df.iloc[:,categorical_start:].columns)

In [117]:
encoded.head()

Unnamed: 0,status,time,age,number_of_lymph_nodes,lymph_node_examined_count,number_of_lymphnodes_positive_by_he,tumor_necrosis_percent,tumor_nuclei_percent,tumor_weight,A1BG,...,"diagnosis_method=other method, specify:",diagnosis_method=tumor resection,age_group=31-40,age_group=41-50,age_group=51-60,age_group=61-70,age_group=71-80,age_group=81-90,age_group=91-100,is_tnbc=1
0,0,259.0,70.0,0.0,2.0,0.0,0.0,85.0,500.0,49.1992,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,437.0,59.0,0.0,3.0,0.0,0.0,70.0,500.0,142.2976,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,1321.0,56.0,0.0,8.0,0.0,0.0,90.0,500.0,192.8194,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,1463.0,54.0,0.0,2.0,0.0,0.0,75.0,500.0,326.0194,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,434.0,61.0,1.0,11.0,1.0,0.0,90.0,500.0,168.8309,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [118]:
encoded.shape

(1079, 61524)

## Min-Max normalization

In [119]:
# perform Min-Max normalization
from sklearn.preprocessing import MinMaxScaler

numerical_features = encoded.iloc[:, 2:categorical_start]

scaler = MinMaxScaler()
numerical_features_scaled = scaler.fit_transform(numerical_features)

# Create a new pandas DataFrame with the scaled values
numerical_features_scaled_df = pd.DataFrame(numerical_features_scaled, columns=numerical_features.columns)

In [120]:
y = encoded.iloc[:, :2]
new_categorical_df = encoded.iloc[:, categorical_start:]
numerical_features_scaled_df = numerical_features_scaled_df.set_index(y.index)
# Concatenating the dataframes
scaled_df = pd.concat([y, numerical_features_scaled_df, new_categorical_df], axis=1)

In [121]:
scaled_df.head()

Unnamed: 0,status,time,age,number_of_lymph_nodes,lymph_node_examined_count,number_of_lymphnodes_positive_by_he,tumor_necrosis_percent,tumor_nuclei_percent,tumor_weight,A1BG,...,"diagnosis_method=other method, specify:",diagnosis_method=tumor resection,age_group=31-40,age_group=41-50,age_group=51-60,age_group=61-70,age_group=71-80,age_group=81-90,age_group=91-100,is_tnbc=1
0,0,259.0,0.6875,0.0,0.045455,0.0,0.0,0.8125,0.158499,0.015338,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0,437.0,0.515625,0.0,0.068182,0.0,0.0,0.625,0.158499,0.049253,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0,1321.0,0.46875,0.0,0.181818,0.0,0.0,0.875,0.158499,0.067657,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,1463.0,0.4375,0.0,0.045455,0.0,0.0,0.6875,0.158499,0.11618,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,434.0,0.546875,0.028571,0.25,0.028571,0.0,0.875,0.158499,0.058918,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [122]:
print(encoded.shape)
print(y.shape)
print(numerical_features_scaled_df.shape)
print(new_categorical_df.shape)
print(scaled_df.shape)

(1079, 61524)
(1079, 2)
(1079, 61425)
(1079, 97)
(1079, 61524)


## Output datasets

In [123]:
# Save the DataFrame to a CSV file
# scaled_df.to_csv('/content/drive/My Drive/3799_new/brca.csv', index=False)
# scaled_df.to_csv('/content/drive/My Drive/3799_new/brca_no_poly_sp_int.csv', index=False)
# encoded.to_csv('/content/drive/My Drive/3799_new/brca_not_scaled.csv', index=False)

In [124]:
# scaled_df.to_csv('/content/drive/My Drive/3799_new/brca_250.csv', index=False)
# encoded.to_csv('/content/drive/My Drive/3799_new/brca_not_scaled_250.csv', index=False)

In [125]:
scaled_df.to_csv('/content/drive/My Drive/3799_new/brca_200.csv', index=False)
encoded.to_csv('/content/drive/My Drive/3799_new/brca_not_scaled_200.csv', index=False)