## Data Preprocessing Steps:
1. Load raw data obtained using curatedTCGAData.R.
2. Select clinical and biospecimen data, combine with RNA expression based on patient ID.
3. Remove columns whose fraction of NA values exceeds 0.2.
4. Drop rows where "status" or "time" is NA.
5. Encode categorical variables.
6. Output data for further imputation.

In [None]:
# Mount Google Drive at /content/drive so the CSV inputs read below are
# reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Clinical / biospecimen table (colData). low_memory=False disables pandas'
# chunk-wise parsing of the file.
colData = pd.read_csv(
    '/content/drive/My Drive/3799/tumor_colData.csv',
    low_memory=False,
)

# RNASeq2GeneNorm gene expression table.
RNA = pd.read_csv(
    '/content/drive/My Drive/3799/tumor_BRCA_RNASeq2GeneNorm-20160128.csv',
)


In [None]:
# Discard the "Unnamed: 0" column (presumably the row index written by the
# exporting script - confirm), then preview the first two rows.
colData = colData.drop(columns="Unnamed: 0")
colData.head(2)

In [None]:
import numpy as np

# Build the survival "time" column: rows with vital_status == 0 take
# days_to_last_followup, every other row takes days_to_death.
use_followup = colData['vital_status'] == 0
colData['time'] = np.where(
    use_followup,
    colData['days_to_last_followup'],
    colData['days_to_death'],
)

In [None]:

# Columns of colData carried forward into the analysis, grouped by theme.
selected_cols = [
    # Patient identifier (merge key)
    'patientID',

    # Survival outcome
    'vital_status',
    'time',

    # Staging
    'pathologic_stage',
    'pathology_T_stage',
    'pathology_N_stage',
    'pathology_M_stage',

    # Demographics
    'years_to_birth',
    'gender',
    'race',
    'ethnicity',

    # Lymph nodes
    'number_of_lymph_nodes',
    'patient.lymph_node_examined_count',
    'patient.axillary_lymph_node_stage_method_type',
    'patient.number_of_lymphnodes_positive_by_he',

    # ER status
    'patient.breast_carcinoma_estrogen_receptor_status',
    'patient.er_level_cell_percentage_category',

    # PR status
    'patient.breast_carcinoma_progesterone_receptor_status',
    'patient.progesterone_receptor_level_cell_percent_category',

    # HER2 status
    'patient.lab_proc_her2_neu_immunohistochemistry_receptor_status',
    'patient.her2_erbb_pos_finding_cell_percent_category',

    # Other status fields
    'patient.margin_status',
    'patient.menopause_status',

    # Treatment and diagnosis
    'radiation_therapy',
    'histological_type',
    'patient.anatomic_neoplasm_subdivisions.anatomic_neoplasm_subdivision',
    'patient.breast_carcinoma_surgical_procedure_name',
    'patient.history_of_neoadjuvant_treatment',
    'patient.initial_pathologic_diagnosis_method',

    # Tumor sample measurements
    'patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_necrosis_percent',
    'patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_nuclei_percent',
    'patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_weight',
]

# Restrict the clinical table to exactly those columns, in that order.
selected_data = colData[selected_cols]

In [None]:
# Rename the columns

def rename_columns(col):
    """Return the part of *col* after its last '.', or *col* itself if it has none."""
    _prefix, _dot, short_name = col.rpartition('.')
    return short_name

# Shorten every column label to its last dot-separated component.
selected_data.columns = [rename_columns(label) for label in selected_data.columns]

# Then give the survival status and age columns their analysis names.
column_mapping = {'vital_status': 'status', 'years_to_birth': 'age'}
selected_data = selected_data.rename(columns=column_mapping)

In [None]:
# Display the column labels after prefix stripping and renaming.
selected_data.columns

In [None]:
# Display (rows, columns) of the selected clinical table.
selected_data.shape

In [None]:
# Drop every column whose fraction of missing values is above the threshold.
import pandas as pd

# Maximum tolerated fraction of NA values per column.
threshold = 0.2

# Fraction of missing entries per column (mean of the boolean NA mask).
na_percentage = selected_data.isna().mean()

# Labels of the columns that exceed the threshold.
columns_to_drop = na_percentage.index[na_percentage > threshold]

# Clinical table without the sparse columns.
selected_data_filtered = selected_data.drop(columns=columns_to_drop)

In [None]:
# Display (rows, columns) after removing the sparse columns.
selected_data_filtered.shape

In [None]:
# Display the number of missing values left in each remaining column.
selected_data_filtered.isnull().sum()

In [None]:
# Save the selected DataFrame to a new CSV file in Google Colab
#selected_data.to_csv('/content/drive/My Drive/3799/selected_col.csv', index=False)

#print("selected DataFrame saved to Google Drive.")

In [None]:
# Reshape RNA so that each row is one sample and each column one entry of the
# original first column (presumably gene identifiers - confirm).

# Original column labels minus the first one, i.e. one label per sample row of
# the transposed frame.
col = RNA.columns[1:]

RNA = RNA.T
# After transposing, the first row holds the original first column; promote
# it to the header and then remove it from the data rows.
RNA.columns = RNA.iloc[0]
RNA = RNA.iloc[1:].reset_index(drop=True)

# Put the full sample label first, followed by the patient ID, which is the
# first 12 characters of that label.
RNA.insert(0, 'Hybridization REF', col)
RNA.insert(1, 'patientID', RNA['Hybridization REF'].str[:12])

In [None]:
# Preview the first two rows of the reshaped RNA table.
RNA.head(2)

In [None]:
# Report which RNA columns, if any, contain at least one missing value.
columns_with_na = RNA.columns[RNA.isna().any()]

if columns_with_na.empty:
    print("No columns with NA values.")
else:
    print("Columns with NA values:")
    print(columns_with_na)

In [None]:
# Print the shapes of the clinical and RNA tables, one per line.
print(colData.shape, RNA.shape, sep='\n')

In [None]:
# Only the RNA (gene expression) assay is combined with the clinical data.
# The join is on patientID with pandas' default (inner) join type.
merged_df = selected_data_filtered.merge(RNA, on='patientID')

In [None]:
# Print (rows, columns) of the merged clinical + RNA table.
print(merged_df.shape)

In [None]:
# Reorder the columns so that 'Hybridization REF' comes first; all other
# columns keep their relative order.
cols = merged_df.columns.tolist()
ref_position = cols.index('Hybridization REF')
cols = [cols[ref_position]] + cols[:ref_position] + cols[ref_position + 1:]
merged_df = merged_df.loc[:, cols]

In [None]:
# Preview the first two rows of the merged table after reordering its columns.
merged_df.head(2)

In [None]:
# Display (rows, columns) of the merged table.
merged_df.shape

In [None]:
# Preview the first two rows of the merged table.
merged_df.head(2)

In [None]:
# Display the RNA rows whose sample label contains patient TCGA-A7-A0CE.
RNA[RNA['Hybridization REF'].str.contains('TCGA-A7-A0CE')]

# In the earlier Firebrowse download this patient had two samples:
#   tcga-a7-a0ce  TCGA-A7-A0CE-11A-21R-A089-07  normalized_count
#   tcga-a7-a0ce  TCGA-A7-A0CE-01A-11R-A00Z-07  normalized_count
#
# That duplication no longer occurs because this dataset keeps only sample-type
# code 01 (Primary Solid Tumor); code 11 refers to Solid Tissue Normal.

In [None]:
# Collect every RNA row whose patientID already appeared in an earlier row
# and report whether any such repeats exist.
duplicate_rows = RNA.loc[RNA.duplicated(subset='patientID')]

if not duplicate_rows.empty:
    print("Duplicate values found in the DataFrame:")
    print(duplicate_rows)
else:
    print("No duplicate values found in the DataFrame.")

In [None]:
# want to use RF method instead

#from sklearn.impute import SimpleImputer

#merged_df_imputed = merged_df

#imputer = SimpleImputer(strategy='mean')
#merged_df_imputed.iloc[:,numerical_features_idx] = imputer.fit_transform(merged_df_imputed.iloc[:,numerical_features_idx])

#categorical_imputer = SimpleImputer(strategy='most_frequent')
#merged_df_imputed.iloc[:,categorical_features_idx] = categorical_imputer.fit_transform(merged_df_imputed.iloc[:,categorical_features_idx])

In [None]:
# Drop every row in which "status" or "time" (or both) is NA.
merged_df_filtered = merged_df.dropna(axis=0, how='any', subset=['status', 'time'])

In [None]:
# Print the table shape before and after dropping rows with a missing outcome.
print(merged_df.shape, merged_df_filtered.shape, sep='\n')

In [None]:
# Remove columns that carry no information: those with at most one distinct
# non-NA value.
distinct_counts = merged_df_filtered.nunique()
constant_cols = distinct_counts.index[distinct_counts <= 1]
new_merged_df_filtered = merged_df_filtered.drop(columns=constant_cols).copy()

In [None]:
# Print the table shape before and after removing the constant columns.
print(merged_df_filtered.shape, new_merged_df_filtered.shape, sep='\n')

In [None]:
# Positional indices (into new_merged_df_filtered) of the categorical columns
# that are one-hot encoded later.
# NOTE(review): these positions are hard-coded; confirm they still point at the
# intended columns whenever the upstream column selection or filtering changes.
categorical_features_idx = [4, 5, 6, 7, 9, 10, 11, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]

# Every other position is treated as numerical, except the first four
# non-categorical positions (0-3, presumably the identifier and survival
# columns - confirm), which the trailing [4:] skips.
# The column count is taken from the frame instead of the former hard-coded
# 20247, so the list can neither index past the last column nor silently miss
# trailing columns. A set keeps the membership test O(1) per index.
# The result equals [8, 12, 13, 15, 27, 28, 29] + list(range(30, n_columns)).
_categorical_positions = set(categorical_features_idx)
numerical_features_idx = [
    idx
    for idx in range(new_merged_df_filtered.shape[1])
    if idx not in _categorical_positions
][4:]

In [None]:
# Preview the columns at the categorical positions.
new_merged_df_filtered.iloc[:,categorical_features_idx].head()

In [None]:
# Preview the columns at the numerical positions.
new_merged_df_filtered.iloc[:,numerical_features_idx].head()

In [None]:
# One-hot encode the categorical columns; all other columns pass through
# unchanged.
# NOTE(review): with get_dummies' default dummy_na=False a missing category
# yields all-zero indicators instead of NaN - confirm this is what the later
# imputation step expects.
categorical_features = new_merged_df_filtered.columns.take(categorical_features_idx)
new_merged_df_filtered_encoded = pd.get_dummies(
    data=new_merged_df_filtered,
    columns=categorical_features,
)

In [None]:
# Display the one-hot encoded table.
new_merged_df_filtered_encoded

In [None]:
# Save the fully preprocessed table to CSV for the subsequent imputation step:
# rows with missing status/time removed, constant columns dropped, categorical
# columns one-hot encoded.
# This previously wrote `merged_df`, the frame from before those steps, so
# none of that preprocessing reached the output file.
new_merged_df_filtered_encoded.to_csv('merged_data.csv', index=False)