In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the tab-separated LIHC clinical patient table into a DataFrame.
metadata = pd.read_csv(
    "/home/bruce1996/data/LIHC_anomaly_detection/data/sample_info/nationwidechildrens.org_clinical_patient_lihc.txt",
    sep="\t",
)

In [50]:
# Clinical columns retained from the raw patient table, in the order they
# are selected.
candidate_metadata = [
    # identity / demographics
    "patient_id",
    "age_at_diagnosis",
    "gender",
    "vital_status",
    # pipe-delimited multi-value fields (one-hot encoded later)
    "viral_hepatitis_serology",
    "history_hepato_carcinoma_risk_factors",
    # grading / staging
    "tumor_grade",
    "ajcc_tumor_pathologic_pt",
    "ajcc_nodes_pathologic_pn",
    "ajcc_metastasis_pathologic_pm",
    "ajcc_pathologic_tumor_stage",
    # follow-up
    "death_days_to",
    "last_contact_days_to",
    "new_tumor_event_dx_indicator",
]

In [51]:
# Keep only the candidate columns and drop the first two rows
# (presumably extra header rows in the clinical file -- TODO confirm).
df = metadata[candidate_metadata].iloc[2:]
# Index rows by patient ID; `patient_id` also stays as a regular column.
df.index = df["patient_id"]

### One-hot encoding of HCC risk factors and viral hepatitis serology

In [52]:
def one_hot_encoding(target) :
    """One-hot encode a Series of '|'-delimited multi-value strings.

    Each distinct token found after splitting the cells on '|' becomes one
    indicator column named ``<target.name>_<token>``.

    Args:
        target (pd.Series): column to encode. Cells that are exactly
            '[Unknown]' or '[Not Available]', and cells that are not strings
            (e.g. NaN), contribute no tokens and yield an all-zero row.
    Returns:
        one_hot_df (pd.DataFrame): ``np.ubyte`` indicator frame with the same
            index as ``target`` and one column per token, in sorted token
            order so the layout is reproducible across runs.
    """
    missing_markers = {'[Unknown]', '[Not Available]'}

    # Tokenise every cell once; empty tokens (e.g. from '' or 'a||b') are
    # dropped so they cannot create a meaningless '<name>_' column.
    token_lists = []
    for element in target.values :
        if isinstance(element, str) and element not in missing_markers :
            token_lists.append([tok for tok in element.split('|') if tok])
        else :
            token_lists.append([])

    # Sorted rather than raw set order: set iteration order of strings can
    # change between interpreter sessions.
    condition = sorted({tok for tokens in token_lists for tok in tokens})
    position = {con: col for col, con in enumerate(condition)}

    # Fill positionally so exact tokens are matched (not substrings) and a
    # duplicated index label cannot mark more than one row.
    encoded = np.zeros([len(target), len(condition)], dtype=np.ubyte)
    for row, tokens in enumerate(token_lists) :
        for tok in tokens :
            encoded[row, position[tok]] = 1

    columns = [target.name + '_' + con for con in condition]
    one_hot_df = pd.DataFrame(encoded, index=target.index, columns=columns)
    return one_hot_df

In [53]:
# One-hot encode the two pipe-delimited clinical columns.
hbv_one_hot = one_hot_encoding(df["viral_hepatitis_serology"])
hcc_one_hot = one_hot_encoding(df["history_hepato_carcinoma_risk_factors"])

# Remove the risk-factor indicator columns that are not carried forward.
hcc_one_hot.drop(
    columns=[
        "history_hepato_carcinoma_risk_factors_Alpha-1 Antitrypsin Deficiency",
        "history_hepato_carcinoma_risk_factors_Hemochromatosis",
        "history_hepato_carcinoma_risk_factors_Other",
    ],
    inplace=True,
)

# Append the indicator columns next to the clinical columns.
df = pd.concat([df, hbv_one_hot, hcc_one_hot], axis=1)

# HBV status is taken from the Hepatitis B surface-antigen indicator.
df["HBV"] = np.where(
    df["viral_hepatitis_serology_Hepatitis B Surface Antigen"] == 1,
    "Positive",
    "Negative",
)

### Collapse AJCC pathologic sub-stages into their main stage

In [54]:
# Maps each AJCC pathologic stage label to its main stage: sub-stages
# (IIIA/IIIB/IIIC, IVA/IVB) collapse to Stage III / Stage IV, and
# '[Discrepancy]' is folded into '[Not Available]'.
stage_d = {
    'Stage I' : 'Stage I',
    'Stage II' : 'Stage II',
    'Stage III' : 'Stage III',
    'Stage IIIA' : 'Stage III',
    'Stage IIIB' : 'Stage III',
    'Stage IIIC' : 'Stage III',
    'Stage IV' : 'Stage IV',
    'Stage IVA' : 'Stage IV',
    'Stage IVB' : 'Stage IV',
    '[Discrepancy]' : '[Not Available]',
    '[Not Available]' : '[Not Available]'
}
# Assign the result back instead of `df[col].replace(..., inplace=True)`:
# the in-place form mutates a temporary column selection, which warns in
# recent pandas and leaves `df` unchanged under Copy-on-Write.
df['ajcc_pathologic_tumor_stage'] = df['ajcc_pathologic_tumor_stage'].replace(stage_d)

### Processing survival information

In [55]:
# Derive one survival time per patient from `death_days_to` and
# `last_contact_days_to`. Non-numeric placeholders (e.g. '[Not Available]',
# '[Not Applicable]') are coerced to NaN so every placeholder variant is
# handled the same way instead of reaching int() and raising.
death_days = pd.to_numeric(df['death_days_to'], errors='coerce')
followup_days = pd.to_numeric(df['last_contact_days_to'], errors='coerce')

survival_days = []
for patient, death, followup in zip(df.index, death_days, followup_days) :
    has_death = pd.notna(death)
    has_followup = pd.notna(followup)
    if not has_death and not has_followup :
        # Neither time is known: keep the placeholder used elsewhere in the table.
        print("Patient %s not death and censored!"% patient)
        survival_days.append('[Not Available]')
    elif has_death :
        if has_followup :
            print("Patient %s is death or censored!"% patient)
        # Days to death take precedence when both values are present, so
        # exactly one entry is appended for every row.
        survival_days.append(int(death))
    else :
        survival_days.append(int(followup))

df['Survival_days'] = survival_days

Patient A95S not death and censored!


In [56]:
# Final column layout: clinical fields first, then the raw follow-up
# columns, then the one-hot indicator columns.
col_order = [
    # clinical summary
    "patient_id",
    "age_at_diagnosis",
    "gender",
    "vital_status",
    "Survival_days",
    "HBV",
    "viral_hepatitis_serology",
    "history_hepato_carcinoma_risk_factors",
    "tumor_grade",
    "ajcc_tumor_pathologic_pt",
    "ajcc_nodes_pathologic_pn",
    "ajcc_metastasis_pathologic_pm",
    "ajcc_pathologic_tumor_stage",
    "new_tumor_event_dx_indicator",
    # raw follow-up columns
    "death_days_to",
    "last_contact_days_to",
    # viral serology indicators
    "viral_hepatitis_serology_HBV Surface Antibody",
    "viral_hepatitis_serology_HBV DNA",
    "viral_hepatitis_serology_Hepatitis C Virus RNA",
    "viral_hepatitis_serology_Hepatitis B Surface Antigen",
    "viral_hepatitis_serology_HBV Core Antibody",
    "viral_hepatitis_serology_Hepatitis  C Antibody",
    "viral_hepatitis_serology_HCV Genotype",
    # risk-factor indicators
    "history_hepato_carcinoma_risk_factors_Alcohol consumption",
    "history_hepato_carcinoma_risk_factors_No History of Primary Risk Factors",
    "history_hepato_carcinoma_risk_factors_Non-Alcoholic Fatty Liver Disease",
    "history_hepato_carcinoma_risk_factors_Hepatitis C",
    "history_hepato_carcinoma_risk_factors_Hepatitis B",
]
# Select the columns in that order; a missing column raises KeyError.
df = df[col_order]

In [58]:
# Write the processed metadata as a tab-separated file (index included).
df.to_csv(
    "/home/bruce1996/data/LIHC_anomaly_detection/data/sample_info/processed_metadata.txt",
    sep="\t",
)