In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_selection import f_classif, SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler


### ANOVA for Numerical Features, Categorical Output

In [2]:
numeric = pd.read_csv('../../data/numeric.csv')

In [3]:
numeric.shape

(165603, 203)

In [4]:
numeric.set_index(keys = 'TRR_ID_CODE', inplace = True)

In [5]:
# Categorical variables encoded numerically
numeric.drop(columns = {'INIT_STAT','A1', 'A2','B1',
'B2','DR1','DR2','EDUCATION','FUNC_STAT_TCR','DGN_TCR','DIAB','END_STAT',
'ETHNICITY','ETHCAT','REGION','PRI_PAYMENT_TCR_KI','FUNC_STAT_TRR',
'PRI_PAYMENT_TRR_KI','ORG_REC_ON','DA1','DA2','DB1','DB2','DDR1','DDR2',
'RA1','RA2','RB1','RB2','RDR1','RDR2','AMIS', 'BMIS','DRMIS','HLAMIS',
'ETHCAT_DON','COD_CAD_DON',
'DEATH_CIRCUM_DON','DEATH_MECH_DON',
'CANCER_SITE_DON','HIST_DIABETES_DON','END_STAT_KI','ABO_MAT',
'DIAG_KI','TX_PROCEDUR_TY_KI','SHARE_TY',
'TRANSFUS_TERM_DON'}, inplace = True)

In [6]:
# Get percent nulls for each column in df
# Used code provided here: https://studymachinelearning.com/pandas-count-missing-values-nan-for-each-columns-in-dataframe/

def missing_data(data):
    # Count number of missing value in a column
    total = data.isnull().sum()           
    
    # Get Percentage of missing values
    percent = (data.isnull().sum()/data.isnull().count()*100)   
    temp = pd.concat([total, percent], axis=1, keys=['Total', 'Percent(%)'])

    # Create a Type column, that indicates the data-type of the column.
    types = []
    for col in data.columns:
        dtype = str(data[col].dtype)
        types.append(dtype)
    temp['Types'] = types

    return(np.transpose(temp))


In [7]:
nulls_2 = missing_data(numeric)
nulls_2 = nulls_2.transpose()
df_keep_2 = nulls_2[nulls_2['Percent(%)'] <= 10]
columns = df_keep_2.index.tolist()
columns = columns + ['GRF_STAT_NUM']
numeric = numeric[columns]

In [8]:
numeric.shape

(165603, 48)

### Create a temporary numerical dataframe for feature selection

In [9]:
# Drop columns with information about graft failure/survival.
numeric_temp = numeric.drop(columns = {'KDPI','KDRI_RAO','EGFR_CKDEPI_DON','KDRI_MED','GSTATUS_KI',
'PSTATUS','PTIME','GTIME_KI','GSTATUS_DTHCNS_KI'})


In [10]:
numeric_temp.isnull().sum().sort_values(ascending = False)


GRF_STAT_NUM         16726
COLD_ISCH_KI          3755
PO2_FIO2_DON          2791
PO2_DON               2421
DAYSWAIT_ALLOC        2246
INIT_BMI_CALC         1778
CREAT_TRR             1637
BMI_TCR               1578
HGT_CM_TCR            1352
INIT_HGT_CM           1206
PCO2_DON              1123
PH_DON                 989
WGT_KG_TCR             958
INIT_WGT_KG            880
TBILI_DON              623
SGOT_DON               524
SGPT_DON               522
HEMATOCRIT_DON         412
END_BMI_CALC           315
BMI_CALC               306
BMI_DON_CALC           237
HGT_CM_CALC            232
SODIUM170_VAL_DON      133
WGT_KG_CALC            113
WGT_KG_DON_CALC        109
BUN_DON                 36
CREAT_DON               28
HGT_CM_DON_CALC         12
ETHNICITY_DON            0
DAYSWAIT_CHRON           0
INIT_AGE                 0
DAYSWAIT_CHRON_KI        0
AGE_DON                  0
ECD_DONOR                0
URINE_INF_DON            0
Transplant_Year          0
AGE                      0
D

In [11]:
numeric_temp.dropna(inplace = True)

In [12]:
numeric_temp.shape

(137621, 39)

### Feature selection tips from : https://medium.com/analytics-vidhya/feature-selection-using-scikit-learn-5b4362e0c19b

In [13]:
X = numeric_temp.drop((['GRF_STAT_NUM']), axis = 1)
y = numeric_temp['GRF_STAT_NUM']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
y_train.value_counts(normalize = True)

0.0    0.848361
1.0    0.151639
Name: GRF_STAT_NUM, dtype: float64

In [16]:
nm = RandomUnderSampler()

X_train_under, y_train_under = nm.fit_resample(X_train, y_train)

In [17]:
cols = X_train_under.columns
ss = StandardScaler()
X_train_under = pd.DataFrame(ss.fit_transform(X_train_under), columns=cols)
X_test = pd.DataFrame(ss.transform(X_test), columns=cols)


In [18]:
y_train_under.value_counts()

1.0    14608
0.0    14608
Name: GRF_STAT_NUM, dtype: int64

In [19]:
X_best = SelectKBest(f_classif, k=30).fit(X_train_under, y_train_under)
mask = X_best.get_support() #list of booleans for selected features
new_feat = [] 
for bool, feature in zip(mask, X_train_under.columns):
    if bool:
        new_feat.append(feature)
print('The 30 best features are:{}'.format(new_feat)) # The list of your 30 best features

The 30 best features are:['WGT_KG_TCR', 'HGT_CM_TCR', 'INIT_WGT_KG', 'INIT_HGT_CM', 'INIT_AGE', 'DAYSWAIT_ALLOC', 'CREAT_TRR', 'AGE_DON', 'BLOOD_INF_DON', 'BUN_DON', 'CREAT_DON', 'SGOT_DON', 'SGPT_DON', 'URINE_INF_DON', 'WGT_KG_DON_CALC', 'AGE', 'DISTANCE', 'COLD_ISCH_KI', 'ECD_DONOR', 'HGT_CM_CALC', 'WGT_KG_CALC', 'BMI_CALC', 'SODIUM170_VAL_DON', 'PO2_DON', 'PH_DON', 'HEMATOCRIT_DON', 'PO2_FIO2_DON', 'PCO2_DON', 'ETHNICITY_DON', 'Transplant_Year']


In [20]:
X_best.scores_

array([3.02965437e+01, 8.38886000e+01, 1.52618955e+00, 3.13788139e+01,
       4.46559488e+01, 4.49010206e-01, 5.59011451e+02, 1.38996478e+00,
       1.05245589e+00, 4.38991975e+02, 9.81329799e+01, 2.48201044e+02,
       1.44597858e+01, 1.93342804e+02, 8.87106996e+00, 1.42159851e+01,
       2.53710424e+01, 4.79756666e-01, 1.44310364e+01, 2.87130211e-01,
       2.90433496e+00, 1.20169272e+00, 5.80567098e+02, 4.62096252e+00,
       5.62278034e+01, 4.49010206e-01, 2.31746055e+02, 8.29631050e+01,
       8.14717366e+01, 2.86932478e+01, 3.49498916e+01, 4.15419486e+00,
       3.94500157e+01, 1.33861680e+02, 7.62317873e+00, 5.16364104e+01,
       1.79580533e+01, 5.98211232e+03])

In [21]:
X_best.pvalues_

array([3.73899116e-008, 5.56676366e-020, 2.16695162e-001, 2.14193191e-008,
       2.39102195e-011, 5.02810663e-001, 1.95231220e-122, 2.38420650e-001,
       3.04950646e-001, 9.25106953e-097, 4.25445140e-023, 1.08674786e-055,
       1.43475049e-004, 8.17425947e-044, 2.89963188e-003, 1.63300982e-004,
       4.75779176e-007, 4.88538063e-001, 1.45680823e-004, 5.92069534e-001,
       8.83523181e-002, 2.72992657e-001, 4.91265588e-127, 3.15916961e-002,
       6.63733991e-014, 5.02810663e-001, 3.93391408e-052, 8.87864620e-020,
       1.88417803e-019, 8.54394027e-008, 3.42054228e-009, 4.15405016e-002,
       3.41301131e-010, 6.84090729e-031, 5.76587848e-003, 6.83908817e-013,
       2.26519016e-005, 0.00000000e+000])

### Chi-Square Test for Categorical Input, Categorical Output

In [22]:
categorical = pd.read_csv('../../data/categorical.csv')
categorical.shape

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(165603, 196)

In [23]:
pd.set_option('display.max_columns', None)
categorical.set_index(keys = 'TRR_ID_CODE', inplace = True)

In [24]:
categorical.drop(columns = {'PX_STAT_DATE', 'INIT_DATE', 'WT_QUAL_DATE',
                  'DON_DATE','TX_DATE', 'ADMISSION_DATE',
                           'RECOVERY_DATE',}, inplace = True)

In [25]:
df_cat = pd.read_csv('../../data/main_final_table.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [26]:
df_cat = df_cat[['TRR_ID_CODE','INIT_STAT','A1', 'A2','B1',
'B2','DR1','DR2','EDUCATION','FUNC_STAT_TCR','DGN_TCR','DIAB','END_STAT',
'REGION','PRI_PAYMENT_TCR_KI','FUNC_STAT_TRR',
'PRI_PAYMENT_TRR_KI','ORG_REC_ON','DA1','DA2','DB1','DB2','DDR1','DDR2',
'RA1','RA2','RB1','RB2','RDR1','RDR2','AMIS', 'BMIS','DRMIS','HLAMIS',
'COD_CAD_DON',
'DEATH_CIRCUM_DON','DEATH_MECH_DON',
'CANCER_SITE_DON','HIST_DIABETES_DON','END_STAT_KI','ABO_MAT',
'DIAG_KI','TX_PROCEDUR_TY_KI','SHARE_TY',
'TRANSFUS_TERM_DON']]

df_cat.drop_duplicates(subset = ['TRR_ID_CODE'], keep = 'last', inplace = True )
df_cat.set_index(keys = 'TRR_ID_CODE', inplace = True)

In [27]:
categorical = categorical.join(df_cat)

In [28]:
categorical.shape

(165603, 232)

In [29]:
# TO DUMMIFY:
df = pd.get_dummies(categorical, prefix=['PX_STAT','GENDER','ABO','INIT_STAT','A1', 'A2','B1',
'B2','DR1','DR2','EDUCATION','FUNC_STAT_TCR','DGN_TCR','DIAB','END_STAT',
'REGION','PRI_PAYMENT_TCR_KI','FUNC_STAT_TRR',
'PRI_PAYMENT_TRR_KI','ORG_REC_ON','DA1','DA2','DB1','DB2','DDR1','DDR2',
'RA1','RA2','RB1','RB2','RDR1','RDR2','AMIS', 'BMIS','DRMIS','HLAMIS',
'CMV_DON','HBV_CORE_DON','HBV_SUR_ANTIGEN_DON','COD_CAD_DON',
'DEATH_CIRCUM_DON','DEATH_MECH_DON','HEP_C_ANTI_DON','ABO_DON','GENDER_DON',
'VDRL_DON','CANCER_SITE_DON','HIST_DIABETES_DON','END_STAT_KI','ABO_MAT',
'DIAG_KI','TX_PROCEDUR_TY_KI','EBV_SEROSTATUS','HBV_SUR_ANTIGEN','HCV_SEROSTATUS',
'HIV_SEROSTATUS','CMV_STATUS','SHARE_TY','HBSAB_DON','EBV_IGG_CAD_DON',
'EBV_IGM_CAD_DON','HIV_DON','EBNA_DON','HTLV_DON',
'TRANSFUS_TERM_DON'], columns=['PX_STAT','GENDER','ABO','INIT_STAT','A1', 'A2','B1',
'B2','DR1','DR2','EDUCATION','FUNC_STAT_TCR','DGN_TCR','DIAB','END_STAT',
'REGION','PRI_PAYMENT_TCR_KI','FUNC_STAT_TRR',
'PRI_PAYMENT_TRR_KI','ORG_REC_ON','DA1','DA2','DB1','DB2','DDR1','DDR2',
'RA1','RA2','RB1','RB2','RDR1','RDR2','AMIS', 'BMIS','DRMIS','HLAMIS',
'CMV_DON','HBV_CORE_DON','HBV_SUR_ANTIGEN_DON','COD_CAD_DON',
'DEATH_CIRCUM_DON','DEATH_MECH_DON','HEP_C_ANTI_DON','ABO_DON','GENDER_DON',
'VDRL_DON','CANCER_SITE_DON','HIST_DIABETES_DON','END_STAT_KI','ABO_MAT',
'DIAG_KI','TX_PROCEDUR_TY_KI','EBV_SEROSTATUS','HBV_SUR_ANTIGEN','HCV_SEROSTATUS',
'HIV_SEROSTATUS','CMV_STATUS','SHARE_TY','HBSAB_DON','EBV_IGG_CAD_DON',
'EBV_IGM_CAD_DON','HIV_DON','EBNA_DON','HTLV_DON',
'TRANSFUS_TERM_DON'])

In [30]:
# Convert Y's and N's to 1's & 0's 
# FOR GRF_STAT, Y = functioning graft, N = failed. Since predicting Graft failure, will set 1 = failure = N
df['GRF_STAT'] = df['GRF_STAT'].map({'Y': 0, 'N': 1})
# FOR ON_DIALYSIS Y = dialysis administered, N= not administered
df['ON_DIALYSIS'] = df['ON_DIALYSIS'].map({'Y': 1, 'N': 0})
df['PERIP_VASC'] = df['PERIP_VASC'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['EXH_PERIT_ACCESS'] = df['EXH_PERIT_ACCESS'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['EXH_VASC_ACCESS'] = df['EXH_VASC_ACCESS'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['WORK_INCOME_TCR'] = df['WORK_INCOME_TCR'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['MALIG_TCR_KI'] = df['MALIG_TCR_KI'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['MALIG_TRR'] = df['MALIG_TRR'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['WORK_INCOME_TRR'] = df['WORK_INCOME_TRR'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['DON_RETYP'] = df['DON_RETYP'].map({'Y': 1, 'N': 0})
df['DDAVP_DON'] = df['DDAVP_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['NON_HRT_DON'] = df['NON_HRT_DON'].map({'Y': 1, 'N': 0})
df['ANTIHYPE_DON'] = df['ANTIHYPE_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['PT_DIURETICS_DON'] = df['PT_DIURETICS_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['PT_STEROIDS_DON'] = df['PT_STEROIDS_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['PT_T3_DON'] = df['PT_T3_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['PT_T4_DON'] = df['PT_T4_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['VASODIL_DON'] = df['VASODIL_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['CLIN_INFECT_DON'] = df['CLIN_INFECT_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['EXTRACRANIAL_CANCER_DON'] = df['EXTRACRANIAL_CANCER_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['HIST_CIG_DON'] = df['HIST_CIG_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['HIST_COCAINE_DON'] = df['HIST_COCAINE_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['HIST_HYPERTENS_DON'] = df['HIST_HYPERTENS_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['INTRACRANIAL_CANCER_DON'] = df['INTRACRANIAL_CANCER_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['HIST_CANCER_DON'] = df['HIST_CANCER_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['HIST_OTH_DRUG_DON'] = df['HIST_OTH_DRUG_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['SKIN_CANCER_DON'] = df['SKIN_CANCER_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['DIABETES_DON'] = df['DIABETES_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['HEPARIN_DON'] = df['HEPARIN_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['ARGININE_DON'] = df['ARGININE_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['INSULIN_DON'] = df['INSULIN_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['PUMP_KI'] = df['PUMP_KI'].map({'Y': 1, 'N': 0})
df['DIAL_TRR'] = df['DIAL_TRR'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['GRF_STAT_KI'] = df['GRF_STAT_KI'].map({'Y': 1, 'N': 0})
df['DWFG_KI'] = df['DWFG_KI'].map({'Y': 1, 'N': 0})
df['PAYBACK'] = df['PAYBACK'].map({'Y': 1, 'N': 0})
df['MALIG'] = df['MALIG'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['PROTEIN_URINE'] = df['PROTEIN_URINE'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['INOTROP_SUPPORT_DON'] = df['INOTROP_SUPPORT_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['TATTOOS'] = df['TATTOOS'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['CDC_RISK_HIV_DON'] = df['CDC_RISK_HIV_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['PROTEIN_URINE_DON'] = df['PROTEIN_URINE_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['ALCOHOL_HEAVY_DON'] = df['ALCOHOL_HEAVY_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['TATTOOS_DON'] = df['TATTOOS_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['HISTORY_MI_DON'] = df['HISTORY_MI_DON'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['CORONARY_ANGIO_DON'] = df['CORONARY_ANGIO_DON'].map({'Y': 1, 'N': 0})
df['PREV_GASTRO_DIS'] = df['PREV_GASTRO_DIS'].map({'Y': 1, 'N': 0, 'U': np.NaN})
df['PRELIM_XMATCH'] = df['PRELIM_XMATCH'].map({'Y': 1, 'N': 0})

In [31]:
categorical = df

In [32]:
# Get percent nulls for each column in df
# Used code provided here: https://studymachinelearning.com/pandas-count-missing-values-nan-for-each-columns-in-dataframe/

def missing_data(data):
    # Count number of missing value in a column
    total = data.isnull().sum()           
    
    # Get Percentage of missing values
    percent = (data.isnull().sum()/data.isnull().count()*100)   
    temp = pd.concat([total, percent], axis=1, keys=['Total', 'Percent(%)'])

    # Create a Type column, that indicates the data-type of the column.
    types = []
    for col in data.columns:
        dtype = str(data[col].dtype)
        types.append(dtype)
    temp['Types'] = types

    return(np.transpose(temp))

In [33]:
nulls = missing_data(categorical)
nulls = nulls.transpose()

In [34]:
df_keep_2 = nulls[nulls['Percent(%)'] <= 10]

In [35]:
columns = df_keep_2.index.tolist()

In [36]:
columns = columns + ['GRF_STAT']
categorical = categorical[columns]

In [37]:
categorical = categorical.drop(columns = {'GRF_STAT_KI',
'DWFG_KI'})

In [38]:
categorical.isnull().sum().sort_values(ascending=False)
categorical = categorical.dropna()
categorical.shape

(107981, 1677)

In [39]:
X = categorical.drop((['GRF_STAT']), axis = 1)
y = categorical['GRF_STAT']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
y_train.value_counts()

0.0    65620
1.0     9966
Name: GRF_STAT, dtype: int64

In [41]:
nm = RandomUnderSampler()

X_train_under, y_train_under = nm.fit_resample(X_train, y_train)

In [42]:
 y_train_under.value_counts()

1.0    9966
0.0    9966
Name: GRF_STAT, dtype: int64

In [43]:
X_best_c = SelectKBest(chi2, k= 70).fit(X_train_under, y_train_under)
mask = X_best_c.get_support() #list of booleans for selected features
new_feat_2 = [] 
for bool, feature in zip(mask, X_train_under.columns):
    if bool:
        new_feat_2.append(feature)
print('The 70 best features are:{}'.format(new_feat_2)) # The list of your 70 best categorical features


The 70 best features are:['MALIG_TCR_KI', 'NON_HRT_DON', 'CLIN_INFECT_DON', 'HIST_CIG_DON', 'HIST_COCAINE_DON', 'HIST_HYPERTENS_DON', 'HIST_OTH_DRUG_DON', 'DIABETES_DON', 'PAYBACK', 'MALIG', 'INOTROP_SUPPORT_DON', 'TATTOOS', 'CDC_RISK_HIV_DON', 'TATTOOS_DON', 'PREV_GASTRO_DIS', 'PX_STAT_N', 'PX_STAT_R', 'B1_42', 'B2_70', 'DR1_3', 'DR2_18', 'EDUCATION_5.0', 'EDUCATION_6.0', 'EDUCATION_998.0', 'FUNC_STAT_TCR_1.0', 'FUNC_STAT_TCR_2070.0', 'DGN_TCR_3008.0', 'DGN_TCR_3012.0', 'DGN_TCR_3072.0', 'DIAB_5.0', 'REGION_2', 'REGION_5', 'PRI_PAYMENT_TCR_KI_13.0', 'FUNC_STAT_TRR_2070.0', 'FUNC_STAT_TRR_2090.0', 'FUNC_STAT_TRR_2100.0', 'RB1_42.0', 'RB2_70.0', 'RDR1_3.0', 'RDR2_18.0', 'DRMIS_0.0', 'DRMIS_2.0', 'HLAMIS_6.0', 'HBV_CORE_DON_P', 'COD_CAD_DON_1', 'COD_CAD_DON_2', 'DEATH_CIRCUM_DON_2', 'DEATH_CIRCUM_DON_5', 'DEATH_CIRCUM_DON_997', 'DEATH_MECH_DON_3', 'DEATH_MECH_DON_4', 'DEATH_MECH_DON_11', 'HIST_DIABETES_DON_4', 'DIAG_KI_3004.0', 'DIAG_KI_3008.0', 'DIAG_KI_3040.0', 'DIAG_KI_3072.0', 'EBV_S

In [44]:
scores = X_best_c.scores_


In [45]:
pvalues = X_best_c.pvalues_


### Dataframe w/ reduced features for Random Survival Forests

In [46]:
new_feat.append('GTIME_KI')
new_feat.append('GRF_STAT_NUM')
numeric = numeric[new_feat]


In [47]:
categorical = categorical[new_feat_2]


In [48]:
df_rsf = categorical.join(numeric)


In [49]:
df_rsf.shape


(107981, 102)

In [50]:
df_rsf['GTIME_KI'].head()


TRR_ID_CODE
A705660    1458
A715738    1068
A599066     457
A526101     340
A51101     1440
Name: GTIME_KI, dtype: int64

In [51]:
df_rsf.isnull().sum().sort_values(ascending = False)

DAYSWAIT_ALLOC       1382
COLD_ISCH_KI         1268
PO2_FIO2_DON         1211
PO2_DON               921
CREAT_TRR             830
                     ... 
DIAG_KI_3008.0          0
DIAG_KI_3040.0          0
DIAG_KI_3072.0          0
EBV_SEROSTATUS_ND       0
MALIG_TCR_KI            0
Length: 102, dtype: int64

In [52]:
df_rsf.shape

(107981, 102)

In [53]:
df_rsf.dropna(inplace = True)
df_rsf.shape

(102480, 102)

In [54]:
df_rsf['GRF_STAT_NUM'].value_counts(normalize = True)

0.0    0.870706
1.0    0.129294
Name: GRF_STAT_NUM, dtype: float64

In [55]:
# Convert graft life from days to years
df_rsf['GTIME_KI_YEARS'] = df_rsf['GTIME_KI'] / 365.2425
df_rsf['GTIME_KI_YEARS'].head()

TRR_ID_CODE
A705660    3.991868
A715738    2.924085
A599066    1.251224
A526101    0.930888
A51101     3.942586
Name: GTIME_KI_YEARS, dtype: float64

In [56]:
df_rsf['GTIME_KI_YEARS'].describe()

count    102480.000000
mean          4.219939
std           3.211010
min           0.000000
25%           1.522276
50%           3.553803
75%           6.182194
max          14.182358
Name: GTIME_KI_YEARS, dtype: float64

In [57]:
df = df_rsf.drop(columns = 'GTIME_KI')

In [58]:
df.shape

(102480, 102)

In [60]:
df.to_csv('../../data/df_gcp.csv', index = True)