### Fill cell metadata for TCGA samples

Create a table containing the following:

- Deconvolution result => ITTH score [DONE]
- NMF results
- Drug response
- Survival 

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import entropy 

In [50]:
# Load the deconvolution output table; the first CSV column is used as the row index.
GDSC_decon_df = pd.read_csv('../data/CIBERSORT_related/GDSC_subtype_panel/unc_gdsc_output_full.csv', index_col=0)
# NOTE(review): this reads exactly the same file as GDSC_decon_df above, so
# LM_decon_df starts out as a duplicate of it. Looks like a copy-paste slip --
# confirm the intended path for the LM table.
LM_decon_df = pd.read_csv('../data/CIBERSORT_related/GDSC_subtype_panel/unc_gdsc_output_full.csv', index_col=0)

Set every deconvolution value below 0.05 to 0.

In [51]:
# Replace every entry that is not >= 0.05 (NaN included) with 0.
# The same cutoff is applied to both tables.
GDSC_decon_df = GDSC_decon_df.mask(~GDSC_decon_df.ge(0.05), other=0)
LM_decon_df = LM_decon_df.mask(~LM_decon_df.ge(0.05), other=0)

In [52]:
# Remove P-value, correlation, and RMSE columns (assumed to be the last three
# columns of each table).
# Each table is sliced by its OWN column list: slicing LM_decon_df with
# GDSC_decon_df's already-trimmed columns would drop three extra columns.
# NOTE: positional trimming is not idempotent -- re-running this cell without
# reloading the data strips three more columns each time.
GDSC_decon_df = GDSC_decon_df[GDSC_decon_df.columns[0:-3]]
LM_decon_df = LM_decon_df[LM_decon_df.columns[0:-3]]

Extract the patient ID from each sample barcode

In [53]:
# Sample barcodes (the table index) are '-'-separated. The first three fields
# joined back together form the patient ID; the second field alone is kept in
# acc_list. Each barcode is split only once.
barcode_fields = [s.split('-') for s in GDSC_decon_df.index]
patient_list = ['-'.join(fields[0:3]) for fields in barcode_fields]
acc_list = [fields[1] for fields in barcode_fields]

# One row per sample, indexed by the full barcode.
meta_df = pd.DataFrame(patient_list, index=GDSC_decon_df.index, columns=['patient_id'])

In [54]:
# Lookup table from the second barcode field to a cancer-type acronym
# (presumably tissue-source-site code -> TCGA study acronym; confirm).
# A code missing from the mapping raises KeyError.
tss_to_acr_dict = pd.read_pickle('../preprocessed_data/TCGA/tss_to_acronym_dict.pkl')
meta_df.loc[:, 'cancer_type'] = [
    tss_to_acr_dict[site_code]
    for site_code in acc_list
]

Calculate the entropy of each sample's deconvolution profile (the ITTH score)

In [58]:
# scipy's entropy normalises its input to sum to 1 and reduces along axis 0.
# Transposing puts samples in columns, so this yields one Shannon entropy
# (natural log) per sample, in the same order as meta_df's index.
# A sample whose values are all 0 would presumably come out as NaN -- verify.
meta_df.loc[:, 'ITTH'] = entropy(GDSC_decon_df.T)

In [62]:
# Preview the first five rows of the assembled metadata table.
meta_df.head(5)

Unnamed: 0_level_0,patient_id,cancer_type,ITTH
Input Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J1,ACC,2.174661
TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J2,ACC,1.734458
TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J3,ACC,1.896308
TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J5,ACC,1.905145
TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J6,ACC,2.045562
