In [50]:
import os, json
from os.path import join as pj
import pandas as pd
from tqdm import tqdm

In [51]:
%%capture
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', None)

In [52]:
dir_home      = os.path.expanduser('~')
dir_project   = '/mnt/4TB/TCGA_Lung'
dir_scripts   = pj(dir_project, 'scripts')
dir_analysis  = pj(dir_scripts, '4-analysis')

path_trainGeneImpact  = pj(dir_analysis, '1-matrix_trainGeneImpact.pq')

---

## Genes

In [53]:
df = pd.read_parquet(path_trainGeneImpact)
cols = df.columns.tolist()
cols.remove('vital_status')
cols.remove('file_id')
df_a = df[df['vital_status']=='Alive']
df_d = df[df['vital_status']=='Dead'] 
size_a = df_a.shape[0]
size_d = df_d.shape[0]

score = 0.64
score_str = str(score*100).split('.')[0]

variances = []
for c in tqdm(cols):
    variance = {}
    
    # what percent of survivors are meaningfully mutated?
    num_a = df_a[df_a[c]>score].shape[0]
    # what percent of dead patients are meaningfully mutated?
    num_d = df_d[df_d[c]>score].shape[0]
    
    pct_a = num_a/size_a
    pct_d = num_d/size_d
    
    pct_variance = abs(pct_a - pct_d)
    
    # is the difference in pct meaningful? 
    # e.g. 6% difference could be 0%vs6% or 30%vs36%
    max_diff = max(pct_a, pct_d)
    if (max_diff==0):
        scaled_variance = 0
    else:
        scaled_variance = abs(pct_a - pct_d)/ max_diff
    
    variance['Gene']        = c
    variance['Pct_Alive']   = pct_a
    variance['Pct_Dead']    = pct_d
    variance['Pct_Diff']    = pct_variance
    variance['Scaled_Diff'] = scaled_variance
    variance['PctDiff*ScaledDiff'] = pct_variance * scaled_variance
    
    variances.append(variance)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21141/21141 [00:23<00:00, 913.76it/s]


In [54]:
df_var = pd.DataFrame(variances)

In [55]:
combo = 0.08
combo_str = str(combo*100).split('.')[0]

In [56]:
df_mod = df_var[
    (df_var['PctDiff*ScaledDiff']>=combo) &
    ~(df_var['Gene'].str.contains('_OR|_KRT'))
]

df_mod.sort_values('PctDiff*ScaledDiff', ascending=False).round(2)

Unnamed: 0,Gene,Pct_Alive,Pct_Dead,Pct_Diff,Scaled_Diff,PctDiff*ScaledDiff
4230,MODERATE_SETX,0.0,0.21,0.21,1.0,0.21
6304,MODERATE_MUC12,0.09,0.32,0.23,0.72,0.17
7145,MODERATE_CASP8AP2,0.24,0.04,0.2,0.83,0.16
8798,MODERATE_MYO1A,0.28,0.07,0.21,0.76,0.16
3769,MODERATE_POTEI,0.59,0.29,0.3,0.51,0.15
1344,MODERATE_CROCC,0.7,0.37,0.33,0.47,0.15
2289,MODERATE_IGHV1OR21-1,0.15,0.0,0.15,1.0,0.15
8606,MODERATE_BOP1,0.15,0.0,0.15,1.0,0.15
1275,MODERATE_COL14A1,0.0,0.15,0.15,1.0,0.15
7168,MODERATE_CFAP20DC,0.17,0.01,0.16,0.92,0.15


In [57]:
df_mod[df_mod['Gene'].str.contains('HIGH')]

Unnamed: 0,Gene,Pct_Alive,Pct_Dead,Pct_Diff,Scaled_Diff,PctDiff*ScaledDiff
150,HIGH_MYL1,0.543478,0.287671,0.255807,0.470685,0.120405
203,HIGH_PIEZO1,0.173913,0.054795,0.119119,0.684932,0.081588
217,HIGH_PRDM12,0.369565,0.178082,0.191483,0.518131,0.099213
268,HIGH_TDG,0.673913,0.424658,0.249256,0.369863,0.09219
274,HIGH_TMBIM4,0.630435,0.328767,0.301668,0.478507,0.14435
5534,HIGH_NBPF26,0.108696,0.013699,0.094997,0.873973,0.083025
5594,HIGH_VCP,0.217391,0.082192,0.1352,0.621918,0.084083
7830,HIGH_CEBPZ,0.217391,0.054795,0.162597,0.747945,0.121613
9877,HIGH_NLRP14,0.108696,0.0,0.108696,1.0,0.108696
9903,HIGH_TM7SF2,0.152174,0.041096,0.111078,0.729941,0.08108


---

In [62]:
diff_genes = df_mod['Gene'].tolist()

In [63]:
len(diff_genes)

255

In [64]:
diff_genes

['HIGH_MYL1',
 'HIGH_PIEZO1',
 'HIGH_PRDM12',
 'HIGH_TDG',
 'HIGH_TMBIM4',
 'MODERATE_ABCA10',
 'MODERATE_ADAMTS8',
 'MODERATE_ADGRF5',
 'MODERATE_AEN',
 'MODERATE_AGER',
 'MODERATE_AKR1C3',
 'MODERATE_AMH',
 'MODERATE_ANKRD30A',
 'MODERATE_ANKRD30B',
 'MODERATE_ARHGEF10',
 'MODERATE_ATG4B',
 'MODERATE_ATP10B',
 'MODERATE_BCAM',
 'MODERATE_BMP8A',
 'MODERATE_BRWD1',
 'MODERATE_BUB1B',
 'MODERATE_C2CD3',
 'MODERATE_C8orf48',
 'MODERATE_CCDC170',
 'MODERATE_CCDC178',
 'MODERATE_CCDC40',
 'MODERATE_CEACAM6',
 'MODERATE_CEP170',
 'MODERATE_CEP170B',
 'MODERATE_COBLL1',
 'MODERATE_COL14A1',
 'MODERATE_COL23A1',
 'MODERATE_CPT1B',
 'MODERATE_CR1L',
 'MODERATE_CROCC',
 'MODERATE_CYP2C19',
 'MODERATE_DBH',
 'MODERATE_DLK1',
 'MODERATE_DNAH1',
 'MODERATE_DRC7',
 'MODERATE_DSCAML1',
 'MODERATE_DTNB',
 'MODERATE_EEF2K',
 'MODERATE_EFCAB8',
 'MODERATE_EGFLAM',
 'MODERATE_EPHX1',
 'MODERATE_ESPL1',
 'MODERATE_FAM120B',
 'MODERATE_FAM131C',
 'MODERATE_FGL1',
 'MODERATE_FOXM1',
 'MODERATE_FRMD4B',
 '

---

In [42]:
cols = ['file_id','vital_status'] + diff_genes

In [43]:
path_train = pj(dir_analysis,'1-matrix_trainGeneImpact.pq')

In [44]:
df_train = pd.read_parquet(path_train)

In [45]:
df_train = df_train.filter(cols)

In [46]:
path_diffTrain = pj(dir_analysis, f"2-diffTrain_score{score_str}_combo{combo_str}.pq")

In [47]:
df_train.to_parquet(path_diffTrain)

In [48]:
print(path_diffTrain.split('/')[-1])

2-diffTrain_score64_combo12.pq
