In [81]:
import os, json
from os.path import join as pj
import pandas as pd
from tqdm import tqdm

In [82]:
%%capture
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', None)

In [83]:
dir_home      = os.path.expanduser('~')
dir_project   = '/mnt/4TB/TCGA_Liver'
dir_scripts   = pj(dir_project, 'scripts')
dir_analysis  = pj(dir_scripts, '4-analysis')

path_trainGeneImpact  = pj(dir_analysis, '1-matrix_trainGeneImpact.pq')

---

## Genes

In [84]:
df = pd.read_parquet(path_trainGeneImpact)
cols = df.columns.tolist()
cols.remove('vital_status')
cols.remove('file_id')
df_a = df[df['vital_status']=='Alive']
df_d = df[df['vital_status']=='Dead'] 
size_a = df_a.shape[0]
size_d = df_d.shape[0]

score = 0.64
score_str = str(score*100).split('.')[0]

variances = []
for c in tqdm(cols):
    variance = {}
    
    # what percent of survivors are meaningfully mutated?
    num_a = df_a[df_a[c]>score].shape[0]
    # what percent of dead patients are meaningfully mutated?
    num_d = df_d[df_d[c]>score].shape[0]
    
    pct_a = num_a/size_a
    pct_d = num_d/size_d
    
    pct_variance = pct_a - pct_d
    abs_variance = abs(pct_variance)
    
    # is the difference in pct meaningful? 
    # e.g. 6% difference could be 0%vs6% or 30%vs36%
    max_diff = max(pct_a, pct_d)
    if (max_diff==0):
        scaled_variance = 0
    else:
        scaled_variance = abs_variance/max_diff
    
    score_diff = pct_variance * scaled_variance
    OncoScore  = abs_variance * scaled_variance
    # Helps accentuate points on plot. Surprisingly tricky to get this feeling right.
    score_size = (OncoScore * 100) ** 2
    
    variance['Gene']        = c
    variance['Pct_Alive']   = pct_a
    variance['Pct_Dead']    = pct_d
    variance['Pct_Diff']    = pct_variance
    variance['Abs_Diff']    = abs_variance
    variance['Scaled_Diff'] = scaled_variance
    variance['Score_Diff']  = score_diff
    variance['OncoScore']   = OncoScore
    variance['Score_Size']  = score_size
    
    variances.append(variance)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19188/19188 [00:27<00:00, 691.21it/s]


In [85]:
df_var = pd.DataFrame(variances)

In [86]:
df_var.to_parquet('2-gene_diff.pq')

^ The pancancer scripts use the concatenated gene column, so don't delete this file

---

In [87]:
df_polish = df_var.sort_values('OncoScore', ascending=False)

In [88]:
df_polish = df_polish[
        ~(df_polish['Gene'].str.contains('_OR|_KRT|_ZNF'))
]

In [89]:
df_polish[['VEP_Impact', 'Gene']] = df_polish['Gene'].str.split('_', expand=True)

In [90]:
df_polish['VEP_Impact'] = df_polish['VEP_Impact'].str.title()

In [91]:
df_polish = df_polish.filter([
    'Gene',
    'VEP_Impact',
    'Pct_Alive',
    'Pct_Dead',
    'Pct_Diff',
    'Abs_Diff',
    'Scaled_Diff',
    'Score_Diff',
    'OncoScore',
    'Score_Size'
])

In [92]:
df_polish["Pct_Alive"]   = df_polish["Pct_Alive"] * 100
df_polish["Pct_Dead"]    = df_polish["Pct_Dead"] * 100
df_polish["Pct_Diff"]    = df_polish["Pct_Diff"] * 100
df_polish["Abs_Diff"]    = df_polish["Abs_Diff"] * 100
df_polish["Scaled_Diff"] = df_polish["Scaled_Diff"] * 100
df_polish["Score_Diff"]  = df_polish["Score_Diff"] * 100
df_polish["OncoScore"]   = df_polish["OncoScore"] * 100
# Already multiplied
df_polish["Score_Size"]  = df_polish["Score_Size"]

In [93]:
df_polish = df_polish.round(2).reset_index(drop=True)

In [94]:
df_polish.to_parquet("2-geneDiff_polish.pq")

---

In [95]:
combo = 0.16
combo_str = str(combo*100).split('.')[0]

In [96]:
df_mod = df_var[
    (df_var['AbsDiff*ScaledDiff']>=combo) &
    ~(df_var['Gene'].str.contains('_OR|_KRT')) &
    ~(df_var['Gene'].str.startswith('HIGH_'))
]

df_mod.sort_values('AbsDiff*ScaledDiff', ascending=False).round(2)

KeyError: 'AbsDiff*ScaledDiff'

In [None]:
mods = df_mod.sort_values('AbsDiff*ScaledDiff', ascending=False)['Pct_Dead']

In [None]:
[print(m) for m in mods]

In [None]:
combo_high = 0.09

In [None]:
df_high = df_var[           
    (df_var['AbsDiff*ScaledDiff']>=combo_high) &
    (df_var['Gene'].str.startswith('HIGH_')) &
    ~(df_var['Gene'].str.contains('_OR|_KRT'))
]

df_high.sort_values('AbsDiff*ScaledDiff', ascending=False).round(2)

In [None]:
highs = df_high.sort_values('AbsDiff*ScaledDiff', ascending=False)['Pct_Dead']

In [None]:
[print(h) for h in highs]

---

In [None]:
diff_genes = df_mod['Gene'].tolist()

In [None]:
len(diff_genes)

In [None]:
diff_genes

---

In [None]:
cols = ['file_id','vital_status'] + diff_genes

In [None]:
path_train = pj(dir_analysis,'1-matrix_trainGeneImpact.pq')

In [None]:
df_train = pd.read_parquet(path_train)

In [None]:
df_train = df_train.filter(cols)

In [None]:
path_diffTrain = pj(dir_analysis, f"2-diffTrain_score{score_str}_combo{combo_str}.pq")

In [None]:
df_train.to_parquet(path_diffTrain)

In [None]:
print(path_diffTrain.split('/')[-1])