In [1]:
import pandas as pd
import numpy as np

from statsmodels.stats.multitest import fdrcorrection

from scipy.stats import linregress
from sklearn.metrics import mean_squared_error

import os

In [2]:
# Directory that is scanned for the '10000-sites_reformatted*' input tables.
# It is a relative path, so it resolves against the kernel's current working directory.
in_dir = '../../MBC/filtering_and_correction_analysis/merged_data'

In [3]:
# Load every '10000-sites_reformatted*' table found in in_dir into a single frame.
# The file name is split on its last three dots; the two middle fields are stored
# in the 'correction' and 'filter' columns so later cells can subset on them.
frames = []
for item in sorted(os.listdir(in_dir)):  # os.listdir order is arbitrary; sort for reproducible row order
    if not item.startswith('10000-sites_reformatted'):
        continue
    current = pd.read_csv(os.path.join(in_dir, item), sep='\t')
    _, correction, filtered, _ = item.rsplit('.', 3)
    current['correction'] = correction
    current['filter'] = filtered
    # keep only rows with tumor_fraction >= 0.1 and ulp_wgs_coverage >= 0.1
    current = current[(current['tumor_fraction'] >= 0.1) & (current['ulp_wgs_coverage'] >= 0.1)]
    frames.append(current)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame for every file. pd.concat raises on an empty
# list, so fall back to the empty frame the loop would otherwise have left behind.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [4]:
# Feature columns are the ones whose name starts with one of three feature-type prefixes.
is_feature = (
    data.columns.str.startswith('central_cov')
    | data.columns.str.startswith('mean_cov')
    | data.columns.str.startswith('amplitude')
)
features = data.columns[is_feature]

# Split each column name at its last underscore into (feature type, site name),
# then drop the feature-type level to keep the distinct site names.
# `n` must be passed by keyword: it is keyword-only from pandas 2.0 onwards.
site_names = list(features.str.rsplit('_', n=1, expand=True).droplevel(0).unique())
print(len(site_names))

377


In [5]:
# For every site and every (filter, correction) subset, regress each feature on
# tumor fraction and record Pearson r, its p-value, slope, intercept and the RMSE
# of the fitted line. Result: stat_results[filter][correction] is a DataFrame
# with one row per site.
filters = ['exclusion_filtered']
corrections = ['uncorrected','GC_corrected','GC_map_corrected']
feature_types = ['central_coverage','mean_coverage','amplitude']

# The row subset for a (filter, correction) pair does not depend on the site,
# so compute it once here instead of once per site inside the loop.
subsets = {}
records = {}
for filtered in filters:
    for correction in corrections:
        subsets[(filtered, correction)] = data[(data['filter']==filtered) & (data['correction']==correction)]
        records[(filtered, correction)] = []

for k,site_name in enumerate(site_names):
    if k%50==0:
        print(k,site_name)
    for filtered in filters:
        for correction in corrections:
            current = subsets[(filtered, correction)]
            current_outputs = {'site_name':site_name}
            for feature_type in feature_types:
                feature = feature_type + '_' + site_name
                #pearson r plus regression line
                lin_reg_slope, lin_reg_intercept, pearson_r, pearson_pval, lin_reg_std_err = linregress(current['tumor_fraction'],current[feature])
                fit_line = lin_reg_slope*current['tumor_fraction']+lin_reg_intercept
                RMSE = np.sqrt(mean_squared_error(current[feature],fit_line))
                current_outputs['pearson-r_'+feature_type] = pearson_r
                current_outputs['pearson-pval_'+feature_type] = pearson_pval
                current_outputs['lin-reg-slope_'+feature_type] = lin_reg_slope
                current_outputs['lin-reg-intercept_'+feature_type] = lin_reg_intercept
                current_outputs['lin-reg-RMSE_'+feature_type] = RMSE
            records[(filtered, correction)].append(current_outputs)

# Build each result frame once from its list of row dicts. DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
stat_results = {}
for filtered in filters:
    stat_results[filtered] = {}
    for correction in corrections:
        stat_results[filtered][correction] = pd.DataFrame(records[(filtered, correction)])


0 AHR.hg38.10000
50 ELK4.hg38.10000
100 HIF3A.hg38.10000
150 MEF2A.hg38.10000
200 OTX2.hg38.10000
250 SMAD5.hg38.10000
300 TLX1.hg38.10000
350 ZNF316.hg38.10000


In [6]:
#perform fdr correction
# The Pearson p-values are corrected across sites, separately for each
# (filter, correction, feature type) combination, and stored in a new
# 'pearson-adjusted-p-val_<feature type>' column of the same frame.
for filtered in ['exclusion_filtered']:
    for correction in ['uncorrected','GC_corrected','GC_map_corrected']:
        results = stat_results[filtered][correction]  # same object as in the dict; edited in place
        for feature_type in ['central_coverage','mean_coverage','amplitude']:
            # `significant` (first return value of fdrcorrection) is not used below
            significant,qval = fdrcorrection(results['pearson-pval_'+feature_type], alpha=0.05)
            results['pearson-adjusted-p-val_'+feature_type] = qval


In [7]:
#export for supplemental table
# One tab-separated file per (filter, correction) pair, restricted to keep_cols.
keep_cols = ['site_name', 
             'pearson-r_central_coverage','pearson-pval_central_coverage', 'pearson-adjusted-p-val_central_coverage', 'lin-reg-RMSE_central_coverage',
             'pearson-r_mean_coverage', 'pearson-pval_mean_coverage','pearson-adjusted-p-val_mean_coverage','lin-reg-RMSE_mean_coverage',
             'pearson-r_amplitude', 'pearson-pval_amplitude','pearson-adjusted-p-val_amplitude','lin-reg-RMSE_amplitude']

# to_csv does not create missing directories; make sure the output folder exists
# so the export also works on a fresh checkout.
os.makedirs('files', exist_ok=True)

for filtered in ['exclusion_filtered']: 
    for correction in ['uncorrected','GC_corrected','GC_map_corrected']:
        stat_results[filtered][correction][keep_cols].to_csv('files/'+filtered+'.'+correction+'.tsv', sep='\t', index=False)


In [8]:
# Compare regression RMSE before and after GC correction, site by site.
# For each feature type, print the number and the fraction of sites whose
# uncorrected RMSE is larger than the GC-corrected RMSE.
a = stat_results['exclusion_filtered']['uncorrected'].set_index('site_name')
b = stat_results['exclusion_filtered']['GC_corrected'].set_index('site_name')
c = pd.merge(a, b, left_index=True, right_index=True, suffixes=['.uncorrected','.GC_corrected'])

for feature_type in ['central_coverage','mean_coverage','amplitude']:
    rmse_col = 'lin-reg-RMSE_'+feature_type
    rmse_is_lower_after_gc = c[rmse_col+'.uncorrected'] > c[rmse_col+'.GC_corrected']
    n_lower_after_gc = sum(rmse_is_lower_after_gc)
    print(feature_type)
    print(n_lower_after_gc)
    print(np.round(n_lower_after_gc/len(c),3))

central_coverage
351
0.931
mean_coverage
325
0.862
amplitude
32
0.085


In [9]:
# Show the 20 sites with the smallest adjusted Pearson p-value for central
# coverage in the GC-corrected results, together with their Pearson r.
feature_type = 'central_coverage'
sortby = f'pearson-adjusted-p-val_{feature_type}'
pearsonr_col = f'pearson-r_{feature_type}'
gc_corrected_stats = stat_results['exclusion_filtered']['GC_corrected']
(
    gc_corrected_stats
    .sort_values(by=sortby)
    .loc[:, ['site_name', sortby, pearsonr_col]]
    .head(20)
)



Unnamed: 0,site_name,pearson-adjusted-p-val_central_coverage,pearson-r_central_coverage
140,LYL1.hg38.10000,6.061251999999999e-20,0.630011
148,MECOM.hg38.10000,7.672481e-20,0.625303
93,GRHL2.hg38.10000,9.298828e-20,-0.622231
266,SPIB.hg38.10000,6.751328e-19,0.610243
265,SPI1.hg38.10000,2.023478e-18,0.603027
237,RUNX1.hg38.10000,1.023599e-15,0.565193
22,CEBPA.hg38.10000,1.5235e-12,0.51406
63,ETV6.hg38.10000,1.750733e-12,0.512018
274,STAT5B.hg38.10000,1.069741e-11,0.497174
141,MAF.hg38.10000,5.629816e-11,0.482915


In [10]:
# Show the 20 sites with the smallest adjusted Pearson p-value for mean
# coverage in the GC-corrected results, together with their Pearson r.
feature_type = 'mean_coverage'
sortby = f'pearson-adjusted-p-val_{feature_type}'
pearsonr_col = f'pearson-r_{feature_type}'
gc_corrected_stats = stat_results['exclusion_filtered']['GC_corrected']
(
    gc_corrected_stats
    .sort_values(by=sortby)
    .loc[:, ['site_name', sortby, pearsonr_col]]
    .head(20)
)



Unnamed: 0,site_name,pearson-adjusted-p-val_mean_coverage,pearson-r_mean_coverage
140,LYL1.hg38.10000,4.7807060000000005e-18,0.607145
265,SPI1.hg38.10000,1.997666e-15,0.56789
148,MECOM.hg38.10000,2.736684e-15,0.563358
316,ZBTB16.hg38.10000,1.720175e-14,0.549562
178,NFIB.hg38.10000,3.003085e-14,-0.543099
275,STAT6.hg38.10000,3.003085e-14,0.543633
93,GRHL2.hg38.10000,8.329239e-14,-0.535024
237,RUNX1.hg38.10000,1.11847e-12,0.515368
22,CEBPA.hg38.10000,4.024055e-12,0.504792
63,ETV6.hg38.10000,1.007926e-11,0.496194


In [11]:
# Show the 20 sites with the smallest adjusted Pearson p-value for amplitude
# in the GC-corrected results, together with their Pearson r.
feature_type = 'amplitude'
sortby = f'pearson-adjusted-p-val_{feature_type}'
pearsonr_col = f'pearson-r_{feature_type}'
gc_corrected_stats = stat_results['exclusion_filtered']['GC_corrected']
(
    gc_corrected_stats
    .sort_values(by=sortby)
    .loc[:, ['site_name', sortby, pearsonr_col]]
    .head(20)
)



Unnamed: 0,site_name,pearson-adjusted-p-val_amplitude,pearson-r_amplitude
265,SPI1.hg38.10000,6.3848190000000004e-27,-0.700461
266,SPIB.hg38.10000,5.819564e-21,-0.638181
29,CTCF.hg38.10000,4.246896e-20,-0.626252
268,SRF.hg38.10000,2.01888e-13,-0.532745
318,ZBTB2.hg38.10000,3.892119e-13,-0.526504
120,IRF1.hg38.10000,8.563114e-13,-0.519465
159,MSC.hg38.10000,4.364373e-11,-0.487926
123,IRF4.hg38.10000,4.71561e-11,-0.486206
121,IRF2.hg38.10000,9.2935e-11,-0.478745
140,LYL1.hg38.10000,9.2935e-11,-0.479303
