In [1]:
import pandas as pd
import numpy as np

from statsmodels.stats.multitest import fdrcorrection

from scipy.stats import linregress
from sklearn.metrics import mean_squared_error

import os

In [2]:
# Input tables, one per analysis group. The dictionary key is later written
# into the `group` column of the rows loaded from the corresponding file.
in_files = {
    # features computed without copy-number correction
    'no-CNA-correction':
        '../../MBC/number_of_TFBS_sites_analysis/merged_data/10000-sites_reformatted.txt',
    # features computed after copy-number correction
    'CNA-corrected-TFBS':
        '../../MBC/CNA_correction_100kb_TFBS_10ksites_np/analysis/merged_data/CNA-corrected-TFBS_reformatted.txt',
}


In [3]:
# Load every group's tab-separated table, label its rows with the group name,
# keep only the samples that pass both thresholds, and stack the groups into
# one long DataFrame (`data`).
MIN_TUMOR_FRACTION = 0.1   # inclusive lower bound on `tumor_fraction`
MIN_ULP_WGS_COVERAGE = 0.1  # inclusive lower bound on `ulp_wgs_coverage`

group_frames = []
for group, path in in_files.items():
    current = pd.read_csv(path, sep='\t')
    current['group'] = group
    passes_filters = (
        (current['tumor_fraction'] >= MIN_TUMOR_FRACTION)
        & (current['ulp_wgs_coverage'] >= MIN_ULP_WGS_COVERAGE)
    )
    current = current[passes_filters]
    print(len(current))  # number of samples retained for this group
    group_frames.append(current)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration. pd.concat raises on an
# empty list, so fall back to an empty frame to keep the original behaviour
# when `in_files` has no entries.
data = pd.concat(group_frames, ignore_index=True) if group_frames else pd.DataFrame()

191
191


In [4]:
# Select the per-site feature columns by their name prefix.
features = data.columns[(data.columns.str.startswith('central_cov')) | (data.columns.str.startswith('mean_cov')) | (data.columns.str.startswith('amplitude'))]

# Split each column name once at its LAST underscore and keep the right-hand
# part (the site name); unique() keeps the order of first appearance.
# `n` is passed by keyword: pandas 2.0 accepts only `pat` positionally in
# str.rsplit, so the former positional `1` raises a TypeError there.
site_names = list(features.str.rsplit('_', n=1, expand=True).droplevel(0).unique())
print(len(site_names))

377


In [5]:
# Sanity check: list the distinct group labels present after loading.
data.loc[:, 'group'].unique()

array(['no-CNA-correction', 'CNA-corrected-TFBS'], dtype=object)

In [6]:
# For every site and every group, regress each feature type on tumor fraction
# and record Pearson r, its p-value, the fitted slope/intercept and the RMSE of
# the fit. Result: `stat_results[group]` is a DataFrame with one row per site.
groups = ['no-CNA-correction', 'CNA-corrected-TFBS']
feature_types = ['central_coverage', 'mean_coverage', 'amplitude']

# The rows belonging to a group do not depend on the site, so select them once
# instead of re-filtering `data` inside the site loop.
samples_by_group = {group: data[data['group'] == group] for group in groups}

# Rows are accumulated as plain dicts and turned into DataFrames once at the
# end. DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call, which made the loop quadratic in the number of sites.
records = {group: [] for group in groups}

for k, site_name in enumerate(site_names):
    if k % 50 == 0:
        print(k, site_name)  # progress marker every 50 sites
    for group in groups:
        current = samples_by_group[group]
        tumor_fraction = current['tumor_fraction']
        current_outputs = {'site_name': site_name}
        for feature_type in feature_types:
            feature = feature_type + '_' + site_name
            # Pearson r plus regression line (feature as a function of tumor fraction)
            fit = linregress(tumor_fraction, current[feature])
            fit_line = fit.slope * tumor_fraction + fit.intercept
            # Root-mean-square error of the observed values around the fitted line
            RMSE = np.sqrt(mean_squared_error(current[feature], fit_line))
            current_outputs['pearson-r_' + feature_type] = fit.rvalue
            current_outputs['pearson-pval_' + feature_type] = fit.pvalue
            current_outputs['lin-reg-slope_' + feature_type] = fit.slope
            current_outputs['lin-reg-intercept_' + feature_type] = fit.intercept
            current_outputs['lin-reg-RMSE_' + feature_type] = RMSE
        records[group].append(current_outputs)

stat_results = {group: pd.DataFrame(records[group]) for group in groups}


0 AHR.hg38.10000
50 ELK4.hg38.10000
100 HIF3A.hg38.10000
150 MEF2A.hg38.10000
200 OTX2.hg38.10000
250 SMAD5.hg38.10000
300 TLX1.hg38.10000
350 ZNF316.hg38.10000


In [7]:
#perform fdr correction
# The correction is applied separately for each group and feature type, across
# all rows (sites) of that group's table; the adjusted p-values are stored in a
# new 'pearson-adjusted-p-val_<feature_type>' column.
for group in ['no-CNA-correction', 'CNA-corrected-TFBS']:
    for feature_type in ['central_coverage','mean_coverage','amplitude']:
        pvals = stat_results[group]['pearson-pval_'+feature_type]
        # `significant` (per-row rejection flags at alpha) is not used by this cell
        significant,qval = fdrcorrection(pvals, alpha=0.05)
        stat_results[group]['pearson-adjusted-p-val_'+feature_type] = qval


In [8]:
#export for supplemental table
# Columns written to each group's TSV, in output order: for every feature type
# the Pearson r, raw p-value, FDR-adjusted p-value and regression RMSE.
keep_cols = ['site_name', 
             'pearson-r_central_coverage','pearson-pval_central_coverage', 'pearson-adjusted-p-val_central_coverage', 'lin-reg-RMSE_central_coverage',
             'pearson-r_mean_coverage', 'pearson-pval_mean_coverage','pearson-adjusted-p-val_mean_coverage','lin-reg-RMSE_mean_coverage',
             'pearson-r_amplitude', 'pearson-pval_amplitude','pearson-adjusted-p-val_amplitude','lin-reg-RMSE_amplitude']

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('files', exist_ok=True)

for group in ['no-CNA-correction', 'CNA-corrected-TFBS']:
    stat_results[group][keep_cols].to_csv('files/'+group+'.correlations.tsv', sep='\t', index=False)


In [9]:
# Put the two groups' per-site statistics side by side (matched on site name);
# the suffix on each column tells which group it came from.
a = stat_results['no-CNA-correction'].set_index('site_name')
b = stat_results['CNA-corrected-TFBS'].set_index('site_name')
c = a.merge(b, left_index = True, right_index = True, suffixes = ['.no-CNA-correction','.CNA-corrected'])

# For each feature type, report how many sites (and what fraction of all
# sites) have a strictly larger regression RMSE without CNA correction than
# with it.
for feature_type in ['central_coverage','mean_coverage','amplitude']:
    rmse_col = 'lin-reg-RMSE_' + feature_type
    rmse_uncorrected = c[rmse_col + '.no-CNA-correction']
    rmse_corrected = c[rmse_col + '.CNA-corrected']
    n_sites_improved = (rmse_uncorrected > rmse_corrected).sum()
    print(feature_type)
    print(n_sites_improved)
    print(np.round(n_sites_improved / len(c), 3))

central_coverage
157
0.416
mean_coverage
0
0.0
amplitude
294
0.78
