In [1]:
import pandas as pd
import numpy as np

In [2]:
site_group = '30000-sites'


def _auc_path(cohort_dir, analysis_dir):
    """Return the relative path of the AUC file for one cohort/analysis folder pair."""
    return '../../{0}/{1}/logreg_PCA_results/{2}_logreg_results/{2}.AUC.txt'.format(
        cohort_dir, analysis_dir, site_group)


# AUC files keyed by '<cohort>_<1x|ULP>'; insertion order is the order the datasets are listed in
results_dict = {
    'DELFI_1x': _auc_path('delfi_data_cancer_detection', 'number_of_sites_analysis'),
    'DELFI_ULP': _auc_path('delfi_downsampled_cancer_detection', 'number_of_sites_analysis'),
    'LUCAS_1x': _auc_path('lung_validation_cancer_detection', 'number_of_sites_analysis'),
    'LUCAS_ULP': _auc_path('lung_validation_downsample_cancer_detection', 'number_of_sites_analysis'),
    'Validation_1x': _auc_path('lung_validation_cancer_detection', 'validation_analysis'),
    'Validation_ULP': _auc_path('lung_validation_downsample_cancer_detection', 'validation_analysis'),
}

# metadata tables, keyed by cohort (the part of a results_dict key before the '_')
metadata_dict = {
    'DELFI': '../../../griffin_paper/data/delfi_metadata/analysis/matched_metadata_corrected.txt',
}
# LUCAS and Validation point at the same metadata file
metadata_dict['LUCAS'] = metadata_dict['Validation'] = (
    '/fh/scratch/delete90/ha_g/realigned_bams/cfDNA_lung-cancer_EGA_hg38/'
    'metadata/scripts/lung_validation_cfDNA_metadata.tsv'
)

# tumor fraction tables, keyed the same way as metadata_dict
tfx_dict = {
    'DELFI': '../../metadata/delfi/analysis/Delfi_tumor_fraction_data.txt',
}
# LUCAS and Validation point at the same tumor fraction file
tfx_dict['LUCAS'] = tfx_dict['Validation'] = '../../metadata/lung_validation/lung_tumor_fraction_data.txt'

# number of iterations
iterations = 1000

In [3]:
# Build one long table with a row per (dataset, sample): the predicted
# probability, the source file name, the stage and the tumor fraction.
# Per-dataset frames are collected in a list and concatenated once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copied the table on every pass.
dataset_frames = []

for dataset in results_dict.keys():
    # the probabilities file shares its prefix with the AUC file
    current_path = results_dict[dataset].rsplit('.AUC.txt',1)[0]+'.probabilities.txt'
    print(current_path)
    current = pd.read_csv(current_path, sep='\t')

    if not dataset.startswith('Validation'):
        # columns '0' .. str(iterations-1) hold one probability each; summarise them by their median
        current['median_probability'] = current[np.arange(iterations).astype(str)].median(axis = 1)
        current = current[['sample','status','median_probability']].copy()
    else:
        # the validation files provide a single 'probability' column
        current = current[['sample','status','probability']].copy()
    # .copy() above makes this assignment write to an independent frame (no SettingWithCopyWarning)
    current['dataset'] = dataset

    # metadata and tumor fraction tables are keyed by cohort, i.e. the key text before the first '_'
    cohort = dataset.split('_')[0]

    metadata = pd.read_csv(metadata_dict[cohort],sep='\t')
    metadata = metadata.rename(columns = {'new_sample_name':'sample'})

    if not dataset.startswith('DELFI'):
        # for the non-DELFI cohorts file_name is built from file_id and file_format
        metadata['file_name'] = metadata['file_id']+'.'+metadata['file_format']
    metadata = metadata[['sample','file_name','Stage']]

    # inner joins: samples missing from metadata or the tumor fraction table are dropped
    merged = current.merge(metadata, on ='sample')

    tfx = pd.read_csv(tfx_dict[cohort],sep='\t')
    merged = merged.merge(tfx[['sample','tumor_fraction']], on = 'sample')

    if 'Stage' in merged.columns:
        # remove leading/trailing A, B, C characters from the stage label
        merged['Stage']=merged['Stage'].str.strip('ABC')

    #for the validation dataset, combine stage III and IV
    if dataset.startswith('Validation'):
        merged['Stage'] = merged['Stage'].replace('III','III-IV').replace('IV','III-IV')

    # row counts before and after the joins; they differ if any sample was dropped or duplicated
    print(len(current),len(merged))

    dataset_frames.append(merged)

# an empty results_dict still yields an (empty) DataFrame, as before
probabilities = pd.concat(dataset_frames, ignore_index = True) if dataset_frames else pd.DataFrame()



../../delfi_data_cancer_detection/number_of_sites_analysis/logreg_PCA_results/30000-sites_logreg_results/30000-sites.probabilities.txt
423 423
../../delfi_downsampled_cancer_detection/number_of_sites_analysis/logreg_PCA_results/30000-sites_logreg_results/30000-sites.probabilities.txt
423 423
../../lung_validation_cancer_detection/number_of_sites_analysis/logreg_PCA_results/30000-sites_logreg_results/30000-sites.probabilities.txt
287 287
../../lung_validation_downsample_cancer_detection/number_of_sites_analysis/logreg_PCA_results/30000-sites_logreg_results/30000-sites.probabilities.txt
287 287
../../lung_validation_cancer_detection/validation_analysis/logreg_PCA_results/30000-sites_logreg_results/30000-sites.probabilities.txt
431 431
../../lung_validation_downsample_cancer_detection/validation_analysis/logreg_PCA_results/30000-sites_logreg_results/30000-sites.probabilities.txt
431 431


In [4]:
#export delfi data
# The 1-2x WGS rows carry the per-sample annotations (bam file, sample type, tumor fraction, stage).
delfi_1x = probabilities[probabilities['dataset']=='DELFI_1x'].copy()
# sample_type is the sample name with its last '_'-separated field removed.
# n is passed by keyword: pandas >= 2.0 rejects positional arguments after pat.
delfi_1x['sample_type'] = delfi_1x['sample'].str.rsplit('_',n=1,expand=True)[0]
# predicted label is 1 when the median probability is at least 0.5, otherwise 0
delfi_1x['cancer_predicted_1-2x_WGS']=np.where(delfi_1x['median_probability']>=0.5,1,0)
delfi_1x = delfi_1x.rename(columns = {'file_name':'original_bam_file','median_probability':'median_probability_cancer_1-2x_WGS'})
delfi_1x = delfi_1x[['sample','original_bam_file','sample_type','tumor_fraction','Stage','median_probability_cancer_1-2x_WGS','cancer_predicted_1-2x_WGS']]

# The ULP rows contribute only their probability and predicted label.
delfi_ULP = probabilities[probabilities['dataset']=='DELFI_ULP'].copy()
delfi_ULP['cancer_predicted_ULP_WGS']=np.where(delfi_ULP['median_probability']>=0.5,1,0)
delfi_ULP = delfi_ULP.rename(columns = {'median_probability':'median_probability_cancer_ULP_WGS'})
delfi_ULP = delfi_ULP[['sample','median_probability_cancer_ULP_WGS','cancer_predicted_ULP_WGS']]

# one row per sample present in both tables (inner join on 'sample')
delfi = delfi_1x.merge(delfi_ULP, on = 'sample')
delfi.to_csv('files/S6_DELFI_cohort_predictions.tsv', sep='\t', index = False)

In [5]:
# drop the DELFI export frames from the namespace now that the table is written
del delfi, delfi_1x, delfi_ULP

In [6]:
#repeat for LUCAS
# 1-2x WGS rows: add the 0/1 call at the 0.5 cutoff, rename for export, keep the annotated columns
LUCAS_1x = (
    probabilities
    .loc[probabilities['dataset'] == 'LUCAS_1x']
    .assign(**{
        'cancer_predicted_1-2x_WGS':
            lambda frame: np.where(frame['median_probability'] >= 0.5, 1, 0),
    })
    .rename(columns={
        'file_name': 'original_bam_file',
        'median_probability': 'median_probability_cancer_1-2x_WGS',
        'status': 'cancer_present',
    })
    .loc[:, [
        'sample',
        'cancer_present',
        'original_bam_file',
        'tumor_fraction',
        'Stage',
        'median_probability_cancer_1-2x_WGS',
        'cancer_predicted_1-2x_WGS',
    ]]
)

# ULP rows: only the probability and the 0/1 call are kept alongside the sample name
LUCAS_ULP = (
    probabilities
    .loc[probabilities['dataset'] == 'LUCAS_ULP']
    .assign(**{
        'cancer_predicted_ULP_WGS':
            lambda frame: np.where(frame['median_probability'] >= 0.5, 1, 0),
    })
    .rename(columns={'median_probability': 'median_probability_cancer_ULP_WGS'})
    .loc[:, ['sample', 'median_probability_cancer_ULP_WGS', 'cancer_predicted_ULP_WGS']]
)

# join the two tables per sample and write the supplementary table
LUCAS = pd.merge(LUCAS_1x, LUCAS_ULP, on='sample')
LUCAS.to_csv('files/S7_LUCAS_cohort_predictions.tsv', sep='\t', index=False)

In [7]:
# drop the LUCAS export frames from the namespace now that the table is written
del LUCAS, LUCAS_1x, LUCAS_ULP

In [8]:
#repeat for LUCAS validation
# 1-2x WGS rows: these use the single 'probability' column rather than a median
validation_1x = (
    probabilities
    .loc[probabilities['dataset'] == 'Validation_1x']
    .assign(**{
        'cancer_predicted_1-2x_WGS':
            lambda frame: np.where(frame['probability'] >= 0.5, 1, 0),
    })
    .rename(columns={
        'file_name': 'original_bam_file',
        'probability': 'probability_cancer_1-2x_WGS',
        'status': 'cancer_present',
    })
    .loc[:, [
        'sample',
        'cancer_present',
        'original_bam_file',
        'tumor_fraction',
        'Stage',
        'probability_cancer_1-2x_WGS',
        'cancer_predicted_1-2x_WGS',
    ]]
)

# ULP rows: only the probability and the 0/1 call are kept alongside the sample name
validation_ULP = (
    probabilities
    .loc[probabilities['dataset'] == 'Validation_ULP']
    .assign(**{
        'cancer_predicted_ULP_WGS':
            lambda frame: np.where(frame['probability'] >= 0.5, 1, 0),
    })
    .rename(columns={'probability': 'probability_cancer_ULP_WGS'})
    .loc[:, ['sample', 'probability_cancer_ULP_WGS', 'cancer_predicted_ULP_WGS']]
)

# join the two tables per sample and write the supplementary table
validation = pd.merge(validation_1x, validation_ULP, on='sample')
validation.to_csv('files/S8_LUCAS_Validation_cohort_predictions.tsv', sep='\t', index=False)


Unnamed: 0,sample,cancer_present,original_bam_file,tumor_fraction,Stage,probability_cancer_1-2x_WGS,cancer_predicted_1-2x_WGS,probability_cancer_ULP_WGS,cancer_predicted_ULP_WGS
0,CGPLH1000P,0,EGAF00005322703.bam,0.02616,,0.14299,0,0.98338,1
1,CGPLH1001P,0,EGAF00005322704.bam,0.00000,,0.32475,0,0.99999,1
2,CGPLH1002P,0,EGAF00005322705.bam,0.00000,,0.16214,0,0.99226,1
3,CGPLH1003P,0,EGAF00005322706.bam,0.00000,,0.25229,0,0.00295,0
4,CGPLH1004P,0,EGAF00005322707.bam,0.00000,,0.52095,1,0.07703,0
...,...,...,...,...,...,...,...,...,...
426,CGPLLU4P,1,EGAF00005323410.bam,0.00000,II,0.50070,1,0.00340,0
427,CGPLLU5P,1,EGAF00005323509.bam,0.00000,II,0.65261,1,0.00168,0
428,CGPLLU63P,1,EGAF00005323554.bam,0.03618,I,0.49137,0,0.00729,0
429,CGPLLU64P,1,EGAF00005323555.bam,0.01633,I,0.45426,0,0.96647,1
