In [1]:
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Input/output locations used by the rest of the notebook (all relative paths).
# Directory whose non-hidden entries are each read as one tab-separated result file.
results_path='../../../griffin_paper/delfi_cancer_detection/Ulz_downsampled_analysis/scratch_copies/'
# Tab-separated sample metadata table.
metadata_path = '../../../griffin_paper/data/delfi_metadata/analysis/matched_metadata_corrected.txt'
# Tab-separated tumor fraction table.
# NOTE(review): rooted at '../../metadata' while the two paths above are rooted at
# '../../../griffin_paper' — confirm this difference is intentional.
tfx_path = '../../metadata/delfi/analysis/Delfi_tumor_fraction_data.txt'

# Destination of the merged tab-separated table written near the end of the notebook.
output_path = 'merged_data/Ulz-downsampled_cancer_detection_results_merged.txt'

In [3]:
# Build one row per result file: every TF profile in the file becomes a column
# holding that file's 'HighFreqRange' value, plus a 'bam_name' key column.
# Per-file frames are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
per_sample_rows = []
for item in os.listdir(results_path): #import the results
    if item.startswith('.'):
        continue  # skip hidden entries (names starting with '.')
    result_file = os.path.join(results_path, item)
    current = pd.read_csv(result_file, sep='\t')
    # Split 'TF_profile' from the right on '.' into at most three pieces; only the
    # left-most piece (the TF name) is kept, so dots inside a TF name survive.
    current[['TF_name','number_sites','extra_stuff']] = current['TF_profile'].str.rsplit('.', n=2, expand=True)
    current = current[['TF_name','HighFreqRange']]

    # Transpose so the TF names become the columns of a single-row frame.
    current = current.set_index('TF_name').T.reset_index(drop=True)
    # The sample key is the file name with the results suffix removed.
    current['bam_name'] = item.rsplit('_Accessibility1KSitesAdjusted.txt')[0]

    per_sample_rows.append(current)

if not per_sample_rows:
    # Fail with an explicit message instead of an opaque KeyError on 'bam_name' below.
    raise ValueError('no result files found in ' + results_path)

results = pd.concat(per_sample_rows, ignore_index=True)
# Every column except the 'bam_name' key is a feature.
features = results.set_index('bam_name').columns
print('features', len(features))

features 504


In [4]:
# Sanity check: print the number of rows in `results`, then preview the first rows.
print(len(results))
results.head()

540


TF_name,ADNP,AEBP2,AhR,Androgen,AP-2&#945;,AP-2&#947;,AP-4,ARID1A,ARID1B,ARID2,...,ZNF85,ZNF8,ZNF92,ZSCAN16,ZSCAN22,ZSCAN2,ZSCAN5A,ZSCAN5D,ZXDC,bam_name
0,0.811035,0.809477,0.645685,0.524515,0.698147,0.639548,0.691884,0.670617,1.231582,0.725045,...,0.636365,0.568145,0.675192,0.698984,0.696669,0.920359,0.66765,0.744383,0.640529,PGDX18259P_WGS.sorted_processed
1,0.740001,0.790616,0.651644,0.628981,1.180467,0.596563,1.632324,0.765319,0.656969,0.727409,...,0.473352,0.524055,0.868077,0.676794,1.124909,0.620888,1.171612,0.701792,0.847923,PGDX2750P_WGS_X1.sorted_processed
2,1.033225,1.052757,0.728701,0.811694,0.6715,0.598978,0.847551,1.041598,0.780689,1.391952,...,0.531902,0.837135,0.607291,0.882978,0.579957,0.608522,0.854119,0.88918,0.747744,PGDX5882P_WGS_processed_downsamp
3,0.731452,0.55739,0.422682,0.568415,0.624778,1.209577,0.659978,0.767851,1.226222,0.576228,...,0.566848,0.77766,0.661992,0.667641,0.753801,0.769049,0.572226,0.75257,0.536912,PGDX8828P_WGS.sorted_processed
4,0.789273,0.961033,0.598697,0.570938,0.706001,0.797353,0.734222,0.755581,0.539486,0.944916,...,0.82244,0.660216,0.629517,0.763933,0.681369,0.820655,0.66383,0.818916,1.026617,PGDX3512P5_WGS.sorted_processed


In [5]:
# Load the sample metadata and derive the 'bam_name' key used to join it to the results.
metadata = pd.read_csv(metadata_path, sep='\t')
# Keep everything before the last '.bam' in the file name. `n` is passed by keyword:
# pandas 2.0 made every argument after `pat` keyword-only, so the positional form raises.
metadata['bam_name'] = metadata['file_name'].str.rsplit('.bam', n=1, expand=True)[0]
metadata = metadata[~(metadata['Sample Type']=='Human adult elutriated lymphocytes')] #drop the non-cfDNA samples

In [6]:
# Attach sample name and stage to each result row. Both merges below are inner joins,
# so rows without a match in the metadata or tumor fraction tables are dropped.
merged = results.merge(metadata[['bam_name','new_sample_name','Stage']], on = 'bam_name')
# The sample type is the part of the sample name before the last '_CG'. `n` is passed by
# keyword: pandas 2.0 made every argument after `pat` keyword-only.
merged['sample_type'] = merged['new_sample_name'].str.rsplit('_CG', n=1, expand=True)[0]
# Binary label: 0 for 'Healthy', 1 for every other sample type.
merged['status'] = np.where(merged['sample_type']=='Healthy',0,1)
merged = merged.rename(columns = {'new_sample_name':'sample'})

tumor_fractions = pd.read_csv(tfx_path, sep = '\t')
tumor_fractions = tumor_fractions.rename(columns = {'tfx':'tumor_fraction'})

merged = merged.merge(tumor_fractions[['sample','tumor_fraction']], on = 'sample')

#rename the feature columns
# Iterate over `features` directly instead of copying all feature columns to read their names.
merged = merged.rename(columns = {m:'Ulz_'+m for m in features})


In [7]:
# Sanity check: print the number of rows left after the merges, then preview the first rows.
print(len(merged))
merged.head()

423


Unnamed: 0,Ulz_ADNP,Ulz_AEBP2,Ulz_AhR,Ulz_Androgen,Ulz_AP-2&#945;,Ulz_AP-2&#947;,Ulz_AP-4,Ulz_ARID1A,Ulz_ARID1B,Ulz_ARID2,...,Ulz_ZSCAN2,Ulz_ZSCAN5A,Ulz_ZSCAN5D,Ulz_ZXDC,bam_name,sample,Stage,sample_type,status,tumor_fraction
0,0.811035,0.809477,0.645685,0.524515,0.698147,0.639548,0.691884,0.670617,1.231582,0.725045,...,0.920359,0.66765,0.744383,0.640529,PGDX18259P_WGS.sorted_processed,Healthy_CGPLH640,,Healthy,0,0.0
1,0.740001,0.790616,0.651644,0.628981,1.180467,0.596563,1.632324,0.765319,0.656969,0.727409,...,0.620888,1.171612,0.701792,0.847923,PGDX2750P_WGS_X1.sorted_processed,Breast_Cancer_CGPLBR24,II,Breast_Cancer,1,0.02752
2,1.033225,1.052757,0.728701,0.811694,0.6715,0.598978,0.847551,1.041598,0.780689,1.391952,...,0.608522,0.854119,0.88918,0.747744,PGDX5882P_WGS_processed_downsamp,Colorectal_Cancer_CGCRC292,IV,Colorectal_Cancer,1,0.05273
3,0.731452,0.55739,0.422682,0.568415,0.624778,1.209577,0.659978,0.767851,1.226222,0.576228,...,0.769049,0.572226,0.75257,0.536912,PGDX8828P_WGS.sorted_processed,Colorectal_Cancer_CGCRC341,IV,Colorectal_Cancer,1,0.05058
4,0.498629,0.831827,0.552264,0.60949,0.646907,0.715652,0.869012,1.035716,0.745268,1.526194,...,1.352853,0.604555,0.773505,0.663729,PGDX18251P_WGS.sorted_processed,Healthy_CGPLH324,,Healthy,0,0.0


In [8]:
# Write the merged table as tab-separated text without the row index.
# Create the destination directory first so a missing folder does not make to_csv fail.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
merged.to_csv(output_path,sep='\t',index=False)

In [12]:
#check that there aren't any nulls
# Evaluates to True if any 'Ulz_'-prefixed feature column contains a missing value.
merged[['Ulz_'+m for m in features]].isnull().any().any()

False

In [13]:
# Print the row count of the merged table, then show its (rows, columns) shape.
print(len(merged))
merged.shape

423


(423, 510)

In [14]:
# Shape of the feature-only slice: selects the 'Ulz_'-prefixed column for every name in `features`.
merged[['Ulz_'+feature for feature in features]].shape

(423, 504)

In [15]:
# List the distinct values of the 'Stage' column (missing values appear as NaN).
merged['Stage'].unique()

array([nan, 'II', 'IV', 'I', 'III', '0', 'X'], dtype=object)