In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from scipy.stats import mannwhitneyu
import time


In [2]:
# Input files (all read as tab-separated text by later cells).
# Sample metadata: later cells use its 'cohort', 'submitter_id', 'BRCA_scmod2' and 'Library_Name' columns.
MBC_pt_metadata = '../../raw_TCGA/sample_info.txt'
# Raw per-library peak counts: one row per peak, with 'name', 'seqnames', 'start', 'end', 'score' plus one column per library.
MBC_counts_path = '../../raw_TCGA/BRCA_raw_counts.txt'
# NOTE(review): not read by any of the cells visible here -- confirm whether it is still needed.
MBC_peak_loc_path = '../../raw_TCGA/BRCA_peak_locations.txt'

# Output files.
# One group label ('ER_pos' / 'ER_neg') per line, in the same order as the count-matrix columns.
out_file_labels = '../all_sites/labels_for_DEseq2.txt'
# Peak-by-sample count matrix (technical replicates summed per sample).
out_file_data = '../all_sites/BRCA_peak_counts_for_DEseq2.txt'

In [3]:
# Load the ATAC patient metadata and split the BRCA samples into ER+, ER- and
# "other" groups based on the 'BRCA_scmod2' subtype string.
#
# Produces:
#   pt_data  - one row per (submitter_id, subtype) for BRCA samples with a subtype
#   ER_pos   - array of submitter ids whose subtype contains 'ER+'
#   ER_neg   - array of submitter ids whose subtype contains 'ER-'
#   ER_other - rows of pt_data that are neither ER+ nor ER- (sanity check only)

#import the atac patient metadata
pt_data = pd.read_csv(MBC_pt_metadata, sep='\t')

#get only breast cancer data and keep only the sample name and subtype
pt_data = pt_data[pt_data['cohort']=='BRCA'][['submitter_id','BRCA_scmod2']]

#drop replicates (the metadata has one row per library, so a sample appears once per library)
pt_data = pt_data.drop_duplicates()
print('total_brca_patient_samples',len(pt_data))

#drop samples without subtype
pt_data = pt_data[pt_data['BRCA_scmod2'].notnull()]
print('patient samples with subtype info',len(pt_data),'\n')

#build the group masks once.
#regex=False makes these literal substring matches, so '+' needs no escaping
#(the previous 'ER\+' was an invalid escape sequence inside a normal string literal)
is_ER_pos = pt_data['BRCA_scmod2'].str.contains('ER+', regex=False)
is_ER_neg = pt_data['BRCA_scmod2'].str.contains('ER-', regex=False)

#get list of ER positive samples
ER_pos_rows = pt_data[is_ER_pos]

#print the subtypes for these samples (there are multiple subtypes within ER positive)
print('ER_pos',ER_pos_rows['BRCA_scmod2'].unique())

#get just the pt ids
ER_pos = ER_pos_rows['submitter_id'].values
print('ER+',len(ER_pos),'\n')

#repeat for ER negative
ER_neg_rows = pt_data[is_ER_neg]
print('ER_neg',ER_neg_rows['BRCA_scmod2'].unique())

#get just the pt ids
ER_neg = ER_neg_rows['submitter_id'].values
print('ER-',len(ER_neg),'\n')

#sanity check, get the samples that are neither ER+ nor ER-
ER_other = pt_data[~is_ER_pos & ~is_ER_neg]
print('other',ER_other['BRCA_scmod2'].unique())
print('other', len(ER_other))

total_brca_patient_samples 74
patient samples with subtype info 70 

ER_pos ['ER+/HER2- High Prolif' 'ER+/HER2- Low Prolif']
ER+ 44 

ER_neg ['ER-/HER2-']
ER- 15 

other ['HER2+']
other 11


In [4]:
# Chromosome names chr1 through chr22; peaks on any other sequence name are filtered out later.
chroms = [f'chr{number}' for number in range(1, 23)]

In [5]:
# Load the raw peak-count table, trim the library column names, index by peak
# name and keep only peaks located on the chromosomes listed in `chroms`.

#peak data for each sample
peak_counts = pd.read_csv(MBC_counts_path,sep='\t')

#drop the last part of each column name to match the library names
#(split once from the right on '_' and keep everything before that last '_';
# names without '_' are left unchanged).
#`n` must be passed by keyword: pandas >= 2.0 rejects it as a positional argument.
peak_counts.columns = peak_counts.columns.str.rsplit('_', n=1, expand=True).droplevel(1)

peak_counts = peak_counts.rename(columns = {'name':'peak_name'}).set_index('peak_name')

#number of peaks before filtering
print(len(peak_counts))

#keep only peaks on the chromosomes in `chroms`
peak_counts = peak_counts[peak_counts['seqnames'].isin(chroms)]
#number of peaks after filtering
print(len(peak_counts))

peak_counts.head()

215920
211938


Unnamed: 0_level_0,seqnames,start,end,score,BRCA_000CFD9F_ADDF_4304_9E60_6041549E189C_X017_S06_L011_B1_T1,BRCA_000CFD9F_ADDF_4304_9E60_6041549E189C_X017_S06_L012_B1_T2,BRCA_01112370_4F6F_4A20_9BE0_7975C3465268_X017_S04_L007_B1_T1,BRCA_01112370_4F6F_4A20_9BE0_7975C3465268_X017_S04_L008_B1_T2,BRCA_0142AAAC_FFE8_43B7_AB99_02F7A1740567_X022_S06_L057_B1_T1,BRCA_0142AAAC_FFE8_43B7_AB99_02F7A1740567_X022_S06_L058_B1_T2,...,BRCA_EF17C882_9808_4676_9DFA_432D34290B33_X023_S15_L101_B1_T1,BRCA_EF17C882_9808_4676_9DFA_432D34290B33_X023_S15_L102_B1_T2,BRCA_FB055B59_7512_40E4_8547_39798A4C9B8C_X011_S09_L017_B1_T1,BRCA_FB055B59_7512_40E4_8547_39798A4C9B8C_X011_S09_L018_B1_T2,BRCA_FB1C995E_6C78_414A_B74C_8C77CD924348_X015_S09_L041_B1_T1,BRCA_FB1C995E_6C78_414A_B74C_8C77CD924348_X015_S09_L042_B1_T2,BRCA_FCD2477B_7E05_4EB7_BD63_302496AEA537_X017_S11_L021_B1_T1,BRCA_FCD2477B_7E05_4EB7_BD63_302496AEA537_X017_S11_L022_B1_T2,BRCA_FE43880C_3F93_4463_9C91_5A2DE7130718_X009_S11_L045_B1_T1,BRCA_FE43880C_3F93_4463_9C91_5A2DE7130718_X009_S11_L046_B1_T2
peak_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRCA_2,chr1,17232,17733,1.711525,22,18,14,14,15,10,...,13,12,13,14,25,15,14,8,5,11
BRCA_3,chr1,180632,181133,1.812572,23,13,31,45,17,10,...,59,33,60,41,12,40,27,24,19,30
BRCA_4,chr1,181205,181706,7.213276,37,26,51,67,36,48,...,104,78,70,66,35,52,46,45,37,43
BRCA_5,chr1,183555,184056,1.343504,29,19,28,29,5,15,...,61,36,18,18,31,22,49,36,25,16
BRCA_6,chr1,184245,184746,2.552184,14,13,12,18,17,18,...,69,44,18,6,28,22,34,22,10,14


In [6]:
# Collapse technical replicates: for every ER+ and ER- sample, add up the
# count columns of all libraries that belong to that sample.

# start from the peak annotation columns; one summed-count column per sample is appended below
annotation_cols = ['seqnames', 'start', 'end', 'score']
sample_peak_sums = peak_counts.loc[:, annotation_cols].copy()

# library -> sample lookup table taken from the metadata file
libraries = pd.read_csv(MBC_pt_metadata, sep='\t').loc[:, ['Library_Name', 'submitter_id']]

# ER+ samples first, then ER-, so the columns are appended in that order
for sample in list(ER_pos) + list(ER_neg):
    belongs_to_sample = libraries['submitter_id'] == sample
    current_libraries = libraries.loc[belongs_to_sample, 'Library_Name'].values
    replicate_counts = peak_counts[current_libraries]
    # the selection should yield exactly one column per library name
    if replicate_counts.shape[1] != len(current_libraries):
        print('problem!')
    sample_peak_sums[sample] = replicate_counts.sum(axis=1)

# free the per-library table; only the per-sample sums are used from here on
del peak_counts

In [7]:
# Display the per-sample table: peak annotation columns followed by one summed-count column per sample.
sample_peak_sums

Unnamed: 0_level_0,seqnames,start,end,score,TCGA-A7-A13F,TCGA-BH-A0DP,TCGA-A2-A0EW,TCGA-A7-A0CH,TCGA-A2-A0T4,TCGA-AR-A0TV,...,TCGA-AR-A0U0,TCGA-AO-A12F,TCGA-D8-A13Z,TCGA-BH-A0E0,TCGA-C8-A12K,TCGA-S3-AA0Z,TCGA-BH-A0DL,TCGA-A2-A0SX,TCGA-AO-A124,TCGA-C8-A12V
peak_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRCA_2,chr1,17232,17733,1.711525,40,25,41,9,13,20,...,21,1,47,30,18,28,48,87,37,31
BRCA_3,chr1,180632,181133,1.812572,36,27,77,58,67,34,...,131,36,80,61,40,25,168,116,81,105
BRCA_4,chr1,181205,181706,7.213276,63,84,148,24,114,88,...,143,23,252,192,64,21,247,140,208,365
BRCA_5,chr1,183555,184056,1.343504,48,20,47,15,39,150,...,65,32,164,210,21,48,45,67,143,53
BRCA_6,chr1,184245,184746,2.552184,27,35,38,14,43,159,...,37,16,171,163,16,42,43,46,77,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BRCA_211949,chr22,50713480,50713981,4.839397,30,138,124,45,178,8,...,20,428,63,33,488,305,19,116,111,702
BRCA_211950,chr22,50732063,50732564,1.573881,70,90,201,90,189,329,...,66,69,51,65,124,33,145,85,27,146
BRCA_211951,chr22,50756924,50757425,2.061515,97,197,143,86,160,193,...,67,202,112,72,129,166,146,72,120,371
BRCA_211952,chr22,50783390,50783891,9.473388,278,639,676,700,612,1392,...,442,446,452,481,390,611,563,177,281,1762


In [8]:
# Build the count matrix that is written out for DESeq2: ER+ sample columns
# first, then ER-, with the peak annotation columns left out.
sample_columns = list(ER_pos) + list(ER_neg)
generic_names = [f'ER_pos_{i}' for i in range(len(ER_pos))] + [f'ER_neg_{i}' for i in range(len(ER_neg))]

# rows are ordered by peak name; the index is called 'Peak_Name' (same index name as the old version)
output_df = sample_peak_sums[sample_columns].sort_index().rename_axis('Peak_Name')
# replace the sample ids with positional group names (ER_pos_0, ..., ER_neg_0, ...)
output_df.columns = generic_names
output_df.to_csv(out_file_data, sep='\t')

In [9]:
# One group label per sample, in the same order as the count-matrix columns
# (all ER+ samples first, then all ER- samples); written one label per line
# with no header and no index.
group_labels = ['ER_pos'] * len(ER_pos) + ['ER_neg'] * len(ER_neg)
labels = pd.Series(group_labels)
labels.to_csv(out_file_labels, sep='\t', index=False, header=False)

In [10]:
# Display the matrix that was written to out_file_data (rows sorted by peak name, columns renamed to ER_pos_* / ER_neg_*).
output_df

Unnamed: 0_level_0,ER_pos_0,ER_pos_1,ER_pos_2,ER_pos_3,ER_pos_4,ER_pos_5,ER_pos_6,ER_pos_7,ER_pos_8,ER_pos_9,...,ER_neg_5,ER_neg_6,ER_neg_7,ER_neg_8,ER_neg_9,ER_neg_10,ER_neg_11,ER_neg_12,ER_neg_13,ER_neg_14
Peak_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BRCA_10,408,1350,1666,1385,1778,2439,1710,456,987,956,...,1381,119,616,733,1457,810,1847,619,682,2132
BRCA_100,423,1017,938,1073,1026,2085,2008,215,480,775,...,991,185,725,1314,990,864,1244,425,834,1833
BRCA_1000,20,84,54,70,119,38,79,18,45,48,...,429,35,61,135,67,159,114,58,42,495
BRCA_10000,93,145,193,174,125,187,157,44,110,116,...,176,125,116,137,103,128,94,65,192,518
BRCA_100000,102,97,72,139,55,632,230,14,121,33,...,12,13,19,7,21,5,8,15,6,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BRCA_99995,90,70,48,167,60,380,213,13,71,85,...,19,14,27,8,17,6,10,21,13,19
BRCA_99996,203,140,200,340,119,1318,573,20,136,122,...,18,8,10,9,16,11,18,6,14,15
BRCA_99997,49,44,61,134,39,298,167,8,66,38,...,10,6,9,6,17,6,11,6,5,13
BRCA_99998,51,45,47,49,60,187,100,13,28,40,...,6,13,11,4,12,6,10,13,3,10
