In [1]:
import os 
import glob
import numpy as np 
import pandas as pd

os.chdir('/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/')
samplesheet_dir = 'results/samplesheets/chipseq/'
batch = 'batch1'

## Load the GSM Data

In [2]:
fn = os.path.join(samplesheet_dir, 'ChIP-seq-Tracker-Batch1-2016-10.22.23-Step0.HiChIP-to-ChIP-seq-Map.tsv')
data = pd.read_table(fn, skiprows=1)
data = data.loc[data['Has ChIP-seq?'] == 'Yes']

In [3]:
data = pd.read_table(fn, skiprows=1)

data = data.loc[data['Has ChIP-seq?'] == 'Yes']

In [4]:
chipseq_gse_lists = data['GSM ID(s)'].dropna().tolist() + data['Control GSM(s)'].dropna().tolist()
chipseq_gsms = []
for gse_list in chipseq_gse_lists:
    gsm_ids = [x.strip() for x in gse_list.split(',')]
    chipseq_gsms.extend(gsm_ids)
    
chipseq_gsms = sorted(set(chipseq_gsms))

In [5]:
data = data.loc[data['Has ChIP-seq?'] == 'Yes']

Query the below within the SRA Run Selector: https://www.ncbi.nlm.nih.gov/Traces/study/

In [6]:
','.join(chipseq_gsms)

'GSM2572581,GSM2572582,GSM2572583,GSM2572584,GSM2572585,GSM2572586,GSM2572587,GSM2572588,GSM2572589,GSM2572590,GSM2773998,GSM2773999,GSM2816615,GSM2816616,GSM2816617,GSM2816618,GSM2816619,GSM2816620,GSM2816625,GSM2816626,GSM2816627,GSM2816628,GSM2816629,GSM2816630,GSM2816654,GSM2816655,GSM2861704,GSM2861707,GSM2915165,GSM2915166,GSM2915167,GSM2915168,GSM3018462,GSM3018463,GSM3018477,GSM3032900,GSM3032905,GSM3032906,GSM3032911,GSM3082014,GSM3082015,GSM3082016,GSM3082017,GSM3082018,GSM3082019,GSM3106259,GSM3106260,GSM3106269,GSM3106270,GSM3106271,GSM3106272,GSM3106273,GSM3106274,GSM3106277,GSM3106278,GSM3106281,GSM3106282,GSM3106283,GSM3106284,GSM3106285,GSM3106286,GSM3106287,GSM3106289,GSM3106290,GSM3106291,GSM3210231,GSM3210233,GSM3212820,GSM3212825,GSM3212826,GSM3212836,GSM3212840,GSM3212841,GSM3212870,GSM3212875,GSM3212876,GSM3212886,GSM3212890,GSM3212891,GSM3263163,GSM3263167,GSM3314497,GSM3314498,GSM3314499,GSM3664982,GSM3664983,GSM3664984,GSM3664985,GSM3664986,GSM3664987,GSM366498

In [7]:
len(chipseq_gsms)

242

## Parse the Queried Data

In [8]:
# load the data
sra_table_fn = os.path.join(samplesheet_dir, 'chipseq.{}.SraRunTable.txt'.format(batch))
gsm_to_srr_data = pd.read_table(sra_table_fn, sep=',')
gsm_to_srr_data = gsm_to_srr_data[['Sample Name', 'Run']]
gsm_to_srr_data.columns = ['gsm_id', 'srr_id']
gsm_to_srr_data.sort_values('gsm_id', inplace=True)

In [9]:
# # manually adding missing samples, these are from batch 2 not batch 1
# adding_missing_entries = [['GSM7336681', 'SRR16538542'],
#                           ['GSM7336682', 'SRR16538550'],
#                           ['GSM7336684', 'SRR16538541'],
#                           ['GSM7336685', 'SRR16538549'],
# ]
# adding_missing_entries = pd.DataFrame(adding_missing_entries, columns=gsm_to_srr_data.columns.tolist())
# gsm_to_srr_data = pd.concat([gsm_to_srr_data, adding_missing_entries], axis=0)

## Check for Discrepancies

In [10]:
original_gses = set(chipseq_gsms)

In [11]:
queried_gses = set(gsm_to_srr_data.gsm_id.unique())

In [12]:
len(queried_gses)

243

In [13]:
original_gses.difference(queried_gses)

set()

Not sure how I got these extra stragglers 

In [14]:
queried_gses.difference(original_gses)

{'GSM3263164'}

In [15]:
gsm_to_srr_data.gsm_id.value_counts()

gsm_id
GSM5379675    8
GSM5379674    8
GSM5379673    8
GSM5379671    4
GSM5379672    4
             ..
GSM3314499    1
GSM3664982    1
GSM3664983    1
GSM3664984    1
GSM5455046    1
Name: count, Length: 243, dtype: int64

## Create the Samplesheet for Downloading

In [16]:
# get unique SRRs 

In [17]:
srr_ids = sorted(gsm_to_srr_data.srr_id.unique())

In [18]:
srr_ids_fn = os.path.join(samplesheet_dir, 'chipseq.srr_ids.{}.txt'.format(batch))
with open(srr_ids_fn, 'w') as fw:
    fw.write('\n'.join(srr_ids))

In [19]:
srr_ids_fn

'results/samplesheets/chipseq/chipseq.srr_ids.batch1.txt'

## Create the Samplesheet for Concatination

In [20]:
# get Combinations of SRRs that should be concatinated
gsm_srr_combos_data = []

# main GSMs
for gsm_ids_combo in data['GSM ID(s)'].tolist():
    gsm_ids = gsm_ids_combo.replace(' ', '').split(',')
    gsm_combo_reps = [gsm_ids_combo] * len(gsm_ids)
    split_data = list(zip(gsm_combo_reps, gsm_ids))
    gsm_srr_combos_data.extend(split_data)    
    
# control GSMs
for gsm_ids_combo in data['Control GSM(s)'].tolist():
    
    if type(gsm_ids_combo) == str:
        gsm_ids = gsm_ids_combo.replace(' ', '').split(',')
        gsm_combo_reps = [gsm_ids_combo] * len(gsm_ids)
        split_data = list(zip(gsm_combo_reps, gsm_ids))
    else:
        split_data = [[np.nan, np.nan]]
    gsm_srr_combos_data.extend(split_data)    

In [21]:
gsm_srr_combos_data = pd.DataFrame(gsm_srr_combos_data, columns=['GSM ID(s)', 'split_gsm_ids']).drop_duplicates()
gsm_srr_combos_data = gsm_srr_combos_data.merge(gsm_to_srr_data, left_on='split_gsm_ids', right_on='gsm_id', how='outer', indicator=True)
gsm_srr_combos_data = gsm_srr_combos_data.loc[~gsm_srr_combos_data.gsm_id.isna()]

In [22]:
gsm_srr_combos_data_agg = gsm_srr_combos_data.groupby('GSM ID(s)').agg({'srr_id': '-'.join})
srr_combos = sorted(gsm_srr_combos_data_agg.srr_id.unique())

In [23]:
srr_combos_fn = os.path.join(samplesheet_dir, 'chipseq.srr_id.combos.{}.txt'.format(batch))
with open(srr_combos_fn, 'w') as fw:
    fw.write('\n'.join(srr_combos))

### len(srr_combos)

## Create the Samplesheet for Alignment of Control SRRs

#### Generate a samplesheet that contains the organism

In [24]:
# extracting gsms
df1 = data[['GSM ID(s)', 'Organism']].rename(columns={'GSM ID(s)': 'gsm_ids'})
df1['sample_type'] = 'main'
df2 = data[['Control GSM(s)', 'Organism']].rename(columns={'Control GSM(s)': 'gsm_ids'})
df2['sample_type'] = 'control'

# concating the gsm dfs
gsm_to_organism = pd.concat([df1, df2], axis=0).dropna().drop_duplicates()

# adding srr information
gsm_to_organism = gsm_to_organism.merge(gsm_srr_combos_data_agg, left_on='gsm_ids', right_index=True)

# extract control only
gsm_to_organism_control = gsm_to_organism.loc[gsm_to_organism.sample_type == 'control']

# adding reference genome information
human_subset = gsm_to_organism_control.loc[gsm_to_organism_control['Organism'] == 'Homo Sapiens']
mouse_subset = gsm_to_organism_control.loc[gsm_to_organism_control['Organism'] == 'Mus Musculus']

grch38_subset = human_subset.copy().sort_values(['srr_id'])
grch38_subset.loc[:, 'ref_genome'] = 'grch38'

t2t_subset = human_subset.copy().sort_values(['srr_id'])
t2t_subset.loc[:, 'ref_genome'] = 't2t'

mm10_subset = mouse_subset.copy().sort_values(['srr_id'])
mouse_subset.loc[:, 'ref_genome'] = 'mm10'

# concating all the subsets for a final samplesheet
gsm_to_organism_control = pd.concat([grch38_subset, mouse_subset, t2t_subset])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_subset.loc[:, 'ref_genome'] = 'mm10'


In [25]:
# extract srr and ref_genome
gsm_to_organism_control = gsm_to_organism_control[['srr_id', 'ref_genome']]

In [26]:
samplesheet_path = os.path.join(samplesheet_dir, 'chipseq.srr_ids_to_ref_genome.combos.{}.txt'.format(batch))
gsm_to_organism_control.to_csv(samplesheet_path, sep='\t', header=False, index=False)

## Create the Samplesheet for Running ChIPLine

In [33]:
chipline_ss

Unnamed: 0,chipseq_std_sample_name,Main SRR(s),Control SRR(s),Ref Genome,Sample Name,GSE ID For ChIP-seq Data,Organism,Pulldown,Rep No.,GSM ID(s),Control GSM(s)
0,H9.GSE105028.Homo_Sapiens.CTCF.b1,SRR6177938,,grch38,H9,GSE105028,Homo_Sapiens,CTCF,b1,GSM2816619,
5,H9.GSE105028.Homo_Sapiens.H3K4me1.b1,SRR6177973,,grch38,H9,GSE105028,Homo_Sapiens,H3K4me1,b1,GSM2816654,
9,H9.GSE105028.Homo_Sapiens.KLF4.b1,SRR6177946,,grch38,H9,GSE105028,Homo_Sapiens,KLF4,b1,GSM2816627,
12,H9.GSE105028.Homo_Sapiens.NANOG.b1,SRR6177944,,grch38,H9,GSE105028,Homo_Sapiens,NANOG,b1,GSM2816625,
16,H9.GSE105028.Homo_Sapiens.OCT4.b1,SRR6177948,,grch38,H9,GSE105028,Homo_Sapiens,OCT4,b1,GSM2816629,
...,...,...,...,...,...,...,...,...,...,...,...
194,SUCCS1-siEA.GSE180198.Homo_Sapiens.H3K27ac.b1,SRR15163142-SRR15163145,,t2t,SUCCS1-siEA,GSE180198,Homo_Sapiens,H3K27ac,b1,"GSM5455043, GSM5455046",
196,MDA-MB-231-PVT1sgRNAR2.GSE97584.Homo_Sapiens.H...,SRR5442293-SRR5442294,SRR5442299-SRR5442300,t2t,MDA-MB-231-PVT1sgRNAR2,GSE97584,Homo_Sapiens,H3K27ac,b1,"GSM2572583, GSM2572584","GSM2572589, GSM2572590"
198,MDA-MB-231-PVT1sgRNAR3.GSE97584.Homo_Sapiens.H...,SRR5442295-SRR5442296,,t2t,MDA-MB-231-PVT1sgRNAR3,GSE97584,Homo_Sapiens,H3K27ac,b1,"GSM2572585, GSM2572586",
200,MDA-MB-231-sgRNA-Ctrl.GSE97584.Homo_Sapiens.H3...,SRR5442291-SRR5442292,SRR5442297-SRR5442298,t2t,MDA-MB-231-sgRNA-Ctrl,GSE97584,Homo_Sapiens,H3K27ac,b1,"GSM2572581, GSM2572582","GSM2572587, GSM2572588"


In [27]:
# process only uniqe GSM combos
chipline_ss = data[['Sample Name', 'GSE ID For ChIP-seq Data', 'Organism', 'Pulldown', 'GSM ID(s)', 'Control GSM(s)']]
chipline_ss = chipline_ss.drop_duplicates(subset=['GSM ID(s)'])

# add the srr information
chipline_ss = chipline_ss.merge(gsm_srr_combos_data_agg, left_on='GSM ID(s)', right_index=True, how='left')
chipline_ss.rename(columns={'srr_id': 'Main SRR(s)'}, inplace=True)
chipline_ss = chipline_ss.merge(gsm_srr_combos_data_agg, left_on='Control GSM(s)', right_index=True, how='left')
chipline_ss.rename(columns={'srr_id': 'Control SRR(s)'}, inplace=True)

# add organism infor
chipline_ss.loc[:, 'Organism'] = chipline_ss.loc[:, 'Organism'].str.replace('Homo Sapiens', 'Homo_Sapiens')
chipline_ss.loc[:, 'Organism'] = chipline_ss.loc[:, 'Organism'].str.replace('Mus Musculus', 'Mus_Musculus')

# adding the biorep information 
chipline_ss_tmp_list = []
sort_cols = ['GSE ID For ChIP-seq Data', 'Sample Name', 'Organism', 'Pulldown', 'GSM ID(s)', 'Control GSM(s)']
chipline_ss = chipline_ss.sort_values(sort_cols)
for grp, grp_df in chipline_ss.groupby(['GSE ID For ChIP-seq Data', 'Sample Name', 'Organism', 'Pulldown']):
    grp_df.loc[:, 'Rep No.'] = ['b{}'.format(x) for x in range(1, grp_df.shape[0] + 1)]
    chipline_ss_tmp_list.append(grp_df)
chipline_ss = pd.concat(chipline_ss_tmp_list)

In [28]:
# add the standard sample name 
def create_std_sample_name(sr):
    name = '{}.{}.{}.{}.{}'.format(sr['Sample Name'], sr['GSE ID For ChIP-seq Data'],
                                   sr['Organism'], sr['Pulldown'], sr['Rep No.'])
    return(name)
chipline_ss.loc[:, 'chipseq_std_sample_name'] = chipline_ss.apply(create_std_sample_name, axis=1)

# organize the reference genome of samples
chipline_ss.loc[:, 'Ref Genome'] = chipline_ss['Organism'].map({'Homo_Sapiens': 'grch38',
                                                                 'Mus_Musculus': 'mm10'})
# add T2T samples
chipline_ss_grps = chipline_ss.groupby('Ref Genome')
t2t_chipline_ss = chipline_ss_grps.get_group('grch38').copy(deep=True)
t2t_chipline_ss.loc[:, 'Ref Genome'] = 't2t'

In [29]:
# generate teh final chipline samplesheet
chipline_ss = pd.concat([chipline_ss, t2t_chipline_ss])

In [30]:
# organize the columns 
chipline_ss = chipline_ss[['chipseq_std_sample_name', 'Main SRR(s)', 'Control SRR(s)', 'Ref Genome',
                           'Sample Name', 'GSE ID For ChIP-seq Data',
                            'Organism', 'Pulldown', 'Rep No.', 'GSM ID(s)', 'Control GSM(s)']]

In [31]:
# finalize and save
chipline_ss.fillna('N/A', inplace=True)
chipline_ss_fn = os.path.join(samplesheet_dir, 'chipseq.chipline.{}.txt'.format(batch))
chipline_ss.to_csv(chipline_ss_fn, sep='\t', index=False, header=False)

## Add the ChIP-seq file that finally maps between HiChIP and ChIP-seq

In [32]:
mapper = data.copy()
mapper.rename(columns={'Replicate Serial No': 'Rep No.'}, inplace=True)
mapper.loc[:, 'Organism'] = mapper.loc[:, 'Organism'].str.replace(' ' , '_') 
mapper.loc[:, 'Rep No.'] = ['b{}'.format(x) for x in mapper.loc[:, 'Rep No.'].astype(int)]
mapper.loc[:, 'chipseq_std_sample_name'] = mapper.apply(create_std_sample_name, axis=1)

def get_chipseq_path(std_sample_name):
    
    res_dir = 'results/peaks/chipline_v2/'
    sample_tmpl = os.path.join(res_dir, '{0}/MACS2_Ext_*/{0}.macs2_peaks.narrowPeak_Q0.01filt'.format(std_sample_name))
    print(sample_tmpl)
    peak_fns = glob.glob(sample_tmpl)
    
    if len(peak_fns) == 1:
        return(peak_fns[0])
    elif len(peak_fns) > 1:
        return('Found multiple files')
    else:
        return('Could not find a matching file')

mapper.loc[:, 'chipseq_path'] = mapper.loc[:, 'chipseq_std_sample_name'].apply(get_chipseq_path)
fn = os.path.join(samplesheet_dir, 'chipseq.tracker.hichip_to_chipseq.paths_mapped.{}.tsv'.format(batch))
mapper.to_csv(fn, sep='\t', index=False, header=True)

results/peaks/chipline_v2/H9.GSE105028.Homo_Sapiens.CTCF.b1/MACS2_Ext_*/H9.GSE105028.Homo_Sapiens.CTCF.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/H9.GSE105028.Homo_Sapiens.CTCF.b1/MACS2_Ext_*/H9.GSE105028.Homo_Sapiens.CTCF.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/H9.GSE105028.Homo_Sapiens.CTCF.b1/MACS2_Ext_*/H9.GSE105028.Homo_Sapiens.CTCF.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/H9-HS.GSE105028.Homo_Sapiens.CTCF.b1/MACS2_Ext_*/H9-HS.GSE105028.Homo_Sapiens.CTCF.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/H9-HS.GSE105028.Homo_Sapiens.CTCF.b1/MACS2_Ext_*/H9-HS.GSE105028.Homo_Sapiens.CTCF.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/H9.GSE105028.Homo_Sapiens.H3K4me1.b1/MACS2_Ext_*/H9.GSE105028.Homo_Sapiens.H3K4me1.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/H9.GSE105028.Homo_Sapiens.H3K4me1.b1/MACS2_Ext_*/H9.GSE105028.Homo_Sapiens.H3K4me1.b1.macs2_peaks.narrowPeak_Q0.01filt
res

results/peaks/chipline_v2/CD34+-Cord-Blood.GSE107147.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_*/CD34+-Cord-Blood.GSE107147.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/DCM_1.GSE165303.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_*/DCM_1.GSE165303.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/DCM_10.GSE165303.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_*/DCM_10.GSE165303.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/DCM_2.GSE165303.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_*/DCM_2.GSE165303.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/DCM_3.GSE165303.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_*/DCM_3.GSE165303.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/DCM_4.GSE165303.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_*/DCM_4.GSE165303.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt
results/peaks/chipline_v2/DCM_5.GSE165303.Homo_Sapiens.H3K27ac.b1/MACS

In [None]:
mapper