In [1]:
import os 
import glob
import numpy as np 
import pandas as pd

os.chdir('/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/')
samplesheet_dir = 'results/samplesheets/chipseq/'
batch = 'pieqtls'

## Create the Samplesheet for Running ChIPLine

In [2]:
samples = [['CD4N_merged_donors.phs001703v3p1.Homo_Sapiens.H3K27ac.b1',
           'CD8N_merged_donors.phs001703v3p1.Homo_Sapiens.H3K27ac.b1',
           'Mono_merged_donors.phs001703v3p1.Homo_Sapiens.H3K27ac.b1',
           'NB_merged_donors.phs001703v3p1.Homo_Sapiens.H3K27ac.b1',
           'NK_merged_donors.phs001703v3p1.Homo_Sapiens.H3K27ac.b1',
           'NCM_merged_donors.phs001703v4p1.Homo_Sapiens.H3K27ac.b1']]

data = pd.DataFrame(samples).T
data.columns = ['Sample Name']
data['GSE ID For ChIP-seq Data'] = 'phs001703v3p1'
data['Organism'] = 'Homo_Sapiens'
data['Pulldown'] = 'H3K27ac'
data['GSM ID(s)'] = ''
data['Control GSM(s)'] = 'N/A'

# updating phys for just NCM
data.loc[data['Sample Name'] == 'NCM_merged_donors.phs001703v4p1.Homo_Sapiens.H3K27ac.b1', 'GSE ID For ChIP-seq Data'] = 'phs001703v4p1' 

In [3]:
data

Unnamed: 0,Sample Name,GSE ID For ChIP-seq Data,Organism,Pulldown,GSM ID(s),Control GSM(s)
0,CD4N_merged_donors.phs001703v3p1.Homo_Sapiens....,phs001703v3p1,Homo_Sapiens,H3K27ac,,
1,CD8N_merged_donors.phs001703v3p1.Homo_Sapiens....,phs001703v3p1,Homo_Sapiens,H3K27ac,,
2,Mono_merged_donors.phs001703v3p1.Homo_Sapiens....,phs001703v3p1,Homo_Sapiens,H3K27ac,,
3,NB_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,phs001703v3p1,Homo_Sapiens,H3K27ac,,
4,NK_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,phs001703v3p1,Homo_Sapiens,H3K27ac,,
5,NCM_merged_donors.phs001703v4p1.Homo_Sapiens.H...,phs001703v4p1,Homo_Sapiens,H3K27ac,,


In [4]:
# process only uniqe GSM combos
chipline_ss = data[['Sample Name', 'GSE ID For ChIP-seq Data', 'Organism', 'Pulldown', 'GSM ID(s)', 'Control GSM(s)']]
chipline_ss.loc[:, 'Main SRR(s)'] = chipline_ss['Sample Name']
chipline_ss.loc[:, 'Control SRR(s)'] = 'N/A'

# adding the biorep information 
chipline_ss_tmp_list = []
sort_cols = ['GSE ID For ChIP-seq Data', 'Sample Name', 'Organism', 'Pulldown', 'GSM ID(s)', 'Control GSM(s)']
chipline_ss = chipline_ss.sort_values(sort_cols)
for grp, grp_df in chipline_ss.groupby(['GSE ID For ChIP-seq Data', 'Sample Name', 'Organism', 'Pulldown']):
    grp_df.loc[:, 'Rep No.'] = ['b{}'.format(x) for x in range(1, grp_df.shape[0] + 1)]
    chipline_ss_tmp_list.append(grp_df)
chipline_ss = pd.concat(chipline_ss_tmp_list)

In [5]:
chipline_ss

Unnamed: 0,Sample Name,GSE ID For ChIP-seq Data,Organism,Pulldown,GSM ID(s),Control GSM(s),Main SRR(s),Control SRR(s),Rep No.
0,CD4N_merged_donors.phs001703v3p1.Homo_Sapiens....,phs001703v3p1,Homo_Sapiens,H3K27ac,,,CD4N_merged_donors.phs001703v3p1.Homo_Sapiens....,,b1
1,CD8N_merged_donors.phs001703v3p1.Homo_Sapiens....,phs001703v3p1,Homo_Sapiens,H3K27ac,,,CD8N_merged_donors.phs001703v3p1.Homo_Sapiens....,,b1
2,Mono_merged_donors.phs001703v3p1.Homo_Sapiens....,phs001703v3p1,Homo_Sapiens,H3K27ac,,,Mono_merged_donors.phs001703v3p1.Homo_Sapiens....,,b1
3,NB_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,phs001703v3p1,Homo_Sapiens,H3K27ac,,,NB_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,,b1
4,NK_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,phs001703v3p1,Homo_Sapiens,H3K27ac,,,NK_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,,b1
5,NCM_merged_donors.phs001703v4p1.Homo_Sapiens.H...,phs001703v4p1,Homo_Sapiens,H3K27ac,,,NCM_merged_donors.phs001703v4p1.Homo_Sapiens.H...,,b1


In [6]:
# add the standard sample name 
chipline_ss.loc[:, 'chipseq_std_sample_name'] = chipline_ss['Sample Name']

# organize the reference genome of samples
chipline_ss.loc[:, 'Ref Genome'] = chipline_ss['Organism'].map({'Homo_Sapiens': 'grch38',
                                                                 'Mus_Musculus': 'mm10'})
# # add T2T samples
# chipline_ss_grps = chipline_ss.groupby('Ref Genome')
# t2t_chipline_ss = chipline_ss_grps.get_group('grch38').copy(deep=True)
# t2t_chipline_ss.loc[:, 'Ref Genome'] = 't2t'

# generate teh final chipline samplesheet
# chipline_ss = pd.concat([chipline_ss, t2t_chipline_ss])

In [7]:
# organize the columns 
chipline_ss = chipline_ss[['chipseq_std_sample_name', 'Main SRR(s)', 'Control SRR(s)', 'Ref Genome',
                           'Sample Name', 'GSE ID For ChIP-seq Data',
                            'Organism', 'Pulldown', 'Rep No.', 'GSM ID(s)', 'Control GSM(s)']]

In [8]:
# finalize and save
chipline_ss.fillna('N/A', inplace=True)
chipline_ss_fn = os.path.join(samplesheet_dir, 'chipseq.chipline.{}.txt'.format(batch))
chipline_ss.to_csv(chipline_ss_fn, sep='\t', index=False, header=False)

In [9]:
chipline_ss

Unnamed: 0,chipseq_std_sample_name,Main SRR(s),Control SRR(s),Ref Genome,Sample Name,GSE ID For ChIP-seq Data,Organism,Pulldown,Rep No.,GSM ID(s),Control GSM(s)
0,CD4N_merged_donors.phs001703v3p1.Homo_Sapiens....,CD4N_merged_donors.phs001703v3p1.Homo_Sapiens....,,grch38,CD4N_merged_donors.phs001703v3p1.Homo_Sapiens....,phs001703v3p1,Homo_Sapiens,H3K27ac,b1,,
1,CD8N_merged_donors.phs001703v3p1.Homo_Sapiens....,CD8N_merged_donors.phs001703v3p1.Homo_Sapiens....,,grch38,CD8N_merged_donors.phs001703v3p1.Homo_Sapiens....,phs001703v3p1,Homo_Sapiens,H3K27ac,b1,,
2,Mono_merged_donors.phs001703v3p1.Homo_Sapiens....,Mono_merged_donors.phs001703v3p1.Homo_Sapiens....,,grch38,Mono_merged_donors.phs001703v3p1.Homo_Sapiens....,phs001703v3p1,Homo_Sapiens,H3K27ac,b1,,
3,NB_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,NB_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,,grch38,NB_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,phs001703v3p1,Homo_Sapiens,H3K27ac,b1,,
4,NK_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,NK_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,,grch38,NK_merged_donors.phs001703v3p1.Homo_Sapiens.H3...,phs001703v3p1,Homo_Sapiens,H3K27ac,b1,,
5,NCM_merged_donors.phs001703v4p1.Homo_Sapiens.H...,NCM_merged_donors.phs001703v4p1.Homo_Sapiens.H...,,grch38,NCM_merged_donors.phs001703v4p1.Homo_Sapiens.H...,phs001703v4p1,Homo_Sapiens,H3K27ac,b1,,


## Add the ChIP-seq file that finally maps between HiChIP and ChIP-seq

In [69]:
mapper = data.copy()
mapper.rename(columns={'Replicate Serial No': 'Rep No.'}, inplace=True)
mapper.loc[:, 'Organism'] = mapper.loc[:, 'Organism'].str.replace(' ' , '_') 
mapper.loc[:, 'Rep No.'] = ['b{}'.format(x) for x in mapper.loc[:, 'Rep No.'].astype(int)]
mapper.loc[:, 'chipseq_std_sample_name'] = mapper.apply(create_std_sample_name, axis=1)

def get_chipseq_path(std_sample_name):
    
    res_dir = 'results/peaks/chipline_v2/'
    sample_tmpl = os.path.join(res_dir, '{0}/MACS2_Ext_*/{0}.macs2_peaks.narrowPeak_Q0.01filt'.format(std_sample_name))
    print(sample_tmpl)
    peak_fns = glob.glob(sample_tmpl)
    
    if len(peak_fns) == 1:
        return(peak_fns[0])
    elif len(peak_fns) > 1:
        return('Found multiple files')
    else:
        return('Could not find a matching file')

mapper.loc[:, 'chipseq_path'] = mapper.loc[:, 'chipseq_std_sample_name'].apply(get_chipseq_path)
fn = os.path.join(samplesheet_dir, 'chipseq.tracker.hichip_to_chipseq.paths_mapped.{}.tsv'.format(batch))
mapper.to_csv(fn, sep='\t', index=False, header=True)

KeyError: 'Rep No.'

In [None]:
mapper