In [1]:
import os 
import glob
import numpy as np 
import pandas as pd

## Load the GSM Data

In [2]:
fn = './Batch-2-ChIP-seq-Tracker_8.16.22-10.23.23_All-Tracker.tsv'
data = pd.read_table(fn, skiprows=1)
data = data.loc[data['Has ChIP-seq?'] == 'Yes']

In [3]:
chipseq_gse_lists = data['GSM ID(s)'].dropna().tolist() + data['Control GSM(s)'].dropna().tolist()
chipseq_gsms = []
for gse_list in chipseq_gse_lists:
    gsm_ids = [x.strip() for x in gse_list.split(',')]
    chipseq_gsms.extend(gsm_ids)
    
chipseq_gsms = sorted(set(chipseq_gsms))

Query the below within the SRA Run Selector: https://www.ncbi.nlm.nih.gov/Traces/study/

In [4]:
','.join(chipseq_gsms)

'GSM4952699,GSM4952700,GSM4952701,GSM4952702,GSM4952703,GSM4952704,GSM4952705,GSM4952706,GSM4952707,GSM4952708,GSM4952709,GSM4952710,GSM5034230,GSM5034231,GSM5513456,GSM5625199,GSM5625200,GSM5625201,GSM5625202,GSM5625207,GSM5625208,GSM5625209,GSM5625210,GSM5904685,GSM5904686,GSM5904687,GSM5904688,GSM5904689,GSM5904690,GSM5904691,GSM5904692,GSM5904693,GSM5904694,GSM5904695,GSM5904696,GSM6016304,GSM6016305,GSM6016306,GSM6016307,GSM6016308,GSM6016309,GSM6016310,GSM6016311,GSM6016312,GSM6016313,GSM6016314,GSM6016315,GSM6226246,GSM6226256,GSM6226257,GSM6514267,GSM6514268,GSM6514269,GSM6514270,GSM6514271,GSM6514272,GSM6568001,GSM6568006,GSM6568007,GSM6568008,GSM6568016,GSM6568017,GSM6568227,GSM6568228,GSM6568229,GSM6568230,GSM6568231,GSM6568232,GSM6568233,GSM6568234,GSM6568237,GSM6568238,GSM6568239,GSM6585290,GSM6585291,GSM6614089,GSM6614090,GSM6614091,GSM6620326,GSM6620327,GSM6620328,GSM6620329,GSM6620330,GSM6620331,GSM6620332,GSM6620333,GSM6715172,GSM6715173,GSM6715174,GSM6715175,GSM671517

In [5]:
len(chipseq_gsms)

125

## Parse the Queried Data

In [6]:
# load the data
gsm_to_srr_data = pd.read_table('./SraRunTable.txt', sep=',')
gsm_to_srr_data = gsm_to_srr_data[['Sample Name', 'Run']]
gsm_to_srr_data.columns = ['gsm_id', 'srr_id']
gsm_to_srr_data.sort_values('gsm_id', inplace=True)

In [7]:
# manually adding missing samples
adding_missing_entries = [['GSM7336681', 'SRR16538542'],
                          ['GSM7336682', 'SRR16538550'],
                          ['GSM7336684', 'SRR16538541'],
                          ['GSM7336685', 'SRR16538549'],
]
adding_missing_entries = pd.DataFrame(adding_missing_entries, columns=gsm_to_srr_data.columns.tolist())
gsm_to_srr_data = pd.concat([gsm_to_srr_data, adding_missing_entries], axis=0)

## Check for Discrepancies

In [8]:
original_gses = set(chipseq_gsms)

In [9]:
queried_gses = set(gsm_to_srr_data.gsm_id.unique())

In [10]:
len(queried_gses)

125

In [11]:
original_gses.difference(queried_gses)

set()

Not sure how I got these extra stragglers 

In [12]:
queried_gses.difference(original_gses)

set()

In [13]:
gsm_to_srr_data.gsm_id.value_counts()

GSM4952699    3
GSM4952706    3
GSM4952700    3
GSM4952710    3
GSM4952709    3
             ..
GSM6016313    1
GSM6016312    1
GSM6016311    1
GSM6016310    1
GSM7336685    1
Name: gsm_id, Length: 125, dtype: int64

## Create the Samplesheet for Downloading

In [51]:
# get unique SRRs 

In [14]:
srr_ids = sorted(gsm_to_srr_data.srr_id.unique())

In [15]:
with open('srr_ids.txt', 'w') as fw:
    fw.write('\n'.join(srr_ids))

## Create the Samplesheet for Concatination

In [53]:
# get Combinations of SRRs that should be concatinated
gsm_srr_combos_data = []

# main GSMs
for gsm_ids_combo in data['GSM ID(s)'].tolist():
    gsm_ids = gsm_ids_combo.replace(' ', '').split(',')
    gsm_combo_reps = [gsm_ids_combo] * len(gsm_ids)
    split_data = list(zip(gsm_combo_reps, gsm_ids))
    gsm_srr_combos_data.extend(split_data)    
    
# control GSMs
for gsm_ids_combo in data['Control GSM(s)'].tolist():
    
    if type(gsm_ids_combo) == str:
        gsm_ids = gsm_ids_combo.replace(' ', '').split(',')
        gsm_combo_reps = [gsm_ids_combo] * len(gsm_ids)
        split_data = list(zip(gsm_combo_reps, gsm_ids))
    else:
        split_data = [[np.nan, np.nan]]
    gsm_srr_combos_data.extend(split_data)    

In [17]:
gsm_srr_combos_data = pd.DataFrame(gsm_srr_combos_data, columns=['GSM ID(s)', 'split_gsm_ids']).drop_duplicates()
gsm_srr_combos_data = gsm_srr_combos_data.merge(gsm_to_srr_data, left_on='split_gsm_ids', right_on='gsm_id', how='outer', indicator=True)
gsm_srr_combos_data = gsm_srr_combos_data.loc[~gsm_srr_combos_data.gsm_id.isna()]

In [18]:
gsm_srr_combos_data_agg = gsm_srr_combos_data.groupby('GSM ID(s)').agg({'srr_id': '-'.join})
srr_combos = sorted(gsm_srr_combos_data_agg.srr_id.unique())

In [19]:
with open('srr_id.combos.txt', 'w') as fw:
    fw.write('\n'.join(srr_combos))

In [20]:
len(srr_combos)

81

## Create the Samplesheet for Alignment of Control SRRs

#### Generate a samplesheet that contains the organism

In [41]:
# extracting gsms
df1 = data[['GSM ID(s)', 'Organism']].rename(columns={'GSM ID(s)': 'gsm_ids'})
df1['sample_type'] = 'main'
df2 = data[['Control GSM(s)', 'Organism']].rename(columns={'Control GSM(s)': 'gsm_ids'})
df2['sample_type'] = 'control'

# concating the gsm dfs
gsm_to_organism = pd.concat([df1, df2], axis=0).dropna().drop_duplicates()

# adding srr information
gsm_to_organism = gsm_to_organism.merge(gsm_srr_combos_data_agg, left_on='gsm_ids', right_index=True)

# extract control only
gsm_to_organism_control = gsm_to_organism.loc[gsm_to_organism.sample_type == 'control']

# adding reference genome information
human_subset = gsm_to_organism_control.loc[gsm_to_organism_control['Organism'] == 'Homo sapiens']
mouse_subset = gsm_to_organism_control.loc[gsm_to_organism_control['Organism'] == 'Mus musculus']

grch38_subset = human_subset.copy().sort_values(['srr_id'])
grch38_subset.loc[:, 'ref_genome'] = 'grch38'

t2t_subset = human_subset.copy().sort_values(['srr_id'])
t2t_subset.loc[:, 'ref_genome'] = 't2t'

mm10_subset = mouse_subset.copy().sort_values(['srr_id'])
mouse_subset.loc[:, 'ref_genome'] = 'mm10'

# concating all the subsets for a final samplesheet
gsm_to_organism_control = pd.concat([grch38_subset, mouse_subset, t2t_subset])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_subset.loc[:, 'ref_genome'] = 'mm10'


In [46]:
# extract srr and ref_genome
gsm_to_organism_control = gsm_to_organism_control[['srr_id', 'ref_genome']]

In [50]:
samplesheet_path = 'chipseq_srr_ids_to_ref_genome.combos.batch2.txt'
gsm_to_organism_control.to_csv(samplesheet_path, sep='\t', header=False, index=False)

## Create the Samplesheet for Running ChIPLine

In [278]:
# process only uniqe GSM combos
chipline_ss = data[['Sample Name', 'GSE ID For ChIP-seq Data', 'Organism', 'Pulldown', 'GSM ID(s)', 'Control GSM(s)']]
chipline_ss = chipline_ss.drop_duplicates(subset=['GSM ID(s)'])

In [280]:
# add the srr information
chipline_ss = chipline_ss.merge(gsm_srr_combos_data_agg, left_on='GSM ID(s)', right_index=True, how='left')
chipline_ss.rename(columns={'srr_id': 'Main SRR(s)'}, inplace=True)
chipline_ss = chipline_ss.merge(gsm_srr_combos_data_agg, left_on='Control GSM(s)', right_index=True, how='left')
chipline_ss.rename(columns={'srr_id': 'Control SRR(s)'}, inplace=True)

In [281]:
# add organism infor
chipline_ss.loc[:, 'Organism'] = chipline_ss.loc[:, 'Organism'].replace('Homo sapiens', 'Homo_Sapiens')
chipline_ss.loc[:, 'Organism'] = chipline_ss.loc[:, 'Organism'].replace('Mus musculus', 'Mus_musculus')

In [282]:
# adding the biorep information 
chipline_ss_tmp_list = []
sort_cols = ['GSE ID For ChIP-seq Data', 'Sample Name', 'Organism', 'Pulldown', 'GSM ID(s)', 'Control GSM(s)']
chipline_ss = chipline_ss.sort_values(sort_cols)
for grp, grp_df in chipline_ss.groupby(['GSE ID For ChIP-seq Data', 'Sample Name', 'Organism', 'Pulldown']):
    grp_df.loc[:, 'Rep No.'] = ['b{}'.format(x) for x in range(1, grp_df.shape[0] + 1)]
    chipline_ss_tmp_list.append(grp_df)
chipline_ss = pd.concat(chipline_ss_tmp_list)

In [283]:
# add the standard sample name 
def create_std_sample_name(sr):
    name = '{}.{}.{}.{}.{}'.format(sr['Sample Name'], sr['GSE ID For ChIP-seq Data'],
                                   sr['Organism'], sr['Pulldown'], sr['Rep No.'])
    return(name)
chipline_ss.loc[:, 'chipseq_std_sample_name'] = chipline_ss.apply(create_std_sample_name, axis=1)

In [286]:
# organize the reference genome of samples
chipline_ss.loc[:, 'Ref Genome'] = chipline_ss['Organism'].map({'Homo_Sapiens': 'grch38',
                                                                 'Mus_musculus': 'mm10'})
# add T2T samples
chipline_ss_grps = chipline_ss.groupby('Ref Genome')
t2t_chipline_ss = chipline_ss_grps.get_group('grch38').copy(deep=True)
t2t_chipline_ss.loc[:, 'Ref Genome'] = 't2t'

In [274]:
# generate teh final chipline samplesheet
chipline_ss = pd.concat([chipline_ss, t2t_chipline_ss])

In [275]:
# organize the columns 
chipline_ss = chipline_ss[['chipseq_std_sample_name', 'Main SRR(s)', 'Control SRR(s)', 'Ref Genome',
                           'Sample Name', 'GSE ID For ChIP-seq Data',
                            'Organism', 'Pulldown', 'Rep No.', 'GSM ID(s)', 'Control GSM(s)']]

In [276]:
# finalize and save
chipline_ss.fillna('N/A', inplace=True)
chipline_ss_fn = 'chipseq.chipline.batch2.txt'
chipline_ss.to_csv(chipline_ss_fn, sep='\t', index=False, header=False)

In [277]:
chipline_ss

Unnamed: 0,chipseq_std_sample_name,Main SRR(s),Control SRR(s),Ref Genome,Sample Name,GSE ID For ChIP-seq Data,Organism,Pulldown,Rep No.,GSM ID(s),Control GSM(s)
0,VCP65.GSE162609.Homo_Sapiens.H3K27ac.b1,SRR13178581-SRR13178579-SRR13178580-SRR1317858...,SRR13178588-SRR13178590-SRR13178589-SRR1317859...,grch38,VCP65,GSE162609,Homo_Sapiens,H3K27ac,b1,"GSM4952699, GSM4952700, GSM4952701","GSM4952702, GSM4952703, GSM4952704"
1,VCP67.GSE162609.Homo_Sapiens.H3K27ac.b1,SRR13178599-SRR13178598-SRR13178597-SRR1317860...,SRR13178606-SRR13178607-SRR13178608-SRR1317860...,grch38,VCP67,GSE162609,Homo_Sapiens,H3K27ac,b1,"GSM4952705, GSM4952706, GSM4952707","GSM4952708, GSM4952709, GSM4952710"
2,HCT116-NUP93-mAC.GSE165463.Homo_Sapiens.H3K27a...,SRR13523286,,grch38,HCT116-NUP93-mAC,GSE165463,Homo_Sapiens,H3K27ac,b1,GSM5034230,
4,HCT116-NUP93-mAC-IAA.GSE165463.Homo_Sapiens.H3...,SRR13523287,,grch38,HCT116-NUP93-mAC-IAA,GSE165463,Homo_Sapiens,H3K27ac,b1,GSM5034231,
39,YAP-M-tumor.GSE181867.Mus_musculus.H3K27ac.b1,SRR15414689,,mm10,YAP-M-tumor,GSE181867,Mus_musculus,H3K27ac,b1,GSM5513456,
...,...,...,...,...,...,...,...,...,...,...,...
16,HCT116.GSM5904695.Homo_Sapiens.H3K4me1.b1,SRR18054306-SRR18054305,SRR18054294,t2t,HCT116,GSM5904695,Homo_Sapiens,H3K4me1,b1,"GSM5904685, GSM5904686",GSM5904695
19,HCT116.GSM5904695.Homo_Sapiens.H3K4me3.b1,SRR18054298,SRR18054294,t2t,HCT116,GSM5904695,Homo_Sapiens,H3K4me3,b1,GSM5904693,GSM5904695
21,HCT116-FOXD2.GSM5904695.Homo_Sapiens.H3K27ac.b1,SRR18054304-SRR18054299,SRR18054296,t2t,HCT116-FOXD2,GSM5904695,Homo_Sapiens,H3K27ac,b1,"GSM5904691, GSM5904692",GSM5904696
23,HCT116-FOXD2.GSM5904695.Homo_Sapiens.H3K4me1.b1,SRR18054302-SRR18054301,SRR18054296,t2t,HCT116-FOXD2,GSM5904695,Homo_Sapiens,H3K4me1,b1,"GSM5904687, GSM5904688",GSM5904696
