In [1]:
import os
import sys
import pandas as pd
import tracker
# Use the last entry of tracker.processing_dates as the run to process
# (presumably the most recent timestamp -- confirm the list is kept in
# chronological order).
latest_date = tracker.processing_dates[-1]
# The relative 'results/...' paths used further down resolve against this
# project directory, so the working directory must be changed before any I/O.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this mount point exists.
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/')

In [2]:
# Resolve the input samplesheet and the output prefix for the current execution mode.
if 'ipykernel_launcher.py' not in sys.argv[0]:
    # Run as a script: both paths come from the positional command-line arguments.
    input_fn = sys.argv[1]
    output_prefix = sys.argv[2]
else:
    # Run inside a Jupyter kernel: derive both paths from the selected processing date.
    input_fn = 'results/samplesheets/fastq/{}.fastq.google-samplesheet.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/fastq/{}.fastq.samplesheet'.format(latest_date)

In [3]:
# Display the processing timestamp taken from tracker.processing_dates.
latest_date

'2022.04.09.16.57'

In [4]:
# Load the tab-separated samplesheet into a DataFrame.
# skiprows=1 drops the first line of the file, so the column headers are read
# from the second line (presumably the first line is a title/banner row in the
# exported sheet -- confirm against the input file).
df = pd.read_table(input_fn, skiprows=1)

In [5]:
# Preview the samplesheet as loaded (pandas elides the middle rows and columns
# in the rendered output).
df

Unnamed: 0,Sample Name (as used in the server),Study,GSE ID,GSM ID,SRR ID,GEO Title,GEO Source,GEO Description,Organism,Number of Reads,...,Download Status,HiCPro Status,HiChIP Peaks Status,hichipper Status,OTHER Status,FitHiChIP Status,Operator,Priority,Cluster Path/Branch,Comments
0,mES_25m_cells,"Mumbach et al., 2017",GSE101498,GSM2705031,SRR5831479,mES HiChIP H3K27ac 25m biological replicate 1 ...,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus musculus,373303396,...,1,1,0,0,0,0,,,,
1,mES_25m_cells,"Mumbach et al., 2017",GSE101498,GSM2705032,SRR5831480,mES HiChIP H3K27ac 25m biological replicate 1 ...,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus musculus,269130644,...,1,1,0,0,0,0,,,,
2,mES_25m_cells,"Mumbach et al., 2017",GSE101498,GSM2705033,SRR5831481,mES HiChIP H3K27ac 25m biological replicate 2 ...,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus musculus,306801366,...,1,1,0,0,0,0,,,,
3,mES_25m_cells,"Mumbach et al., 2017",GSE101498,GSM2705034,SRR5831482,mES HiChIP H3K27ac 25m biological replicate 2 ...,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus musculus,359787238,...,1,1,0,0,0,0,,,,
4,mES_500k_cells,"Mumbach et al., 2017",GSE101498,GSM2705035,SRR5831483,mES HiChIP H3K27ac 500k biological replicate 1,v6.5 embryonic stem cell line,Protein-enriched long-range contact; mES_HiChI...,Mus musculus,249255802,...,1,1,0,0,0,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,293T-PDS,"Li et al., 2020",GSE128106,GSM3664994,SRR8707617,HiChIP YY1-PDS rep1,293T YY1-TAPTAG,processed data file: HiChIP_PDS_Interaction_Ma...,Homo sapiens,50913880,...,0,0,0,0,0,0,,,,
179,293T-PDS,"Li et al., 2020",GSE128106,GSM3664995,SRR8707618,HiChIP YY1-PDS rep2,293T YY1-TAPTAG,processed data file: HiChIP_PDS_Interaction_Ma...,Homo sapiens,40894724,...,0,0,0,0,0,0,,,,
180,293T-TMPYP4,"Li et al., 2020",GSE128106,GSM3664996,SRR8707619,HiChIP YY1-TMPYP4 rep1,293T YY1-TAPTAG,processed data file: HiChIP_TMP_Interaction_Ma...,Homo sapiens,91431398,...,0,0,0,0,0,0,,,,
181,293T-TMPYP4,"Li et al., 2020",GSE128106,GSM3664997,SRR8707620,HiChIP YY1-TMPYP4 rep2,293T YY1-TAPTAG,processed data file: HiChIP_TMP_Interaction_Ma...,Homo sapiens,93121852,...,0,0,0,0,0,0,,,,


In [6]:
# Columns needed to build the FASTQ samplesheet, in the order later cells expect.
major_cols = ['Sample Name (as used in the server)',
              'GSE ID',
              'GSM ID',
              'SRR ID',
              'Organism',
              'Biological Replicate Serial No',
              'Technical Replicate Serial No',
              'ChIP-seq Pull Down',
              'Restriction Enzyme']

# Keep only the samples flagged as ready for processing ('Start Processing' == 1)
# and only the columns listed above.
# Rows and columns are selected in a single .loc call and copied explicitly:
# ready_df is assigned into by later cells, and writing into a chained slice of
# df raises pandas' SettingWithCopyWarning and has ambiguous view/copy semantics.
ready_df = df.loc[df['Start Processing'] == 1, major_cols].copy()

In [7]:
def parse_organism(string):
    """Return an organism name with every word capitalized and joined by '_'.

    The input is split on whitespace, each piece is passed through
    ``str.capitalize`` and the pieces are joined with underscores, so
    'Mus musculus' becomes 'Mus_Musculus'.

    Parameters
    ----------
    string : str
        Organism name with whitespace-separated words.

    Returns
    -------
    str
        The underscore-joined, capitalized name ('' for an empty or
        whitespace-only input).
    """
    return '_'.join(token.capitalize() for token in string.split())

# Rewrite the 'Organism' column with the underscore-joined form produced by
# parse_organism (e.g. 'Mus musculus' -> 'Mus_Musculus').
organism_column = ready_df.loc[:, 'Organism']
ready_df.loc[:, 'Organism'] = organism_column.apply(parse_organism)

In [8]:
# Build the standardized sample name for every row:
#   <sample name>.<GSE ID>.<organism>.<antibody target>.b<biological replicate>
#
# Fields are read by column label instead of by position (sr[0], sr[1], ...).
# Integer keys on a label-indexed row rely on positional fallback, which recent
# pandas versions deprecate, and they silently pick the wrong field if the
# column order ever changes. With labels, re-running this cell after the
# columns have been renamed fails with a KeyError instead of silently
# overwriting the raw 'sample_name' column.
name_template = '{sample_name}.{gse_id}.{organism}.{antibody_target}.b{biological_rep}'
sample_names = [
    name_template.format(sample_name=name,
                         gse_id=gse,
                         organism=organism,
                         antibody_target=target,
                         biological_rep=bio_rep)
    for name, gse, organism, target, bio_rep in zip(
        ready_df['Sample Name (as used in the server)'],
        ready_df['GSE ID'],
        ready_df['Organism'],
        ready_df['ChIP-seq Pull Down'],
        ready_df['Biological Replicate Serial No'])
]

# assign() returns a new frame with 'sample_name' appended as the last column,
# which avoids writing into a possible slice of the original samplesheet.
ready_df = ready_df.assign(sample_name=sample_names)

In [9]:
# Replace the spreadsheet headers with short snake_case names.
# The assignment is positional: the nine selected samplesheet columns keep their
# order, and the trailing column (the generated name, currently called
# 'sample_name') becomes 'std_sample_name', while the first column (the raw
# sample name) takes over the label 'sample_name'.
short_names = [
    'sample_name',
    'gse_id',
    'gsm_id',
    'srr_id',
    'organism',
    'bio_rep',
    'tech_rep',
    'antibody_target',
    'restriction_enzyme',
    'std_sample_name',
]
ready_df.columns = short_names

In [10]:
# Final column order: standardized name first, raw sample name last.
reorder = ['std_sample_name', 'gse_id', 'gsm_id', 'srr_id', 'organism',
           'bio_rep', 'tech_rep', 'antibody_target', 'restriction_enzyme',
           'sample_name']
ready_df = ready_df.loc[:, reorder]

In [11]:
# Inspect the renamed and reordered samplesheet before it is written to disk.
ready_df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name
0,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705031,SRR5831479,Mus_Musculus,1,1,H3K27ac,MboI,mES_25m_cells
1,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705032,SRR5831480,Mus_Musculus,1,2,H3K27ac,MboI,mES_25m_cells
2,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,GSM2705033,SRR5831481,Mus_Musculus,2,1,H3K27ac,MboI,mES_25m_cells
3,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,GSM2705034,SRR5831482,Mus_Musculus,2,2,H3K27ac,MboI,mES_25m_cells
4,mES_500k_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705035,SRR5831483,Mus_Musculus,1,1,H3K27ac,MboI,mES_500k_cells
...,...,...,...,...,...,...,...,...,...,...
178,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,GSM3664994,SRR8707617,Homo_Sapiens,1,1,YY1,HindIII,293T-PDS
179,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,GSM3664995,SRR8707618,Homo_Sapiens,1,2,YY1,HindIII,293T-PDS
180,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,GSM3664996,SRR8707619,Homo_Sapiens,1,1,YY1,HindIII,293T-TMPYP4
181,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,GSM3664997,SRR8707620,Homo_Sapiens,1,2,YY1,HindIII,293T-TMPYP4


In [12]:
# Write the samplesheet twice as tab-separated text without the index:
# once with the column header row and once without it.
header_output = '{}.with_header.tsv'.format(output_prefix)
without_header_output = '{}.without_header.tsv'.format(output_prefix)

for tsv_path, include_header in ((header_output, True),
                                 (without_header_output, False)):
    ready_df.to_csv(tsv_path, sep='\t', index=False, header=include_header)

In [13]:
# (rows, columns) of the full samplesheet as loaded, before filtering.
df.shape

(183, 25)

In [14]:
# Display the samplesheet again after the TSV files were written
# (to_csv does not modify ready_df).
ready_df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name
0,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705031,SRR5831479,Mus_Musculus,1,1,H3K27ac,MboI,mES_25m_cells
1,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705032,SRR5831480,Mus_Musculus,1,2,H3K27ac,MboI,mES_25m_cells
2,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,GSM2705033,SRR5831481,Mus_Musculus,2,1,H3K27ac,MboI,mES_25m_cells
3,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,GSM2705034,SRR5831482,Mus_Musculus,2,2,H3K27ac,MboI,mES_25m_cells
4,mES_500k_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705035,SRR5831483,Mus_Musculus,1,1,H3K27ac,MboI,mES_500k_cells
...,...,...,...,...,...,...,...,...,...,...
178,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,GSM3664994,SRR8707617,Homo_Sapiens,1,1,YY1,HindIII,293T-PDS
179,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,GSM3664995,SRR8707618,Homo_Sapiens,1,2,YY1,HindIII,293T-PDS
180,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,GSM3664996,SRR8707619,Homo_Sapiens,1,1,YY1,HindIII,293T-TMPYP4
181,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,GSM3664997,SRR8707620,Homo_Sapiens,1,2,YY1,HindIII,293T-TMPYP4


In [15]:
# Path of the header-less TSV written by the to_csv call above.
without_header_output

'results/samplesheets/fastq/2022.04.09.16.57.fastq.samplesheet.without_header.tsv'