In [1]:
import os
import sys
import pandas as pd
# All relative paths used below are resolved against this project directory.
PROJECT_DIR = '/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/'
os.chdir(PROJECT_DIR)

In [2]:
# setting input and output with jupyter notebook in context
# When sys.argv[0] names the ipykernel launcher the code is treated as running
# inside a notebook and the default paths below are used; otherwise the paths
# are taken from the command line: <input samplesheet> <output prefix>.
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/fastq/2022.03.30.fastq.google-samplesheet.tsv'
    output_prefix = 'results/samplesheets/fastq/2022.03.30.fastq.samplesheet'
elif len(sys.argv) >= 3:
    input_fn = sys.argv[1]
    output_prefix = sys.argv[2]
else:
    # Fail with a usage message instead of an opaque IndexError when the
    # script is started without both positional arguments.
    sys.exit('usage: {} <input_samplesheet.tsv> <output_prefix>'.format(sys.argv[0]))

In [3]:
# loading the samplesheet
# skiprows=1 drops the first line of the file, so the second line is parsed
# as the header row.
df = pd.read_table(input_fn, skiprows=1)

# the columns needed downstream
major_cols = ['Sample Name (as used in the server)',
              'GSE ID',
              'GSM ID',
              'SRR ID',
              'Organism',
              'Biological Replicate Serial No',
              'Technical Replicate Serial No',
              'ChIP-seq Pull Down',
              'Restriction Enzyme']

# extract those samples which are ready for processing, keeping only the
# needed columns. Rows and columns are selected in a single .loc call and the
# result is copied explicitly: later cells assign into ready_df, and writing
# into a slice of df is what triggers pandas' SettingWithCopyWarning.
ready_df = df.loc[df['Start Processing'] == 'Yes', major_cols].copy()

In [4]:
def parse_organism(string):
    """Return the organism name with each word capitalized and joined by '_'.

    The input is split on whitespace, every word is passed through
    str.capitalize() (first letter upper-case, the rest lower-case) and the
    words are joined with underscores, e.g. 'mus musculus' -> 'Mus_Musculus'.
    """
    return '_'.join(word.capitalize() for word in string.split())

# normalize every organism name, e.g. 'Mus_Musculus'
ready_df.loc[:, 'Organism'] = ready_df['Organism'].apply(parse_organism)

In [5]:
# getting the sample names
# Each standardized name has the form
#   <sample name>.<GSE ID>.<organism>.<antibody target>.b<biological replicate>
# Row fields are looked up by column label: integer keys such as sr[0] on a
# label-indexed row are deprecated positional lookups in recent pandas and
# silently pick the wrong field if the column order ever changes.
name_template = '{sample_name}.{gse_id}.{organism}.{antibody_target}.b{biological_rep}'
sample_names = []
for _, sr in ready_df.iterrows():
    sample_names.append(name_template.format(
        sample_name=sr['Sample Name (as used in the server)'],
        gse_id=sr['GSE ID'],
        organism=sr['Organism'],
        antibody_target=sr['ChIP-seq Pull Down'],
        biological_rep=sr['Biological Replicate Serial No']))
ready_df.loc[:, 'sample_name'] = sample_names

In [6]:
# renaming the columns for easy computational use
# Names are assigned by position, so this list must follow the current column
# order of ready_df. Note that the sample name taken from the samplesheet
# becomes 'sample_name', while the composite name appended as the last column
# becomes 'std_sample_name'.
ready_df.columns = [
    'sample_name',
    'gse_id',
    'gsm_id',
    'srr_id',
    'organism',
    'bio_rep',
    'tech_rep',
    'antibody_target',
    'restriction_enzyme',
    'std_sample_name',
]

In [7]:
# reorder the columns: standardized name first, original sample name last
reorder = [
    'std_sample_name',
    'gse_id',
    'gsm_id',
    'srr_id',
    'organism',
    'bio_rep',
    'tech_rep',
    'antibody_target',
    'restriction_enzyme',
    'sample_name',
]

ready_df = ready_df.loc[:, reorder]

In [8]:
# preview the final samplesheet (long frames are shown truncated in the rendered output)
ready_df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name
0,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705031,SRR5831479,Mus_Musculus,1,1,H3K27ac,MboI,mES_25m_cells
1,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705032,SRR5831480,Mus_Musculus,1,2,H3K27ac,MboI,mES_25m_cells
2,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,GSM2705033,SRR5831481,Mus_Musculus,2,1,H3K27ac,MboI,mES_25m_cells
3,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,GSM2705034,SRR5831482,Mus_Musculus,2,2,H3K27ac,MboI,mES_25m_cells
4,mES_500k_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705035,SRR5831483,Mus_Musculus,1,1,H3K27ac,MboI,mES_500k_cells
...,...,...,...,...,...,...,...,...,...,...
126,lgs101645.GSE116193.Homo_Sapiens.CTCF.b1,GSE116193,GSM3212926,SRR7417515,Homo_Sapiens,1,1,CTCF,MboI,lgs101645
127,lgs102580.GSE116193.Homo_Sapiens.CTCF.b1,GSE116193,GSM3212927,SRR7417516,Homo_Sapiens,1,1,CTCF,MboI,lgs102580
128,lgs102943.GSE116193.Homo_Sapiens.CTCF.b1,GSE116193,GSM3212928,SRR7417517,Homo_Sapiens,1,1,CTCF,MboI,lgs102943
129,lgs301315.GSE116193.Homo_Sapiens.CTCF.b1,GSE116193,GSM3212929,SRR7417518,Homo_Sapiens,1,1,CTCF,MboI,lgs301315


In [9]:
# write the samplesheet twice as tab-separated text without the index:
# once with the column header row and once without it
header_output = '{}.with_header.tsv'.format(output_prefix)
without_header_output = '{}.without_header.tsv'.format(output_prefix)

for out_fn, keep_header in [(header_output, True), (without_header_output, False)]:
    ready_df.to_csv(out_fn, sep='\t', header=keep_header, index=False)

In [11]:
# show the path of the samplesheet that was written without a header row
without_header_output

'results/samplesheets/fastq/2022.03.30.fastq.samplesheet.without_header.tsv'