In [1]:
import os
import sys
import pandas as pd
import tracker
# last entry of tracker.processing_dates (presumably the most recent run date — confirm
# in tracker); it is used below to build the default input/output file names
latest_date = tracker.processing_dates[-1]
# switch to the project root so the relative 'results/...' paths used below resolve.
# NOTE(review): absolute, machine-specific path — os.chdir raises if it does not exist
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/')

In [2]:
# setting input and output with jupyter notebook in context
# Inside a Jupyter kernel sys.argv[0] is the ipykernel launcher, so the paths are
# derived from latest_date; otherwise both must be given on the command line.
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/fastq/{}.fastq.google-samplesheet.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/fastq/{}.fastq.samplesheet'.format(latest_date)
else:
    # fail with a usage message instead of a bare IndexError when arguments are missing
    if len(sys.argv) < 3:
        sys.exit('usage: {} <input_samplesheet.tsv> <output_prefix>'.format(sys.argv[0]))
    input_fn = sys.argv[1]
    output_prefix = sys.argv[2]

In [3]:
# read the tab-separated samplesheet into a DataFrame (first line is the header)
df = pd.read_table(input_fn, sep='\t')

In [4]:
# preview the samplesheet as loaded
df

Unnamed: 0,Sample Name (as used in the server),Study,GSE ID,GSM ID,SRR ID,GEO Title,GEO Source,GEO Description,Organism,Number of Reads,...,HiCPro Status,HiChIP Peaks Status,hichipper Status,OTHER Status,FitHiChIP Status,Operator,Priority,Cluster Path/Branch,Date Added,Comments
0,GM,"Mumbach et al., 2017",GSE101498,GSM2705041,SRR5831489,GM HiChIP H3K27ac biological replicate 1,GM12878 cell line,Protein-enriched long-range contact; GM_HiChIP...,Homo sapiens,664558514,...,1,0,0,0,0,,,,pre-5/16/2022,
1,GM,"Mumbach et al., 2017",GSE101498,GSM2705042,SRR5831490,GM HiChIP H3K27ac biological replicate 2,GM12878 cell line,Protein-enriched long-range contact; GM_HiChIP...,Homo sapiens,598957472,...,1,0,0,0,0,,,,pre-5/16/2022,
2,K562,"Mumbach et al., 2017",GSE101498,GSM2705043,SRR5831491,K562 HiChIP H3K27ac biological replicate 1,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,426330778,...,1,0,0,0,0,,,,pre-5/16/2022,
3,K562,"Mumbach et al., 2017",GSE101498,GSM2705044,SRR5831492,K562 HiChIP H3K27ac biological replicate 2,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,311815900,...,1,0,0,0,0,,,,pre-5/16/2022,
4,K562,"Mumbach et al., 2017",GSE101498,GSM2705045,SRR5831493,K562 HiChIP H3K27ac biological replicate 3,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,356325902,...,1,0,0,0,0,,,,pre-5/16/2022,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,K562,"Weintraub et al., 2017",GSE99519,GSM2774002,SRR6010263,HiChIP_K562_YY1,HiChIP_K562_YY1,,Homo sapiens,158111028,...,0,0,0,0,0,,,,5/16/2022,
481,iPSC-WT,"Kraft et al., 2022",GSE150906,GSM4560486,SRR11816734,iPSC_WT_A_H3K27me3_HiChIP,wild-type iPSC,H3K27me3 HiChIP library; iPSC_wt6_K1-TAAGGCGA-...,Homo sapiens,195443840,...,0,0,0,0,0,,,,6/16/2022,
482,iPSC-WT,"Kraft et al., 2022",GSE150906,GSM4560487,SRR11816735,iPSC_WT_B_H3K27me3_HiChIP,wild-type iPSC,H3K27me3 HiChIP library; iPSC_wt7_K3-AGGCAGAA-...,Homo sapiens,28509234,...,0,0,0,0,0,,,,6/16/2022,
483,iPSC-MUT,"Kraft et al., 2022",GSE150906,GSM4560488,SRR11816736,iPSC_MUT_A_H3K27me3_HiChIP,RNA-binding deficient EZH2 mutant iPSC,H3K27me3 HiChIP library; iPSC_mut18_K5-GGACTCC...,Homo sapiens,309053672,...,0,0,0,0,0,,,,6/16/2022,


In [17]:
# extract those samples which are ready for processing:
# rows where 'Start Processing' is 1 and 'Download Status' is 0
ready_mask = (df['Start Processing'] == 1) & (df['Download Status'] == 0)

# extract only the columns needed
major_cols = ['Sample Name (as used in the server)',
 'GSE ID',
 'GSM ID',
 'SRR ID',
 'Organism',
 'Biological Replicate Serial No',
 'Technical Replicate Serial No',
 'ChIP-seq Pull Down',
 'Restriction Enzyme']

# select rows and columns in one step; .copy() makes ready_df an independent frame so
# the column assignments in later cells do not write into a slice of df
# (which would raise SettingWithCopyWarning)
ready_df = df.loc[ready_mask, major_cols].copy()

In [18]:
def parse_organism(string):
    """Turn an organism name into an underscore-joined, capitalized label.

    The input is split on whitespace, every word goes through ``str.capitalize``
    (first character upper-cased, the remainder lower-cased) and the words are
    joined with ``_``, e.g. ``'Homo sapiens'`` -> ``'Homo_Sapiens'``.
    """
    return '_'.join(token.capitalize() for token in string.split())

# rewrite every organism name in its underscore-joined form via parse_organism
ready_df.loc[:, 'Organism'] = ready_df['Organism'].map(parse_organism)

In [19]:
# getting the sample names
# Each name has the form <sample>.<GSE>.<organism>.<antibody target>.b<biological rep>.
# Columns are addressed by label rather than by position: integer indexing on a
# label-indexed row Series is deprecated in recent pandas and breaks silently if the
# column order changes.
name_template = '{sample_name}.{gse_id}.{organism}.{antibody_target}.b{biological_rep}'
sample_names = [
    name_template.format(sample_name=sample,
                         gse_id=gse,
                         organism=organism,
                         antibody_target=target,
                         biological_rep=bio_rep)
    for sample, gse, organism, target, bio_rep in zip(
        ready_df['Sample Name (as used in the server)'],
        ready_df['GSE ID'],
        ready_df['Organism'],
        ready_df['ChIP-seq Pull Down'],
        ready_df['Biological Replicate Serial No'])
]
# stored as the last column, under the name 'sample_name'
ready_df.loc[:, 'sample_name'] = sample_names

In [20]:
# renaming the columns for easy computational use
# An explicit old -> new mapping ties every new name to its column regardless of
# column order. errors='raise' stops with a KeyError if an expected column is
# missing (e.g. when this cell is run a second time) instead of mislabelling data.
# Note: the sheet's own sample name becomes 'sample_name', while the composed
# 'sample_name' column created above becomes 'std_sample_name'.
ready_df = ready_df.rename(
    columns={'Sample Name (as used in the server)': 'sample_name',
             'GSE ID': 'gse_id',
             'GSM ID': 'gsm_id',
             'SRR ID': 'srr_id',
             'Organism': 'organism',
             'Biological Replicate Serial No': 'bio_rep',
             'Technical Replicate Serial No': 'tech_rep',
             'ChIP-seq Pull Down': 'antibody_target',
             'Restriction Enzyme': 'restriction_enzyme',
             'sample_name': 'std_sample_name'},
    errors='raise')

In [21]:
# put the standardized sample name first and the sheet's own sample name last
reorder = [
    'std_sample_name',
    'gse_id', 'gsm_id', 'srr_id',
    'organism',
    'bio_rep', 'tech_rep',
    'antibody_target', 'restriction_enzyme',
    'sample_name',
]
ready_df = ready_df.loc[:, reorder]

In [22]:
# preview the renamed and reordered samplesheet
ready_df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name
86,HAVIC.GSE154512.Homo_Sapiens.H3K27ac.b1,GSE154512,GSM4672364,SRR12231664,Homo_Sapiens,1,1,H3K27ac,MboI,HAVIC
152,HCT116-AuxinNeg.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,GSM5420289,SRR15050696,Homo_Sapiens,1,1,RNA-Pol-II,DpnII,HCT116-AuxinNeg
154,HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,GSM5420290,SRR15050697,Homo_Sapiens,1,1,RNA-Pol-II,DpnII,HCT116-AuxinPos
155,HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,GSM5420292,SRR15050699,Homo_Sapiens,1,2,RNA-Pol-II,DpnII,HCT116-AuxinPos
165,IMR90-Proliferating.GSE100856.Homo_Sapiens.CTC...,GSE100856,GSM2695301,SRR5808478,Homo_Sapiens,1,1,CTCF,HindIII,IMR90-Proliferating
...,...,...,...,...,...,...,...,...,...,...
480,K562.GSE99519.Homo_Sapiens.YY1.b1,GSE99519,GSM2774002,SRR6010263,Homo_Sapiens,1,2,YY1,MboI,K562
481,iPSC-WT.GSE150906.Homo_Sapiens.H3K27me3.b1,GSE150906,GSM4560486,SRR11816734,Homo_Sapiens,1,1,H3K27me3,MboI,iPSC-WT
482,iPSC-WT.GSE150906.Homo_Sapiens.H3K27me3.b1,GSE150906,GSM4560487,SRR11816735,Homo_Sapiens,1,2,H3K27me3,MboI,iPSC-WT
483,iPSC-MUT.GSE150906.Homo_Sapiens.H3K27me3.b1,GSE150906,GSM4560488,SRR11816736,Homo_Sapiens,1,1,H3K27me3,MboI,iPSC-MUT


In [23]:
# the samplesheet is written twice as TSV: once with a header row, once without
header_output = '{}.with_header.tsv'.format(output_prefix)
without_header_output = '{}.without_header.tsv'.format(output_prefix)

for out_fn, keep_header in ((header_output, True), (without_header_output, False)):
    ready_df.to_csv(out_fn, sep='\t', header=keep_header, index=False)

In [24]:
# (rows, columns) of the full samplesheet
df.shape

(485, 26)

In [25]:
# (rows, columns) of the subset selected for processing
ready_df.shape

(262, 10)

In [26]:
# show the samplesheet that was written to disk
ready_df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name
86,HAVIC.GSE154512.Homo_Sapiens.H3K27ac.b1,GSE154512,GSM4672364,SRR12231664,Homo_Sapiens,1,1,H3K27ac,MboI,HAVIC
152,HCT116-AuxinNeg.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,GSM5420289,SRR15050696,Homo_Sapiens,1,1,RNA-Pol-II,DpnII,HCT116-AuxinNeg
154,HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,GSM5420290,SRR15050697,Homo_Sapiens,1,1,RNA-Pol-II,DpnII,HCT116-AuxinPos
155,HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,GSM5420292,SRR15050699,Homo_Sapiens,1,2,RNA-Pol-II,DpnII,HCT116-AuxinPos
165,IMR90-Proliferating.GSE100856.Homo_Sapiens.CTC...,GSE100856,GSM2695301,SRR5808478,Homo_Sapiens,1,1,CTCF,HindIII,IMR90-Proliferating
...,...,...,...,...,...,...,...,...,...,...
480,K562.GSE99519.Homo_Sapiens.YY1.b1,GSE99519,GSM2774002,SRR6010263,Homo_Sapiens,1,2,YY1,MboI,K562
481,iPSC-WT.GSE150906.Homo_Sapiens.H3K27me3.b1,GSE150906,GSM4560486,SRR11816734,Homo_Sapiens,1,1,H3K27me3,MboI,iPSC-WT
482,iPSC-WT.GSE150906.Homo_Sapiens.H3K27me3.b1,GSE150906,GSM4560487,SRR11816735,Homo_Sapiens,1,2,H3K27me3,MboI,iPSC-WT
483,iPSC-MUT.GSE150906.Homo_Sapiens.H3K27me3.b1,GSE150906,GSM4560488,SRR11816736,Homo_Sapiens,1,1,H3K27me3,MboI,iPSC-MUT


In [27]:
# path of the header-less TSV written above
without_header_output

'results/samplesheets/fastq/2022.06.19.17.30.fastq.samplesheet.without_header.tsv'

In [28]:
# show the full input samplesheet again for comparison
df

Unnamed: 0,Sample Name (as used in the server),Study,GSE ID,GSM ID,SRR ID,GEO Title,GEO Source,GEO Description,Organism,Number of Reads,...,HiCPro Status,HiChIP Peaks Status,hichipper Status,OTHER Status,FitHiChIP Status,Operator,Priority,Cluster Path/Branch,Date Added,Comments
0,GM,"Mumbach et al., 2017",GSE101498,GSM2705041,SRR5831489,GM HiChIP H3K27ac biological replicate 1,GM12878 cell line,Protein-enriched long-range contact; GM_HiChIP...,Homo sapiens,664558514,...,1,0,0,0,0,,,,pre-5/16/2022,
1,GM,"Mumbach et al., 2017",GSE101498,GSM2705042,SRR5831490,GM HiChIP H3K27ac biological replicate 2,GM12878 cell line,Protein-enriched long-range contact; GM_HiChIP...,Homo sapiens,598957472,...,1,0,0,0,0,,,,pre-5/16/2022,
2,K562,"Mumbach et al., 2017",GSE101498,GSM2705043,SRR5831491,K562 HiChIP H3K27ac biological replicate 1,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,426330778,...,1,0,0,0,0,,,,pre-5/16/2022,
3,K562,"Mumbach et al., 2017",GSE101498,GSM2705044,SRR5831492,K562 HiChIP H3K27ac biological replicate 2,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,311815900,...,1,0,0,0,0,,,,pre-5/16/2022,
4,K562,"Mumbach et al., 2017",GSE101498,GSM2705045,SRR5831493,K562 HiChIP H3K27ac biological replicate 3,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,356325902,...,1,0,0,0,0,,,,pre-5/16/2022,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,K562,"Weintraub et al., 2017",GSE99519,GSM2774002,SRR6010263,HiChIP_K562_YY1,HiChIP_K562_YY1,,Homo sapiens,158111028,...,0,0,0,0,0,,,,5/16/2022,
481,iPSC-WT,"Kraft et al., 2022",GSE150906,GSM4560486,SRR11816734,iPSC_WT_A_H3K27me3_HiChIP,wild-type iPSC,H3K27me3 HiChIP library; iPSC_wt6_K1-TAAGGCGA-...,Homo sapiens,195443840,...,0,0,0,0,0,,,,6/16/2022,
482,iPSC-WT,"Kraft et al., 2022",GSE150906,GSM4560487,SRR11816735,iPSC_WT_B_H3K27me3_HiChIP,wild-type iPSC,H3K27me3 HiChIP library; iPSC_wt7_K3-AGGCAGAA-...,Homo sapiens,28509234,...,0,0,0,0,0,,,,6/16/2022,
483,iPSC-MUT,"Kraft et al., 2022",GSE150906,GSM4560488,SRR11816736,iPSC_MUT_A_H3K27me3_HiChIP,RNA-binding deficient EZH2 mutant iPSC,H3K27me3 HiChIP library; iPSC_mut18_K5-GGACTCC...,Homo sapiens,309053672,...,0,0,0,0,0,,,,6/16/2022,
