In [3]:
import os
import sys
import pandas as pd
import tracker
# Last entry of tracker.processing_dates; it is used below to build the
# default samplesheet paths when running inside the notebook.
latest_date = tracker.processing_dates[-1]

# The relative 'results/...' paths used below are resolved against this directory.
PROJECT_ROOT = '/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/'
os.chdir(PROJECT_ROOT)

In [4]:
# Display the processing date that drives the default input/output file names.
latest_date

'2022.06.27.09.17'

In [5]:
# Resolve the input samplesheet path and the output prefix.
#
# * Inside a Jupyter kernel (sys.argv[0] is the ipykernel launcher) the paths
#   are derived from `latest_date`.
# * As a command-line script they are taken from the first two positional
#   arguments: <input_samplesheet.tsv> <output_prefix>.
running_in_notebook = 'ipykernel_launcher.py' in sys.argv[0]

if running_in_notebook:
    input_fn = 'results/samplesheets/fastq/{}.fastq.google-samplesheet.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/fastq/{}.fastq.samplesheet'.format(latest_date)
else:
    # Fail with a readable usage message instead of a bare IndexError when
    # the script is started without both arguments.
    if len(sys.argv) < 3:
        raise SystemExit(
            'usage: {} <input_samplesheet.tsv> <output_prefix>'.format(sys.argv[0]))
    input_fn = sys.argv[1]
    output_prefix = sys.argv[2]

In [6]:
# Read the tab-separated samplesheet into a DataFrame; the first line is the header.
df = pd.read_csv(input_fn, sep='\t')

In [7]:
# Display the samplesheet exactly as loaded, before any filtering.
df

Unnamed: 0,Sample Name (as used in the server),Study,GSE ID,GSM ID,SRR ID,GEO Title,GEO Source,GEO Description,Organism,Number of Reads,...,HiCPro Status,HiChIP Peaks Status,hichipper Status,OTHER Status,FitHiChIP Status,Operator,Priority,Cluster Path/Branch,Date Added,Comments
0,GM,"Mumbach et al., 2017",GSE101498,GSM2705041,SRR5831489,GM HiChIP H3K27ac biological replicate 1,GM12878 cell line,Protein-enriched long-range contact; GM_HiChIP...,Homo sapiens,664558514,...,1,0,0,0,0,Joaquin,,,pre-5/16/2022,
1,GM,"Mumbach et al., 2017",GSE101498,GSM2705042,SRR5831490,GM HiChIP H3K27ac biological replicate 2,GM12878 cell line,Protein-enriched long-range contact; GM_HiChIP...,Homo sapiens,598957472,...,1,0,0,0,0,Joaquin,,,pre-5/16/2022,
2,K562,"Mumbach et al., 2017",GSE101498,GSM2705043,SRR5831491,K562 HiChIP H3K27ac biological replicate 1,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,426330778,...,1,0,0,0,0,Joaquin,,,pre-5/16/2022,
3,K562,"Mumbach et al., 2017",GSE101498,GSM2705044,SRR5831492,K562 HiChIP H3K27ac biological replicate 2,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,311815900,...,1,0,0,0,0,Joaquin,,,pre-5/16/2022,
4,K562,"Mumbach et al., 2017",GSE101498,GSM2705045,SRR5831493,K562 HiChIP H3K27ac biological replicate 3,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,356325902,...,1,0,0,0,0,Joaquin,,,pre-5/16/2022,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,K562,"Weintraub et al., 2017",GSE99519,GSM2774002,SRR6010263,HiChIP_K562_YY1,HiChIP_K562_YY1,,Homo sapiens,158111028,...,1,0,0,0,0,Joaquin,,,5/16/2022,
481,iPSC-WT,"Kraft et al., 2022",GSE150906,GSM4560486,SRR11816734,iPSC_WT_A_H3K27me3_HiChIP,wild-type iPSC,H3K27me3 HiChIP library; iPSC_wt6_K1-TAAGGCGA-...,Homo sapiens,195443840,...,1,0,0,0,0,Joaquin,,,6/16/2022,
482,iPSC-WT,"Kraft et al., 2022",GSE150906,GSM4560487,SRR11816735,iPSC_WT_B_H3K27me3_HiChIP,wild-type iPSC,H3K27me3 HiChIP library; iPSC_wt7_K3-AGGCAGAA-...,Homo sapiens,28509234,...,1,0,0,0,0,Joaquin,,,6/16/2022,
483,iPSC-MUT,"Kraft et al., 2022",GSE150906,GSM4560488,SRR11816736,iPSC_MUT_A_H3K27me3_HiChIP,RNA-binding deficient EZH2 mutant iPSC,H3K27me3 HiChIP library; iPSC_mut18_K5-GGACTCC...,Homo sapiens,309053672,...,1,0,0,0,0,Joaquin,,,6/16/2022,


In [8]:
# Select the samples that are ready for processing: rows flagged with
# Start Processing == 1 whose Download Status is still 0 and whose Operator
# is "Kyra".
is_ready = (
    (df['Start Processing'] == 1)
    & (df['Download Status'] == 0)
    & (df['Operator'] == "Kyra")
)

# Only these columns are carried into the output samplesheet. Their order
# matters: later cells rename the columns by position.
major_cols = ['Sample Name (as used in the server)',
              'GSE ID',
              'GSM ID',
              'SRR ID',
              'Organism',
              'Biological Replicate Serial No',
              'Technical Replicate Serial No',
              'ChIP-seq Pull Down',
              'Restriction Enzyme']

# Row filter and column selection in a single .loc call. The explicit .copy()
# makes ready_df an independent frame, so the column assignments performed on
# it in later cells do not emit SettingWithCopyWarning.
ready_df = df.loc[is_ready, major_cols].copy()

In [9]:
def parse_organism(string):
    """Return the organism name with every word capitalized and joined by '_'.

    The input is split on whitespace, each word is passed through
    ``str.capitalize`` (first letter upper-case, the rest lower-case), and the
    words are joined with underscores, e.g. 'Homo sapiens' -> 'Homo_Sapiens'.
    """
    return '_'.join(word.capitalize() for word in string.split())

# Normalize every organism label (e.g. 'Homo sapiens' -> 'Homo_Sapiens').
organism_labels = ready_df.loc[:, 'Organism'].apply(parse_organism)
ready_df.loc[:, 'Organism'] = organism_labels

In [10]:
# Build the standardized sample name for every row:
#   <sample name>.<GSE ID>.<organism>.<antibody target>.b<biological replicate>
# Fields are looked up by column label rather than by integer position:
# positional integer keys on a label-indexed Series are deprecated in recent
# pandas, and labels keep working if the column order of ready_df changes.
name_template = '{sample_name}.{gse_id}.{organism}.{antibody_target}.b{biological_rep}'

sample_names = [
    name_template.format(sample_name=sr['Sample Name (as used in the server)'],
                         gse_id=sr['GSE ID'],
                         organism=sr['Organism'],
                         antibody_target=sr['ChIP-seq Pull Down'],
                         biological_rep=sr['Biological Replicate Serial No'])
    for _, sr in ready_df.iterrows()
]

# Appended as the last column; the rename cell relies on that position.
ready_df.loc[:, 'sample_name'] = sample_names

In [11]:
# Replace the spreadsheet headers with short, script-friendly names.
# The assignment is positional, so this list must follow the current column
# order of ready_df (the major_cols order plus the appended name column).
computational_names = [
    'sample_name',          # Sample Name (as used in the server)
    'gse_id',               # GSE ID
    'gsm_id',               # GSM ID
    'srr_id',               # SRR ID
    'organism',             # Organism
    'bio_rep',              # Biological Replicate Serial No
    'tech_rep',             # Technical Replicate Serial No
    'antibody_target',      # ChIP-seq Pull Down
    'restriction_enzyme',   # Restriction Enzyme
    'std_sample_name',      # composed sample name appended in the previous cell
]
ready_df.columns = computational_names

In [12]:
# Put the standardized sample name first and the original sample name last;
# every other column keeps its relative position.
reorder = ['std_sample_name', 'gse_id', 'gsm_id', 'srr_id', 'organism',
           'bio_rep', 'tech_rep', 'antibody_target', 'restriction_enzyme',
           'sample_name']
ready_df = ready_df[reorder]

In [13]:
# Display the filtered, renamed and reordered samplesheet before it is written out.
ready_df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name
37,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902791,SRR9590180,Homo_Sapiens,1,1,CTCF,MboI,A673_SA1m1
38,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902792,SRR9590181,Homo_Sapiens,2,1,CTCF,MboI,A673_SA1m1
39,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902793,SRR9590182,Homo_Sapiens,1,1,CTCF,MboI,A673_SA2m1
40,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902794,SRR9590183,Homo_Sapiens,2,1,CTCF,MboI,A673_SA2m1
41,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b3,GSE133227,GSM3902795,SRR9590184,Homo_Sapiens,3,1,CTCF,MboI,A673_SA2m1
...,...,...,...,...,...,...,...,...,...,...
411,DND41.GSE165207.Homo_Sapiens.H3K27ac.b2,GSE165207,GSM5028230,SRR13492047,Homo_Sapiens,2,1,H3K27ac,Arima,DND41
412,DND41.GSE165207.Homo_Sapiens.H3K27ac.b2,GSE165207,GSM5028230,SRR13492048,Homo_Sapiens,2,2,H3K27ac,Arima,DND41
413,Jurkat.GSE165207.Homo_Sapiens.H3K27ac.b1,GSE165207,GSM5028231,SRR13492049,Homo_Sapiens,1,1,H3K27ac,Arima,Jurkat
414,Jurkat.GSE165207.Homo_Sapiens.H3K27ac.b1,GSE165207,GSM5028231,SRR13492050,Homo_Sapiens,1,2,H3K27ac,Arima,Jurkat


In [14]:
# Write the samplesheet twice as tab-separated text without the index:
# once with the header row and once without it.
header_output = '{}.with_header.tsv'.format(output_prefix)
without_header_output = '{}.without_header.tsv'.format(output_prefix)

for out_path, keep_header in ((header_output, True), (without_header_output, False)):
    ready_df.to_csv(out_path, sep='\t', index=False, header=keep_header)

In [15]:
# (rows, columns) of the full samplesheet as loaded.
df.shape

(485, 26)

In [16]:
# (rows, columns) of the filtered samplesheet that was written to disk.
ready_df.shape

(103, 10)