### Usage

1. Download the HiChIP Tracker as a `.tsv` file into your personal clone of the `hichip-db-loop-calling` repo and store it in `results/samplesheets/post-hicpro` (create this folder if it does not exist). Name the file using the following format: `YYYY.MM.DD.HH.NN.post-hicpro.google-samplesheet.tsv`. Note: always use two digits for month (MM), day (DD), hour (HH) and minute (NN).<br>

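2. Save this date (the `YYYY.MM.DD.HH.NN` timestamp from the file name) into the tracker package under `workflow/scripts/trackers/tracker/__init__.py`. This notebook reads the last entry of `tracker.processing_dates`, so the new date must come last.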

3. Convert this Google based samplesheet into the post-hicpro samplesheet by running this Jupyter Notebook: `hichip-db-loop-calling/workflow/scripts/trackers/converter.google_to_post-hicpro_samplesheet`

4. Update the softlink for `current-post-hicpro-without-header.tsv` within `results/samplesheets/post-hicpro` by running `ln -s -r -f YYYY.MM.DD.HH.NN.post-hicpro.samplesheet.without_header.tsv current-post-hicpro-without-header.tsv`, where HH is the hour and NN is the minute in 24-hour (military) time.

5. You are all set to run the FitHiChIP peak calling, FitHiChIP loop calling, and HiCCUPS scripts using the indices in `current-post-hicpro-without-header.tsv` corresponding to the samples you want to run!


In [1]:
import os
import sys
import pandas as pd
import tracker
# Last entry of the tracker's processing dates; used below to build the
# default samplesheet file names (expected to be the newest timestamp).
latest_date = tracker.processing_dates[-1]

# Change into the project directory so the relative 'results/...' paths
# used in the following cells resolve.
project_dir = '/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/'
os.chdir(project_dir)

In [2]:
latest_date

'2022.06.28.10.37'

In [3]:
# Choose the input samplesheet and the output prefix.
# When the code runs inside a Jupyter kernel (argv[0] is the ipykernel
# launcher) the paths are derived from `latest_date`; otherwise they are
# taken from the first two command-line arguments.
launched_by_ipykernel = 'ipykernel_launcher.py' in sys.argv[0]

if not launched_by_ipykernel:
    input_fn = sys.argv[1]
    output_prefix = sys.argv[2]
else:
    input_fn = 'results/samplesheets/post-hicpro/{}.post-hicpro.google-samplesheet.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/post-hicpro/{}.post-hicpro.samplesheet'.format(latest_date)

In [4]:
# Read the tab-separated Google samplesheet; the first line is the header
# (no rows are skipped, which is the read_table default).
df = pd.read_table(input_fn)

In [5]:
df

Unnamed: 0,Sample Name (as used in the server),Study,GSE ID,GSM ID,SRR ID,GEO Title,GEO Source,GEO Description,Organism,Number of Reads,...,HiCPro Status,FitHiChIP Peaks Status,FitHiChIP Loops Status,HiCCUPS Loops Status,Download/HiC-Pro Operator,Peaks/Loops Operator,Priority,Cluster Path/Branch,Date Added,Comments
0,GM,"Mumbach et al., 2017",GSE101498,GSM2705041,SRR5831489,GM HiChIP H3K27ac biological replicate 1,GM12878 cell line,Protein-enriched long-range contact; GM_HiChIP...,Homo sapiens,664558514,...,1,0,0,0,Joaquin,,,,pre-5/16/2022,
1,GM,"Mumbach et al., 2017",GSE101498,GSM2705042,SRR5831490,GM HiChIP H3K27ac biological replicate 2,GM12878 cell line,Protein-enriched long-range contact; GM_HiChIP...,Homo sapiens,598957472,...,1,0,0,0,Joaquin,,,,pre-5/16/2022,
2,K562,"Mumbach et al., 2017",GSE101498,GSM2705043,SRR5831491,K562 HiChIP H3K27ac biological replicate 1,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,426330778,...,1,0,0,0,Joaquin,,,,pre-5/16/2022,
3,K562,"Mumbach et al., 2017",GSE101498,GSM2705044,SRR5831492,K562 HiChIP H3K27ac biological replicate 2,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,311815900,...,1,0,0,0,Joaquin,,,,pre-5/16/2022,
4,K562,"Mumbach et al., 2017",GSE101498,GSM2705045,SRR5831493,K562 HiChIP H3K27ac biological replicate 3,K562 cell line,Protein-enriched long-range contact; K562_HiCh...,Homo sapiens,356325902,...,1,0,0,0,Joaquin,,,,pre-5/16/2022,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,K562,"Weintraub et al., 2017",GSE99519,GSM2774002,SRR6010263,HiChIP_K562_YY1,HiChIP_K562_YY1,,Homo sapiens,158111028,...,1,0,0,0,Joaquin,,,,5/16/2022,
481,iPSC-WT,"Kraft et al., 2022",GSE150906,GSM4560486,SRR11816734,iPSC_WT_A_H3K27me3_HiChIP,wild-type iPSC,H3K27me3 HiChIP library; iPSC_wt6_K1-TAAGGCGA-...,Homo sapiens,195443840,...,1,0,0,0,Joaquin,,,,6/16/2022,
482,iPSC-WT,"Kraft et al., 2022",GSE150906,GSM4560487,SRR11816735,iPSC_WT_B_H3K27me3_HiChIP,wild-type iPSC,H3K27me3 HiChIP library; iPSC_wt7_K3-AGGCAGAA-...,Homo sapiens,28509234,...,1,0,0,0,Joaquin,,,,6/16/2022,
483,iPSC-MUT,"Kraft et al., 2022",GSE150906,GSM4560488,SRR11816736,iPSC_MUT_A_H3K27me3_HiChIP,RNA-binding deficient EZH2 mutant iPSC,H3K27me3 HiChIP library; iPSC_mut18_K5-GGACTCC...,Homo sapiens,309053672,...,1,0,0,0,Joaquin,,,,6/16/2022,


In [6]:
# extract those samples which are ready for post-hicpro analyses and processing
# A sample is ready when all three status flags equal 1. The flags are coerced
# to numbers first so that both 1 and '1' match: the original code compared
# 'HiCPro Status' against the string '1' (presumably because that column is
# read as text), which would silently select nothing if pandas ever inferred
# the column as numeric. Non-numeric entries become NaN and never match.
status_cols = ['Start Processing', 'Download Status', 'HiCPro Status']
status_flags = df[status_cols].apply(pd.to_numeric, errors='coerce')
is_ready = (status_flags == 1).all(axis=1)

# extract only the columns needed
major_cols = ['Sample Name (as used in the server)',
              'GSE ID',
              'Organism',
              'Biological Replicate Serial No',
              'ChIP-seq Pull Down',
              'Restriction Enzyme']

# Select rows and columns in one step and take an explicit copy: later cells
# assign into ready_df, and writing into a slice of df would raise
# SettingWithCopyWarning (and may or may not touch df itself).
ready_df = df.loc[is_ready, major_cols].copy()

In [7]:
ready_df

Unnamed: 0,Sample Name (as used in the server),GSE ID,Organism,Biological Replicate Serial No,ChIP-seq Pull Down,Restriction Enzyme
0,GM,GSE101498,Homo sapiens,1,H3K27ac,MboI
1,GM,GSE101498,Homo sapiens,2,H3K27ac,MboI
2,K562,GSE101498,Homo sapiens,1,H3K27ac,MboI
3,K562,GSE101498,Homo sapiens,2,H3K27ac,MboI
4,K562,GSE101498,Homo sapiens,3,H3K27ac,MboI
...,...,...,...,...,...,...
480,K562,GSE99519,Homo sapiens,1,YY1,MboI
481,iPSC-WT,GSE150906,Homo sapiens,1,H3K27me3,MboI
482,iPSC-WT,GSE150906,Homo sapiens,1,H3K27me3,MboI
483,iPSC-MUT,GSE150906,Homo sapiens,1,H3K27me3,MboI


In [8]:
# capitalizes organism
def parse_organism(string):
    """Capitalize every whitespace-separated word and join them with '_'.

    Parameters
    ----------
    string : str
        Organism name, e.g. ``'Homo sapiens'``.

    Returns
    -------
    str
        Each word passed through ``str.capitalize`` (first letter upper-case,
        remainder lower-case), joined by underscores, e.g. ``'Homo_Sapiens'``.
        An empty or whitespace-only input yields ``''``.
    """
    return '_'.join(token.capitalize() for token in string.split())

# Normalize the organism names element-wise with parse_organism
# (e.g. 'Homo sapiens' -> 'Homo_Sapiens') and write them back to the column.
organism_values = ready_df.loc[:, 'Organism']
ready_df.loc[:, 'Organism'] = organism_values.map(parse_organism)

In [9]:
# getting the sample names
# Standardized name layout:
#   <sample name>.<GSE ID>.<organism>.<antibody target>.b<biological replicate>
sample_name_template = '{sample_name}.{gse_id}.{organism}.{antibody_target}.b{biological_rep}'

# Fields are looked up by column label. Positional lookups such as sr[0] on a
# string-labelled row are deprecated in recent pandas releases and would pick
# up the wrong field if the column order ever changed.
sample_names = [
    sample_name_template.format(
        sample_name=sr['Sample Name (as used in the server)'],
        gse_id=sr['GSE ID'],
        organism=sr['Organism'],
        antibody_target=sr['ChIP-seq Pull Down'],
        biological_rep=sr['Biological Replicate Serial No'],
    )
    for _, sr in ready_df.iterrows()
]

# Stored under 'sample_name' for now; the next cell relabels every column.
ready_df.loc[:, 'sample_name'] = sample_names

In [10]:
# renaming the columns for easy computational use
# Labels are assigned by position, so this list must follow the current
# column order of ready_df exactly (six selected columns + the new name column).
renamed_columns = [
    'sample_name',         # was 'Sample Name (as used in the server)'
    'gse_id',              # was 'GSE ID'
    'organism',            # was 'Organism'
    'bio_rep',             # was 'Biological Replicate Serial No'
    'antibody_target',     # was 'ChIP-seq Pull Down'
    'restriction_enzyme',  # was 'Restriction Enzyme'
    'std_sample_name',     # was 'sample_name' (the dot-joined standardized name)
]
ready_df.columns = renamed_columns

In [11]:
ready_df

Unnamed: 0,sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,std_sample_name
0,GM,GSE101498,Homo_Sapiens,1,H3K27ac,MboI,GM.GSE101498.Homo_Sapiens.H3K27ac.b1
1,GM,GSE101498,Homo_Sapiens,2,H3K27ac,MboI,GM.GSE101498.Homo_Sapiens.H3K27ac.b2
2,K562,GSE101498,Homo_Sapiens,1,H3K27ac,MboI,K562.GSE101498.Homo_Sapiens.H3K27ac.b1
3,K562,GSE101498,Homo_Sapiens,2,H3K27ac,MboI,K562.GSE101498.Homo_Sapiens.H3K27ac.b2
4,K562,GSE101498,Homo_Sapiens,3,H3K27ac,MboI,K562.GSE101498.Homo_Sapiens.H3K27ac.b3
...,...,...,...,...,...,...,...
480,K562,GSE99519,Homo_Sapiens,1,YY1,MboI,K562.GSE99519.Homo_Sapiens.YY1.b1
481,iPSC-WT,GSE150906,Homo_Sapiens,1,H3K27me3,MboI,iPSC-WT.GSE150906.Homo_Sapiens.H3K27me3.b1
482,iPSC-WT,GSE150906,Homo_Sapiens,1,H3K27me3,MboI,iPSC-WT.GSE150906.Homo_Sapiens.H3K27me3.b1
483,iPSC-MUT,GSE150906,Homo_Sapiens,1,H3K27me3,MboI,iPSC-MUT.GSE150906.Homo_Sapiens.H3K27me3.b1


In [12]:
# reorder the columns
# Standardized name first, original sample name last.
reorder = ['std_sample_name', 'gse_id', 'organism', 'bio_rep',
           'antibody_target', 'restriction_enzyme', 'sample_name']
ready_df = ready_df.loc[:, reorder]

# Keep a single row for each distinct combination of the columns above.
final_df = ready_df.drop_duplicates()

In [17]:
# Write the samplesheet with its header row (tab-separated, index omitted).
header_output = '{}.with_header.tsv'.format(output_prefix)
final_df.to_csv(path_or_buf=header_output, sep='\t', index=False, header=True)

In [18]:
# Write the same samplesheet again without the header row
# (tab-separated, index omitted).
without_header_output = '{}.without_header.tsv'.format(output_prefix)
final_df.to_csv(path_or_buf=without_header_output, sep='\t', index=False, header=False)

In [21]:
final_df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name
0,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,Homo_Sapiens,1,CTCF,MboI,A673_SA1m1
1,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,Homo_Sapiens,2,CTCF,MboI,A673_SA1m1
2,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,Homo_Sapiens,1,CTCF,MboI,A673_SA2m1
3,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,Homo_Sapiens,2,CTCF,MboI,A673_SA2m1
4,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b3,GSE133227,Homo_Sapiens,3,CTCF,MboI,A673_SA2m1
...,...,...,...,...,...,...,...
91,cbCD34+-HSPC.GSE165207.Homo_Sapiens.H3K27ac.b1,GSE165207,Homo_Sapiens,1,H3K27ac,Arima,cbCD34+-HSPC
96,DND41.GSE165207.Homo_Sapiens.H3K27ac.b1,GSE165207,Homo_Sapiens,1,H3K27ac,Arima,DND41
98,DND41.GSE165207.Homo_Sapiens.H3K27ac.b2,GSE165207,Homo_Sapiens,2,H3K27ac,Arima,DND41
100,Jurkat.GSE165207.Homo_Sapiens.H3K27ac.b1,GSE165207,Homo_Sapiens,1,H3K27ac,Arima,Jurkat
