### Usage

1. Download the HiChIP Tracker as a `.tsv` file into your personal clone of the `hichip-db-loop-calling` repo and store it in `results/samplesheets/post-hicpro` (create this folder if it does not exist). Name the file using the following format: `YYYY.MM.DD.HH.NN.post-hicpro.google-samplesheet.tsv`. Note: always use two digits for the month (MM), day (DD), hour (HH), and minute (NN).<br>

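2. Save this date (the `YYYY.MM.DD.HH.NN` part of the file name) into the tracker package under `workflow/scripts/trackers/tracker/__init__.py`. This notebook uses the last entry of `tracker.post_hicpro_processing_dates`, so the new date must be the final entry.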

3. Convert this Google based samplesheet into the post-hicpro samplesheet by running this Jupyter Notebook: `hichip-db-loop-calling/workflow/scripts/trackers/converter.google_to_post-hicpro_samplesheet`

4. Update the softlink for `current-post-hicpro-without-header.tsv` within `results/samplesheets/post-hicpro` by running `ln -s -r -f YYYY.MM.DD.HH.NN.post-hicpro.samplesheet.without_header.tsv current-post-hicpro-without-header.tsv`, where HH is the hour and NN is the minute in 24-hour (military) time.

5. You are all set to run the FitHiChIP peak calling, FitHiChIP loop calling, and HiCCUPS scripts using the indices in `current-post-hicpro-without-header.tsv` that correspond to the samples you want to run!


In [1]:
import os
import sys
import pandas as pd
import tracker
# most recent processing date: the last entry of the tracker package's date list
latest_date = tracker.post_hicpro_processing_dates[-1]
# switch into the repo directory so the relative 'results/...' paths used below resolve
# NOTE(review): this absolute path is user-specific; other users must adjust it
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')

In [2]:
latest_date

'2022.08.02.10.00'

In [3]:
# pick the input samplesheet and the output prefix depending on how this is run
if 'ipykernel_launcher.py' not in sys.argv[0]:
    # executed as a script: both paths are given on the command line
    input_fn = sys.argv[1]
    output_prefix = sys.argv[2]
else:
    # executed inside Jupyter: derive both paths from the latest tracker date
    input_fn = 'results/samplesheets/post-hicpro/{}.post-hicpro.google-samplesheet.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/post-hicpro/{}.post-hicpro.samplesheet'.format(latest_date)

In [4]:
# read the tab-separated Google samplesheet into a dataframe
df = pd.read_csv(input_fn, sep='\t')

In [5]:
df

Unnamed: 0,Sample Name (as used in the server),Study,GSE ID,GSM ID,SRR ID,GEO Title,GEO Source,GEO Description,Organism,Number of Reads,...,FitHiChIP FitHiChIP L25 Loops Status,"Chr1 HiCCUPS 5,10,25 Loops Status","All Chrs HiCCUPS 5,10,25 Loops Status",Download/HiC-Pro Operator,Chip-Seq Peaks Operator,HiChIP Peaks/Loops Operator,Priority,Cluster Path/Branch,Date Added,Comments
0,293T,"Li et al., 2020",GSE128106,GSM3664990,SRR8707613,HiChIP YY1 rep1,293T YY1-TAPTAG,processed data file: HiChIP_Kin_Interaction_Ma...,Homo sapiens,220406226,...,0,1,-1,Joaquin,Nikhil,Kyra,,,pre-5/16/2022,
1,293T,"Li et al., 2020",GSE128106,GSM3664991,SRR8707614,HiChIP YY1 rep2,293T YY1-TAPTAG,processed data file: HiChIP_Kin_Interaction_Ma...,Homo sapiens,234171286,...,0,1,-1,Joaquin,Nikhil,Kyra,,,pre-5/16/2022,
2,293T,"Li et al., 2020",GSE128106,GSM3664992,SRR8707615,HiChIP YY1 rep3,293T YY1-TAPTAG,processed data file: HiChIP_Kin_Interaction_Ma...,Homo sapiens,21901000,...,0,1,-1,Joaquin,Nikhil,Kyra,,,pre-5/16/2022,
3,293T,"Li et al., 2020",GSE128106,GSM3664993,SRR8707616,HiChIP YY1 rep4,293T YY1-TAPTAG,processed data file: HiChIP_Kin_Interaction_Ma...,Homo sapiens,18913262,...,0,1,-1,Joaquin,Nikhil,Kyra,,,pre-5/16/2022,
4,293T-PDS,"Li et al., 2020",GSE128106,GSM3664994,SRR8707617,HiChIP YY1-PDS rep1,293T YY1-TAPTAG,processed data file: HiChIP_PDS_Interaction_Ma...,Homo sapiens,50913880,...,0,1,-1,Joaquin,Nikhil,Kyra,,,pre-5/16/2022,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,VCAP-DHT-2hr,"Guo et al., 2021",GSE157107,GSM4782223,SRR12643179,VCAP_H3K27ac_HiChIP_DHT_2hr,VCaP cells,H3K27ac HiChIP in VCaP cells,Homo sapiens,7398342,...,0,1,-1,Joaquin,Nikhil,Kyra,,,5/16/2022,
481,VCaP-DMSO-4h,"Xiao et al., 2022",GSE171591,GSM5229035,SRR14159848,VCaP_DMSO_4h_H3K4me3 (HiChIP-seq),VCaP,,Homo sapiens,251548098,...,-1,1,-1,Joaquin,,Kyra,,,5/16/2022,
482,VCaP-DMSO-4h,"Xiao et al., 2022",GSE171591,GSM5229037,SRR14159850,VCaP_DMSO_4h_H3K27Ac (HiChIP-seq),VCaP,,Homo sapiens,177992192,...,-1,1,-1,Joaquin,,Kyra,,,5/16/2022,
483,VCaP-DMSO-4h,"Xiao et al., 2022",GSE171591,GSM5229039,SRR14159852,VCaP_DMSO_4h_CTCF (HiChIP-seq),VCaP,,Homo sapiens,179864588,...,-1,1,1,Joaquin,,Kyra,,,5/16/2022,


In [6]:
# select the samples to process: rows whose 'Priority' column equals 'Immune'
# NOTE(review): these were previously described as the samples "ready for
# post-hicpro analyses"; confirm that 'Immune' priority is the intended criterion
ready_df = df.loc[(df['Priority'] == 'Immune')]

# extract only the columns needed; the order matters because later cells
# rename the columns by position
major_cols = ['Sample Name (as used in the server)',
              'GSE ID',
              'Organism',
              'Biological Replicate Serial No',
              'ChIP-seq Pull Down',
              'Restriction Enzyme']

# take an explicit copy so the column assignments in the following cells modify
# an independent dataframe instead of a slice of `df` (avoids pandas'
# SettingWithCopyWarning)
ready_df = ready_df[major_cols].copy()

In [7]:
ready_df

Unnamed: 0,Sample Name (as used in the server),GSE ID,Organism,Biological Replicate Serial No,ChIP-seq Pull Down,Restriction Enzyme
98,GM12878,GSE101498,Homo sapiens,1,H3K27ac,MboI
99,GM12878,GSE101498,Homo sapiens,2,H3K27ac,MboI
100,GM12878,GSE115524,Homo sapiens,1,CTCF,MboI
101,GM12878,GSE115524,Homo sapiens,2,CTCF,MboI
102,GM12878,GSE80820,Homo sapiens,1,SMC1A,MboI
103,GM12878,GSE80820,Homo sapiens,1,SMC1A,MboI
104,GM12878,GSE80820,Homo sapiens,2,SMC1A,MboI
105,GM12878,GSE80820,Homo sapiens,2,SMC1A,MboI
121,H9,GSE105028,Homo sapiens,1,Rad21,DpnII
122,H9,GSE105028,Homo sapiens,1,Rad21,DpnII


In [8]:
# capitalizes organism
def parse_organism(string):
    """Capitalize every whitespace-separated word and join the words with '_'.

    Example: 'Homo sapiens' -> 'Homo_Sapiens'
    """
    return '_'.join(word.capitalize() for word in string.split())

# replace each 'Organism' value with its capitalized, underscore-joined form
ready_df.loc[:, 'Organism'] = ready_df.loc[:, 'Organism'].apply(parse_organism)

In [9]:
# getting the sample names: one standardized name per row, built as
# <sample>.<GSE ID>.<organism>.<antibody target>.b<biological replicate>
sample_names = []
for i, sr in ready_df.iterrows():
    sample_name = '{sample_name}.{gse_id}.{organism}.{antibody_target}.b{biological_rep}'
    # look the fields up by column label rather than by position: integer keys
    # on a label-indexed Series are deprecated in recent pandas and would
    # silently pick the wrong field if the column order ever changed
    sample_name = sample_name.format(sample_name=sr['Sample Name (as used in the server)'],
                                     gse_id=sr['GSE ID'],
                                     organism=sr['Organism'],
                                     antibody_target=sr['ChIP-seq Pull Down'],
                                     biological_rep=sr['Biological Replicate Serial No'])
    sample_names.append(sample_name)

# store the standardized names as a new last column
ready_df.loc[:, 'sample_name'] = sample_names

In [10]:
# renaming the columns for easy computational use
# the new names are assigned by position, so this relies on the current column
# order: the six selected tracker columns followed by the 'sample_name' column
# added in the previous cell (which becomes 'std_sample_name', while the
# original tracker sample name becomes 'sample_name')
ready_df.columns = ['sample_name', 'gse_id',
                    'organism', 'bio_rep', 'antibody_target',
                    'restriction_enzyme', 'std_sample_name']

In [11]:
# put the standardized sample name first and the raw sample name last
reorder = [
    'std_sample_name',
    'gse_id',
    'organism',
    'bio_rep',
    'antibody_target',
    'restriction_enzyme',
    'sample_name',
]
ready_df = ready_df.loc[:, reorder]

# collapse rows that are identical across every column
final_df = ready_df.drop_duplicates()

In [12]:
final_df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name
98,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,Homo_Sapiens,1,H3K27ac,MboI,GM12878
99,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,Homo_Sapiens,2,H3K27ac,MboI,GM12878
100,GM12878.GSE115524.Homo_Sapiens.CTCF.b1,GSE115524,Homo_Sapiens,1,CTCF,MboI,GM12878
101,GM12878.GSE115524.Homo_Sapiens.CTCF.b2,GSE115524,Homo_Sapiens,2,CTCF,MboI,GM12878
102,GM12878.GSE80820.Homo_Sapiens.SMC1A.b1,GSE80820,Homo_Sapiens,1,SMC1A,MboI,GM12878
104,GM12878.GSE80820.Homo_Sapiens.SMC1A.b2,GSE80820,Homo_Sapiens,2,SMC1A,MboI,GM12878
121,H9.GSE105028.Homo_Sapiens.Rad21.b1,GSE105028,Homo_Sapiens,1,Rad21,DpnII,H9
125,H9.GSE105028.Homo_Sapiens.CTCF.b1,GSE105028,Homo_Sapiens,1,CTCF,DpnII,H9
128,H9.GSE105028.Homo_Sapiens.OCT4.b1,GSE105028,Homo_Sapiens,1,OCT4,DpnII,H9
130,H9.GSE105028.Homo_Sapiens.NANOG.b1,GSE105028,Homo_Sapiens,1,NANOG,DpnII,H9


In [15]:
# write the samplesheet as a tab-separated file that keeps the column names
header_output = '{}.with_header.tsv'.format(output_prefix)
final_df.to_csv(
    header_output,
    sep='\t',
    index=False,
    header=True,
)

In [16]:
# write the same table again without the header row
without_header_output = '{}.without_header.tsv'.format(output_prefix)
final_df.to_csv(
    without_header_output,
    sep='\t',
    index=False,
    header=False,
)