### Usage

1. Download the HiChIP Tracker to your personal clone of the `hichip-db-loop-calling` repo as a `.tsv` file and store in `results/samplesheets/post-hicpro` (create this folder if it does not exist). Name using the following format: `YYYY.MM.DD.HH.NN.post-hicpro.google-samplesheet.tsv`. Note: always make sure to use two digits for month (MM), day (DD), hour (HH) and minute (NN).<br>

2. Save this date into the tracker package under: `workflow/scripts/trackers/tracker/__init__.py`

3. Convert this Google based samplesheet into the post-hicpro samplesheet by running this Jupyter Notebook: `hichip-db-loop-calling/workflow/scripts/trackers/converter.google_to_post-hicpro_samplesheet`

4. Update the softlink for `current-post-hicpro-without-header.tsv` within `results/samplesheets/post-hicpro`:  by using `ln -s -r -f YYYY.MM.DD.HH.NN.post-hicpro.samplesheet.without_header.tsv current-post-hicpro-without-header.tsv` where HH is hour and NN is minute in military time

5. You are all set to run the FitHiChIP peak calling, FitHiChIP loop calling, and HiCCUPS scripts using the indicies in `current-post-hicpro-without-header.tsv` corresponding to the samples you want to run! 


In [1]:
import os
import sys
import pandas as pd
import tracker
latest_date = tracker.post_hicpro_processing_dates[-1]
os.chdir('/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/')

In [2]:
latest_date

'2023.04.04.13.39'

In [56]:
# setting input and output with jupyter notebook in context 
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/post-hicpro/{}.post-hicpro.google-samplesheet.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/post-hicpro/{}.post-hicpro.samplesheet'.format(latest_date)
else:
    input_fn = sys.argv[1]
    output_prefix = sys.argv[2]

In [57]:
# loading the samplesheet
df = pd.read_table(input_fn, skiprows=0)

In [58]:
df

Unnamed: 0,Sample Name (as used in the server),Cellosaurus ID,Study,GSE ID,GSM ID,SRR ID,GEO Title,GEO Source,GEO Description,Organism,...,Download/HiC-Pro Operator,Chip-Seq Peaks Operator,HiChIP Peaks/Loops Operator,Priority,Cluster Path/Branch,Date Added,Cell Category,Cell Type,Cell Line or Primary,Comments
0,3134_siCTRL_1hr_Dex,CVCL_H641,"Rinaldi et al., 2022",GSE162617,GSM4955433,SRR13192949,siCTRL 1h Dex rep#1 GR HiCHIP,cell line derived from mouse mammary epithelia...,,Mus musculus,...,Kyra,,,,,5/16/2022,,,,
1,3134_siNIPBL_1hr_Dex,CVCL_H641,"Rinaldi et al., 2022",GSE162617,GSM4955434,SRR13192950,siNIPBL 1h Dex rep#1 GR HiCHIP,cell line derived from mouse mammary epithelia...,,Mus musculus,...,Kyra,,,,,5/16/2022,,,,
2,3134_WT,CVCL_H641,"Rinaldi et al., 2022",GSE162617,GSM4955432,SRR13192948,untreated rep1 GR HiChIP,cell line derived from mouse mammary epithelia...,,Mus musculus,...,Kyra,,,,,5/16/2022,,,,
3,3T3,CVCL_0594,"Tsao et al., 2022",GSE192387,GSM5746145,SRR17296607,HiChIP_3T3_Batf,NIH/3T3 Fibroblasts,HiChIP with BATF antibody on NIH/3T3 fibroblas...,Mus musculus,...,Kyra,,,,,5/16/2022,,,,
4,3T3,CVCL_0594,"Tsao et al., 2022",GSE192387,GSM5746148,SRR17296610,HiChIP_3T3_CTCF,NIH/3T3 Fibroblasts,HiChIP with CTCF antibody on NIH/3T3 fibroblasts,Mus musculus,...,Kyra,,,,,5/16/2022,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,Treg_WTGFP,,"Ramirez et al., 2022",GSE189442,GSM5701997,SRR17021172,H3K27Ac.HiChIP.WTGFP#2,spleen,,Mus musculus,...,Kyra,,,,,5/16/2022,,,,
242,Villus,,"Chen et al., 2021",GSE148691,GSM4476739,SRR11548368,H3K4me3_HiChIP_WT_Villus_rep1,WT adult duodenal villus cells,H3K4me3_HiChIP_WT_Villus_CombinedReplicates.in...,Mus musculus,...,Kyra,,,,,5/16/2022,,,,
243,Villus,,"Chen et al., 2021",GSE148691,GSM4476740,SRR11548369,H3K4me3_HiChIP_WT_Villus_rep2,WT adult duodenal villus cells,H3K4me3_HiChIP_WT_Villus_CombinedReplicates.in...,Mus musculus,...,Kyra,,,,,5/16/2022,,,,
244,Villus_Hnf4DKO,,"Chen et al., 2021",GSE148691,GSM4476743,SRR11548372,H3K4me3_HiChIP_Hnf4DKO_Villus_rep1,Hnf4DKO adult duodenal villus cells,H3K4me3_HiChIP_Hnf4DKO_Villus_CombinedReplicat...,Mus musculus,...,Kyra,,,,,5/16/2022,,,,


In [59]:
# extract those samples which are ready for post-hicpro analyses and processing
ready_df = df.loc[(df['Download Status'] == 1)]

# extract only the columns needed
major_cols = ['Sample Name (as used in the server)', 'Cellosaurus ID',
 'GSE ID',
 'Organism',
 'Biological Replicate Serial No',
 'ChIP-seq Pull Down',
 'Restriction Enzyme']
ready_df = ready_df[major_cols]

In [60]:
ready_df

Unnamed: 0,Sample Name (as used in the server),Cellosaurus ID,GSE ID,Organism,Biological Replicate Serial No,ChIP-seq Pull Down,Restriction Enzyme
0,3134_siCTRL_1hr_Dex,CVCL_H641,GSE162617,Mus musculus,1,GR,MboI
1,3134_siNIPBL_1hr_Dex,CVCL_H641,GSE162617,Mus musculus,1,GR,MboI
2,3134_WT,CVCL_H641,GSE162617,Mus musculus,1,GR,MboI
3,3T3,CVCL_0594,GSE192387,Mus musculus,1,BATF,MboI
4,3T3,CVCL_0594,GSE192387,Mus musculus,1,CTCF,MboI
...,...,...,...,...,...,...,...
241,Treg_WTGFP,,GSE189442,Mus musculus,2,H3K27ac,MboI
242,Villus,,GSE148691,Mus musculus,1,H3K4me3,MboI
243,Villus,,GSE148691,Mus musculus,1,H3K4me3,MboI
244,Villus_Hnf4DKO,,GSE148691,Mus musculus,1,H3K4me3,MboI


In [19]:
check = "init.hichip_sample.mm10.insert.tsv"
check_df = pd.read_table(check)
check_geoid = check_df["geo_id"].to_list()
my_geoid = ready_df["gse_id"].to_list()

In [34]:
len(check_geoid)

234

In [35]:
len(my_geoid)

242

In [66]:
check_dict = {}
my_dict = {}
for i in check_geoid:
    if i not in check_dict:
        check_dict[i] = 1
    else:
        check_dict[i] = check_dict[i] + 1
for i in my_geoid:
    if i not in my_dict:
        my_dict[i] = 1
    else:
        my_dict[i] = my_dict[i] + 1

In [65]:
check_dict

{'GSE101498': 10,
 'GSE110898': 2,
 'GSE112176': 4,
 'GSE112717': 4,
 'GSE113339': 37,
 'GSE115524': 8,
 'GSE121671': 2,
 'GSE126362': 12,
 'GSE135296': 2,
 'GSE141113': 8,
 'GSE141847': 10,
 'GSE142004': 8,
 'GSE145793': 4,
 'GSE147919': 8,
 'GSE148691': 8,
 'GSE150536': 4,
 'GSE150906': 2,
 'GSE153884': 8,
 'GSE157666': 3,
 'GSE159629': 4,
 'GSE160656': 13,
 'GSE162617': 3,
 'GSE166177': 6,
 'GSE178344': 2,
 'GSE189442': 14,
 'GSE192387': 6,
 'GSE193079': 6,
 'GSE80820': 10,
 'GSE99519': 26}

In [67]:
my_dict

{'GSE101498': 10,
 'GSE110898': 2,
 'GSE112176': 4,
 'GSE112717': 4,
 'GSE113339': 37,
 'GSE115524': 12,
 'GSE121671': 2,
 'GSE126362': 12,
 'GSE135296': 2,
 'GSE141113': 8,
 'GSE141847': 10,
 'GSE142004': 8,
 'GSE145793': 4,
 'GSE147919': 8,
 'GSE148691': 8,
 'GSE150536': 4,
 'GSE150906': 2,
 'GSE153884': 8,
 'GSE157666': 3,
 'GSE159629': 4,
 'GSE160656': 13,
 'GSE162617': 3,
 'GSE166177': 6,
 'GSE178344': 2,
 'GSE189442': 14,
 'GSE192387': 6,
 'GSE193079': 6,
 'GSE80820': 14,
 'GSE99519': 26}

In [8]:
# capitalizes organism
def parse_organism(string):
    new_words = []
    for word in string.split():
        new_words.append(word.capitalize())
    new_string = '_'.join(new_words)
    return(new_string)

ready_df.loc[:, 'Organism'] = ready_df.loc[:, 'Organism'].apply(parse_organism)

In [9]:
# getting the sample names
sample_names = []
for i, sr in ready_df.iterrows():
    sample_name = '{sample_name}.{gse_id}.{organism}.{antibody_target}.b{biological_rep}'
    sample_name = sample_name.format(sample_name=sr[0],
                                     gse_id=sr[2],
                                     organism=sr[3],
                                     antibody_target=sr[6], 
                                     biological_rep=sr[4])
    sample_names.append(sample_name)
ready_df.loc[:, 'sample_name'] = sample_names

In [10]:
# renaming the columns for easy computational use 
ready_df.columns = ['sample_name', 'cellosaurus_id', 'gse_id',
                    'organism', 'bio_rep', 'antibody_target',
                    'restriction_enzyme', 'std_sample_name']

In [11]:
# reorder the columns
reorder = ['std_sample_name',
             'cellosaurus_id',
             'gse_id',
             'organism',
             'bio_rep',
             'antibody_target',
             'restriction_enzyme',
             'sample_name']
ready_df = ready_df[reorder]
final_df = ready_df.drop_duplicates()

In [12]:
final_df

Unnamed: 0,std_sample_name,cellosaurus_id,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name
0,3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.Mbo...,CVCL_H641,GSE162617,Mus_Musculus,1,GR,MboI,3134_siCTRL_1hr_Dex
1,3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.Mb...,CVCL_H641,GSE162617,Mus_Musculus,1,GR,MboI,3134_siNIPBL_1hr_Dex
2,3134_WT.GSE162617.Mus_Musculus.MboI.b1,CVCL_H641,GSE162617,Mus_Musculus,1,GR,MboI,3134_WT
3,3T3.GSE192387.Mus_Musculus.MboI.b1,CVCL_0594,GSE192387,Mus_Musculus,1,BATF,MboI,3T3
4,3T3.GSE192387.Mus_Musculus.MboI.b1,CVCL_0594,GSE192387,Mus_Musculus,1,CTCF,MboI,3T3
...,...,...,...,...,...,...,...,...
239,Treg_WT.GSE112176.Mus_Musculus.MboI.b2,,GSE112176,Mus_Musculus,2,H3K27ac,MboI,Treg_WT
240,Treg_WTGFP.GSE189442.Mus_Musculus.MboI.b1,,GSE189442,Mus_Musculus,1,H3K27ac,MboI,Treg_WTGFP
241,Treg_WTGFP.GSE189442.Mus_Musculus.MboI.b2,,GSE189442,Mus_Musculus,2,H3K27ac,MboI,Treg_WTGFP
242,Villus.GSE148691.Mus_Musculus.MboI.b1,,GSE148691,Mus_Musculus,1,H3K4me3,MboI,Villus


In [13]:
header_output = '{}.with_header.tsv'.format(output_prefix)
final_df.to_csv(header_output, header=True, index=False, sep='\t')

In [14]:
without_header_output = '{}.without_header.tsv'.format(output_prefix)
final_df.to_csv(without_header_output, header=False, index=False, sep='\t')