In [10]:
import pandas as pd
import string
import os

In [11]:
# key: row number in ChIP-seq tracker
# value: filepath of fastq file
samples = {
    187: "/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/data/pieqtl_chipseq_hg38/CD4N_merged_donors_hg38.fq",
    188: "/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/data/pieqtl_chipseq_hg38/CD8N_merged_donors_hg38.fq",
    189: "/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/data/pieqtl_chipseq_hg38/Mono_merged_donors_hg38.fq",
    190: "/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/data/pieqtl_chipseq_hg38/NB_merged_donors_hg38.fq",
    192: "/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/data/pieqtl_chipseq_hg38/NCM_merged_donors_hg38.fq",
    191: "/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/data/pieqtl_chipseq_hg38/NK_merged_donors_hg38.fq"
}

In [12]:
def get_sample_foldername(row_number):
    df = pd.read_csv('../../../results/samplesheets/fastq/chipseq_human_tracker.tsv',sep='\t')
    
    # reset dataframe row numbers to match row numbers on Google sheets
    df = df.reset_index(drop=True)
    df.index += 2
    
    # construct the name of the sample using the naming scheme:
    # {sample_name}.{gse_id}.{organism}.{target of antibody}.b{biological_rep}
    
    name = df['Sample Name'].loc[row_number]
    gse_id = df['GSE ID of Corresponding HiChIP Data'].loc[row_number]
    
    organism = df['Organism'].loc[row_number]
    # capitalize each word and replace each space with an underscore
    organism = string.capwords(organism).replace(" ", "_")
    
    target_of_antibody = df['ChIP-seq Pull Down'].loc[row_number]
    biological_rep = df['Replicate Serial No'].loc[row_number]
    
    sample_foldername = name + "." + gse_id + "." + organism + "." + target_of_antibody + ".b" + str(biological_rep)
    return sample_foldername
    

In [13]:
! echo "Commands to run in hichip-db-loop-calling folder:"
! echo

for key in samples:
    # obtain name of folder containing sample
    sample_foldername = get_sample_foldername(key)
    
    # create the folder for the sample
    folderbasepath = "results/fastqs/chipseq/"
    
    ! echo "mkdir -p {folderbasepath}/{sample_foldername}"
    
    sample_name = os.path.basename(samples[key]).removesuffix(".fq")
    
    # create a gzipped file of the sample in the newly created folder
    ! echo "gzip -c {samples[key]} > {folderbasepath}/{sample_foldername}/{sample_name}.fastq.gz"
    ! echo

Commands to run in hichip-db-loop-calling folder:

mkdir -p results/fastqs/chipseq//CD4N_merged_donors_hg38.phs001703v3p1.Homo_Sapiens.H3K27ac.b1
gzip -c /mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/data/pieqtl_chipseq_hg38/CD4N_merged_donors_hg38.fq > results/fastqs/chipseq//CD4N_merged_donors_hg38.phs001703v3p1.Homo_Sapiens.H3K27ac.b1/CD4N_merged_donors_hg38.fastq.gz

mkdir -p results/fastqs/chipseq//CD8N_merged_donors_hg38.phs001703v3p1.Homo_Sapiens.H3K27ac.b1
gzip -c /mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/data/pieqtl_chipseq_hg38/CD8N_merged_donors_hg38.fq > results/fastqs/chipseq//CD8N_merged_donors_hg38.phs001703v3p1.Homo_Sapiens.H3K27ac.b1/CD8N_merged_donors_hg38.fastq.gz

mkdir -p results/fastqs/chipseq//Mono_merged_donors_hg38.phs001703v3p1.Homo_Sapiens.H3K27ac.b1
gzip -c /mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/data/pieqtl_chipseq_hg38/Mono_merged_donors_hg38.fq > results/fastqs/chipseq//Mono_merged_donors_hg38.p