This notebook writes the `metadata.tsv` file used as input for the VSN-Pipelines scATAC-seq preprocessing workflow (https://vsn-pipelines.readthedocs.io/en/latest/scatac-seq.html).

In [1]:
import os
import glob
import pandas as pd

%load_ext lab_black

In [2]:
# Collect every FASTQ file under the per-depth directories (relative to the
# current working directory), in sorted order.
filepaths = glob.glob("*k/*k_fastq/*")
filepaths.sort()

# Keep only the final path component of each hit.
filenames = [path.rsplit("/", 1)[-1] for path in filepaths]

# A sample name is everything before the first "__" in the file name;
# duplicates (one per read file / depth) are collapsed and the result is sorted.
samplenames = sorted({name.partition("__")[0] for name in filenames})

In [3]:
# One row per sample, indexed by sample name; the "sample_name" column is
# kept alongside the index (drop=False) so it stays available as a column.
metadata = pd.DataFrame(samplenames, columns=["sample_name"]).set_index(
    "sample_name", drop=False
)

In [4]:
# Maps each sample name to the technology label written into the "technology"
# column of metadata.tsv. Groups are listed in the order in which rows should
# appear, because the mapping is iterated in insertion order when writing.
tech_dict = {
    sample_name: technology
    for technology, sample_names in (
        (
            "biorad",
            (
                "BIO_ddseq_m1c1",
                "BIO_ddseq_m1c2",
                "BIO_ddseq_m1c3",
                "BIO_ddseq_m1c4",
                "BIO_ddseq_m1c5",
                "BIO_ddseq_m1c6",
                "BIO_ddseq_m1c7",
                "BIO_ddseq_m1c8",
                "BIO_ddseq_m2c1",
                "BIO_ddseq_m2c2",
                "BIO_ddseq_m2c3",
                "BIO_ddseq_m2c4",
            ),
        ),
        ("s3atac", ("OHS_s3atac_mouse",)),
        ("multiome", ("TXG_10xmultiome_e18mousebrainfresh",)),
        ("atac_revcomp", ("TXG_10xv11_adultmousecortexchromiumx",)),
        ("atac", ("TXG_10xv1_adultmousefresh",)),
        (
            "atac_revcomp",
            (
                "TXG_10xv2_adultmousecortex",
                "TXG_10xv2_adultmousecortexchromiumx",
            ),
        ),
        (
            "hydrop_3x96",
            (
                "VIB_hydrop_1",
                "VIB_hydrop_2",
                "VIB_hydrop_3",
                "VIB_hydrop_4",
                "VIB_hydrop_5",
            ),
        ),
    )
    for sample_name in sample_names
}

In [5]:
# Write metadata.tsv: a header line followed by one tab-separated row for each
# (depth, sample) pair that has at least one FASTQ file in that depth's
# directory. Rows are emitted depth by depth, samples in tech_dict order.
with open("metadata.tsv", "w") as f:
    f.write(
        "sample_name\ttechnology\tfastq_PE1_path\tfastq_barcode_path\tfastq_PE2_path\n"
    )
    for depth_str in "50k 45k 40k 35k 30k 25k 20k 15k 10k 5k".split():
        fastq_repo_path = f"/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series/{depth_str}/{depth_str}_fastq"
        for sample, technology in tech_dict.items():
            # Anchor the existence check on the "__" separator that follows the
            # sample name in the file names. A bare "{sample}*" pattern would
            # also match longer names sharing the prefix (e.g.
            # "TXG_10xv2_adultmousecortex" vs "...cortexchromiumx") and could
            # emit a row whose FASTQ files do not exist at this depth.
            if not glob.glob(f"{fastq_repo_path}/{sample}__*"):
                continue

            if technology == "biorad":
                # biorad rows leave the barcode column empty and use R2 as PE2.
                read_columns = ("R1", None, "R2")
            else:
                # All other technologies: R1 = PE1, R2 = barcode, R3 = PE2.
                read_columns = ("R1", "R2", "R3")

            fastq_paths = [
                f"{fastq_repo_path}/{sample}__{read}.{depth_str}.fastq.gz"
                if read
                else ""
                for read in read_columns
            ]
            f.write(
                "\t".join([f"{sample}.{depth_str}", technology, *fastq_paths]) + "\n"
            )

In [6]:
!cat metadata.tsv

sample_name	technology	fastq_PE1_path	fastq_barcode_path	fastq_PE2_path
BIO_ddseq_m2c1.50k	biorad	/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series/50k/50k_fastq/BIO_ddseq_m2c1__R1.50k.fastq.gz		/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series/50k/50k_fastq/BIO_ddseq_m2c1__R2.50k.fastq.gz
BIO_ddseq_m2c3.50k	biorad	/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series/50k/50k_fastq/BIO_ddseq_m2c3__R1.50k.fastq.gz		/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series/50k/50k_fastq/BIO_ddseq_m2c3__R2.50k.fastq.gz
BIO_ddseq_m2c4.50k	biorad	/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series/50k/50k_fastq/BIO_ddseq_m2c4__R1.50k.fastq.gz		/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series/50k/50k_fastq/BIO_ddseq_m2c4__R2.50k.fastq.gz
OHS_s3atac_mouse.50k	s3atac	/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series/50k/50k_fastq/OHS_s3atac_mouse__R1.50k.fastq.gz	/lustre1/pro

In [None]:
# Dry run: print the rows that would go into metadata.tsv without touching the
# file. The rows are printed rather than written because the handle `f` from
# the writing cell is closed as soon as its `with` block exits, so calling
# f.write here would raise "ValueError: I/O operation on closed file".
for depth_str in "50k 45k 40k 35k 30k 25k 20k 15k 10k 5k".split():
    fastq_repo_path = f"/lustre1/project/stg_00090/scatac_benchmark/public_downsample_series/{depth_str}/{depth_str}_fastq"
    for sample in tech_dict.keys():
        # Match on the "__" separator so a sample name that is a prefix of
        # another (e.g. "TXG_10xv2_adultmousecortex" vs "...cortexchromiumx")
        # is not reported as present because of the longer-named files.
        if len(glob.glob(f"{fastq_repo_path}/{sample}__*")) > 0:
            if tech_dict[sample] != "biorad":
                # R1 = PE1, R2 = barcode, R3 = PE2.
                print(
                    f"{sample}.{depth_str}\t{tech_dict[sample]}\t{fastq_repo_path}/{sample}__R1.{depth_str}.fastq.gz\t{fastq_repo_path}/{sample}__R2.{depth_str}.fastq.gz\t{fastq_repo_path}/{sample}__R3.{depth_str}.fastq.gz"
                )
            else:
                # biorad rows leave the barcode column empty and use R2 as PE2.
                print(
                    f"{sample}.{depth_str}\t{tech_dict[sample]}\t{fastq_repo_path}/{sample}__R1.{depth_str}.fastq.gz\t\t{fastq_repo_path}/{sample}__R2.{depth_str}.fastq.gz"
                )