Writing a metadata file for the VSN scATAC-seq preprocessing pipeline (https://vsn-pipelines.readthedocs.io/en/latest/scatac-seq.html)

In [58]:
import os
import glob
import pandas as pd

%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [59]:
# Build the per-sample FASTQ table for every sample that is NOT handled by the
# dedicated BIO* / OHS* cells further down.
#
# NOTE: "[!BIO]" is a glob character class, not a prefix test: it skips every
# file whose first character is "B", "I" or "O". That excludes both the BIO*
# and the OHS* files, which are globbed separately in later cells.
filepaths = sorted(
    glob.glob(
        "/lustre1/project/stg_00090/scatac_benchmark/1_data_repository/publicdata_full_fastq/all_merged/[!BIO]*.fastq.gz"
    )
)
if not filepaths:
    # Fail with a clear message instead of an opaque column-length mismatch.
    raise FileNotFoundError("No FASTQ files matched the '[!BIO]*.fastq.gz' pattern")

filenames = [os.path.basename(x) for x in filepaths]

# Long table: one row per FASTQ file.
df_raw_filenames = pd.DataFrame(filepaths, columns=["filename"])

# Sample name = basename up to the first "." and then up to the first "__".
df_raw_filenames["sample_name"] = [
    name.split(".")[0].split("__")[0] for name in filenames
]

# Read number = the character right after the last "_R" of the *basename*.
# Parsing the basename (and the last occurrence) keeps a "_R" in a directory
# or sample name from being mistaken for the read tag.
df_raw_filenames["read"] = [name.rsplit("_R", 1)[1][0] for name in filenames]

# Explicit read -> metadata column mapping (read 2 is stored as the barcode
# FASTQ), instead of relying on the positional order of the pivoted columns.
read_to_column = {
    "1": "fastq_PE1_path",
    "2": "fastq_barcode_path",
    "3": "fastq_PE2_path",
}

# Wide table: one row per sample, one column per read.
reads_wide = df_raw_filenames.pivot(
    index="sample_name", columns="read", values="filename"
)
if list(reads_wide.columns) != list(read_to_column):
    raise ValueError(
        f"Expected reads {list(read_to_column)}, found {list(reads_wide.columns)}"
    )

samples_df = (
    reads_wide.rename(columns=read_to_column)
    .rename_axis(columns=None)  # drop the leftover "read" label on the columns
    .assign(sample_name=lambda wide: wide.index + ".FULL")
)

In [60]:
# Inspect the long-format table: one row per FASTQ file with its parsed
# sample name and read number.
df_raw_filenames

Unnamed: 0,filename,sample_name,read
0,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xmultiome_e18mousebrainfresh,1
1,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xmultiome_e18mousebrainfresh,2
2,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xmultiome_e18mousebrainfresh,3
3,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv11_adultmousecortexchromiumx,1
4,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv11_adultmousecortexchromiumx,2
5,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv11_adultmousecortexchromiumx,3
6,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv1_adultmousefresh,1
7,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv1_adultmousefresh,2
8,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv1_adultmousefresh,3
9,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv2_adultmousecortex,1


In [72]:
# Lookup from sample name (including the ".FULL" suffix) to the value written
# into the `technology` column of the metadata file.
tech_dict = {
    "TXG_10xmultiome_e18mousebrainfresh.FULL": "multiome",
    "TXG_10xv11_adultmousecortexchromiumx.FULL": "atac_revcomp",
    # NOTE(review): the only entry using "revcomp" instead of "atac_revcomp";
    # confirm this is a technology value the pipeline accepts.
    "TXG_10xv1_adultmousefresh.FULL": "revcomp",
    "TXG_10xv2_adultmousecortex.FULL": "atac_revcomp",
    "TXG_10xv2_adultmousecortexchromiumx.FULL": "atac_revcomp",
    # The five HyDrop samples all share the same technology value.
    **{
        f"VIB_hydrop_{replicate}.FULL": "hydrop_3x96"
        for replicate in range(1, 6)
    },
}

In [73]:
# Look up each sample's technology; a sample name missing from tech_dict
# raises KeyError rather than silently producing a missing value.
samples_df["technology"] = samples_df["sample_name"].map(
    lambda full_name: tech_dict[full_name]
)

In [74]:
# Inspect the per-sample table, now including the technology column.
samples_df

Unnamed: 0_level_0,fastq_PE1_path,fastq_barcode_path,fastq_PE2_path,sample_name,technology
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TXG_10xmultiome_e18mousebrainfresh,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xmultiome_e18mousebrainfresh.FULL,multiome
TXG_10xv11_adultmousecortexchromiumx,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv11_adultmousecortexchromiumx.FULL,atac_revcomp
TXG_10xv1_adultmousefresh,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv1_adultmousefresh.FULL,revcomp
TXG_10xv2_adultmousecortex,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv2_adultmousecortex.FULL,atac_revcomp
TXG_10xv2_adultmousecortexchromiumx,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv2_adultmousecortexchromiumx.FULL,atac_revcomp
VIB_hydrop_1,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_1.FULL,hydrop_3x96
VIB_hydrop_2,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_2.FULL,hydrop_3x96
VIB_hydrop_3,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_3.FULL,hydrop_3x96
VIB_hydrop_4,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_4.FULL,hydrop_3x96
VIB_hydrop_5,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_5.FULL,hydrop_3x96


In [75]:
# Build the per-sample FASTQ table for the BIO* samples. Only reads 1 and 2
# are expected here, so this table has no fastq_barcode_path column.
filepaths = sorted(
    glob.glob(
        "/lustre1/project/stg_00090/scatac_benchmark/1_data_repository/publicdata_full_fastq/all_merged/BIO*.fastq.gz"
    )
)
if not filepaths:
    # Fail with a clear message instead of an opaque column-length mismatch.
    raise FileNotFoundError("No FASTQ files matched the 'BIO*.fastq.gz' pattern")

filenames = [os.path.basename(x) for x in filepaths]

# Long table: one row per FASTQ file.
df_raw_filenames = pd.DataFrame(filepaths, columns=["filename"])

# Sample name = basename up to the first "." and then up to the first "__".
df_raw_filenames["sample_name"] = [
    name.split(".")[0].split("__")[0] for name in filenames
]

# Read number = the character right after the last "_R" of the *basename*.
# Parsing the basename (and the last occurrence) keeps a "_R" in a directory
# or sample name from being mistaken for the read tag.
df_raw_filenames["read"] = [name.rsplit("_R", 1)[1][0] for name in filenames]

# Explicit read -> metadata column mapping, instead of relying on the
# positional order of the pivoted columns.
read_to_column = {
    "1": "fastq_PE1_path",
    "2": "fastq_PE2_path",
}

# Wide table: one row per sample, one column per read.
reads_wide = df_raw_filenames.pivot(
    index="sample_name", columns="read", values="filename"
)
if list(reads_wide.columns) != list(read_to_column):
    raise ValueError(
        f"Expected reads {list(read_to_column)}, found {list(reads_wide.columns)}"
    )

samples_df_biorad = (
    reads_wide.rename(columns=read_to_column)
    .rename_axis(columns=None)  # drop the leftover "read" label on the columns
    .assign(sample_name=lambda wide: wide.index + ".FULL")
)

In [76]:
# Every sample in this table gets the same technology value.
samples_df_biorad = samples_df_biorad.assign(technology="biorad")

In [77]:
# Build the per-sample FASTQ table for the OHS* samples, all of which are
# labelled with the "s3atac" technology.
filepaths = sorted(
    glob.glob(
        "/lustre1/project/stg_00090/scatac_benchmark/1_data_repository/publicdata_full_fastq/all_merged/OHS*.fastq.gz"
    )
)
if not filepaths:
    # Fail with a clear message instead of an opaque column-length mismatch.
    raise FileNotFoundError("No FASTQ files matched the 'OHS*.fastq.gz' pattern")

filenames = [os.path.basename(x) for x in filepaths]

# Long table: one row per FASTQ file.
df_raw_filenames = pd.DataFrame(filepaths, columns=["filename"])

# Sample name = basename up to the first "." and then up to the first "__".
df_raw_filenames["sample_name"] = [
    name.split(".")[0].split("__")[0] for name in filenames
]

# Read number = the character right after the last "_R" of the *basename*.
# Parsing the basename (and the last occurrence) keeps a "_R" in a directory
# or sample name from being mistaken for the read tag.
df_raw_filenames["read"] = [name.rsplit("_R", 1)[1][0] for name in filenames]

# Explicit read -> metadata column mapping (read 2 is stored as the barcode
# FASTQ), instead of relying on the positional order of the pivoted columns.
read_to_column = {
    "1": "fastq_PE1_path",
    "2": "fastq_barcode_path",
    "3": "fastq_PE2_path",
}

# Wide table: one row per sample, one column per read.
reads_wide = df_raw_filenames.pivot(
    index="sample_name", columns="read", values="filename"
)
if list(reads_wide.columns) != list(read_to_column):
    raise ValueError(
        f"Expected reads {list(read_to_column)}, found {list(reads_wide.columns)}"
    )

samples_df_s3 = (
    reads_wide.rename(columns=read_to_column)
    .rename_axis(columns=None)  # drop the leftover "read" label on the columns
    .assign(
        sample_name=lambda wide: wide.index + ".FULL",
        technology="s3atac",
    )
)

In [78]:
# Stack the three per-technology tables row-wise. samples_df_biorad has no
# fastq_barcode_path column, so that field is left missing for its rows.
samples_df_merged = pd.concat(
    objs=[
        samples_df,
        samples_df_biorad,
        samples_df_s3,
    ],
    axis=0,
)

In [79]:
# Inspect the concatenated per-sample table.
samples_df_merged

Unnamed: 0_level_0,fastq_PE1_path,fastq_barcode_path,fastq_PE2_path,sample_name,technology
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TXG_10xmultiome_e18mousebrainfresh,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xmultiome_e18mousebrainfresh.FULL,multiome
TXG_10xv11_adultmousecortexchromiumx,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv11_adultmousecortexchromiumx.FULL,atac_revcomp
TXG_10xv1_adultmousefresh,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv1_adultmousefresh.FULL,revcomp
TXG_10xv2_adultmousecortex,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv2_adultmousecortex.FULL,atac_revcomp
TXG_10xv2_adultmousecortexchromiumx,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,TXG_10xv2_adultmousecortexchromiumx.FULL,atac_revcomp
VIB_hydrop_1,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_1.FULL,hydrop_3x96
VIB_hydrop_2,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_2.FULL,hydrop_3x96
VIB_hydrop_3,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_3.FULL,hydrop_3x96
VIB_hydrop_4,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_4.FULL,hydrop_3x96
VIB_hydrop_5,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,/lustre1/project/stg_00090/scatac_benchmark/1_...,VIB_hydrop_5.FULL,hydrop_3x96


In [80]:
# Put the columns in the order they should appear in the metadata file;
# a missing column raises KeyError.
samples_df_merged = samples_df_merged.loc[
    :,
    [
        "sample_name",
        "technology",
        "fastq_PE1_path",
        "fastq_barcode_path",
        "fastq_PE2_path",
    ],
]

In [81]:
# Write the table as tab-separated text without the index column.
samples_df_merged.to_csv(
    path_or_buf="metadata.tsv",
    index=False,
    sep="\t",
)

In [82]:
# Print the written file to check its contents (IPython shell escape).
!cat metadata.tsv

sample_name	technology	fastq_PE1_path	fastq_barcode_path	fastq_PE2_path
TXG_10xmultiome_e18mousebrainfresh.FULL	multiome	/lustre1/project/stg_00090/scatac_benchmark/1_data_repository/publicdata_full_fastq/all_merged/TXG_10xmultiome_e18mousebrainfresh__R1.FULL.fastq.gz	/lustre1/project/stg_00090/scatac_benchmark/1_data_repository/publicdata_full_fastq/all_merged/TXG_10xmultiome_e18mousebrainfresh__R2.FULL.fastq.gz	/lustre1/project/stg_00090/scatac_benchmark/1_data_repository/publicdata_full_fastq/all_merged/TXG_10xmultiome_e18mousebrainfresh__R3.FULL.fastq.gz
TXG_10xv11_adultmousecortexchromiumx.FULL	atac_revcomp	/lustre1/project/stg_00090/scatac_benchmark/1_data_repository/publicdata_full_fastq/all_merged/TXG_10xv11_adultmousecortexchromiumx__R1.FULL.fastq.gz	/lustre1/project/stg_00090/scatac_benchmark/1_data_repository/publicdata_full_fastq/all_merged/TXG_10xv11_adultmousecortexchromiumx__R2.FULL.fastq.gz	/lustre1/project/stg_00090/scatac_benchmark/1_data_repository/publicdata_full_fa