# Generate tables for GEO metadata

In [306]:
import os
import glob
import pandas as pd

%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [418]:
# Maps each sample id ("<site>_<tech>_<replicate>") to the short alias that is
# printed next to it in the per-technology listings.
sample_id_short_alias_dict = {
    "BIO_ddseq_1": "ddSEQ Bi1",
    "BIO_ddseq_2": "ddSEQ Bi2",
    "BIO_ddseq_3": "ddSEQ Bi3",
    "BIO_ddseq_4": "ddSEQ Bi4",
    "BRO_mtscatac_1": "mtscATAC Br1",
    "BRO_mtscatac_2": "mtscATAC Br2",
    "CNA_10xmultiome_1": "MO C1",
    "CNA_10xmultiome_2": "MO C2",
    "CNA_10xv11_1": "v1.1 C1",
    "CNA_10xv11_2": "v1.1 C2",
    "CNA_10xv11_3": "v1.1 C3",
    "CNA_10xv11_4": "v1.1 C4",
    "CNA_10xv11_5": "v1.1 C5",
    # Each key is listed exactly once: a repeated key in a dict literal silently
    # overrides the earlier entry.
    "CNA_10xv2_1": "v2 C1",
    "CNA_10xv2_2": "v2 C2",
    # The CNA ddSEQ samples are flagged as unusable; they are dropped from the
    # tables further down.
    "CNA_ddseq_1": "DONOTUSE",
    "CNA_ddseq_2": "DONOTUSE",
    "CNA_hydrop_1": "HyDrop C1",
    "CNA_hydrop_2": "HyDrop C2",
    "CNA_hydrop_3": "HyDrop C3",
    "CNA_mtscatac_1": "mtscATAC C1",
    "CNA_mtscatac_2": "mtscATAC C2",
    "EPF_hydrop_1": "HyDrop E1",
    "EPF_hydrop_2": "HyDrop E2",
    "EPF_hydrop_3": "HyDrop E3",
    "EPF_hydrop_4": "HyDrop E4",
    "HAR_ddseq_1": "ddSEQ H1",
    "HAR_ddseq_2": "ddSEQ H2",
    "MDC_mtscatac_1": "mtscATAC M1",
    "MDC_mtscatac_2": "mtscATAC M2",
    "OHS_s3atac_1": "s3 O1",
    "OHS_s3atac_2": "s3 O2",
    "SAN_10xmultiome_1": "MO Sa1",
    "SAN_10xmultiome_2": "MO Sa2",
    "STA_10xv11_1": "v1.1 St1",
    "STA_10xv11_2": "v1.1 St2",
    "TXG_10xv11_1": "v1.1 T1",
    "TXG_10xv2_1": "v2 T1",
    "TXG_10xv2_2": "v2 T2",
    "UCS_ddseq_1": "ddSEQ U1",
    "UCS_ddseq_2": "ddSEQ U2",
    "VIB_10xmultiome_1": "MO V1",
    "VIB_10xmultiome_2": "MO V2",
    "VIB_10xv1_1": "v1 V1",
    "VIB_10xv1_2": "v1 V2",
    "VIB_10xv2_1": "v2 V1",
    "VIB_10xv2_2": "v2 V2",
    "VIB_hydrop_1": "HyDrop V1",
    "VIB_hydrop_2": "HyDrop V2",
    # NOTE(review): the two-digit ids share the alias of the matching one-digit
    # sample -- presumably sub-runs of the same sample; confirm.
    "VIB_hydrop_11": "HyDrop V1",
    "VIB_hydrop_12": "HyDrop V1",
    "VIB_hydrop_21": "HyDrop V2",
    "VIB_hydrop_22": "HyDrop V2",
}

In [419]:
# FASTQ files are named "<sample>__<read>.<...>.fastq.gz" inside full_fastq/.
filepaths = sorted(glob.glob("full_fastq/*.fastq.gz"))

# The technology is the second "_"-separated token of each file name.
techs = {fastq_path.split("/")[-1].split("_")[1] for fastq_path in filepaths}

# Long table: one row per FASTQ file, holding the bare file name.
df = pd.DataFrame({"path": filepaths})
df["path"] = df["path"].map(lambda fastq_path: fastq_path.split("/")[1])

# Split "<sample>__<read>.<...>" into its sample id and read label.
df["sample"] = df["path"].map(lambda file_name: file_name.split("__")[0])
df["read"] = df["path"].map(
    lambda file_name: file_name.split("__")[1].split(".")[0]
)

# Wide table: one row per sample, one ("path", <read>) column per read.
df_pivot = df.pivot(index="sample", columns="read")

df_pivot["tech"] = [sample_id.split("_")[1] for sample_id in df_pivot.index]

In [420]:
# Exclude the CNA ddSEQ samples (aliased "DONOTUSE") from the FASTQ table.
df_pivot = df_pivot.drop(index=["CNA_ddseq_1", "CNA_ddseq_2"])

In [421]:
# Exclude the VIB mtscATAC samples, which have no entry in
# sample_id_short_alias_dict.
df_pivot = df_pivot.drop(index=["VIB_mtscatac_1", "VIB_mtscatac_2"])

In [422]:
def _basenames_by_sample(pattern):
    """Return the names of the files matching ``pattern``, indexed by sample id.

    Parameters
    ----------
    pattern : str
        Glob pattern, e.g. ``"full_fragments/*.tsv.gz"``.

    Returns
    -------
    pandas.Series
        Bare file names (directory stripped) in sorted path order, named
        ``"path"``. The index is named ``"sample"`` and holds the part of each
        file name before its first ``"."``
        (``"OHS_s3atac_1.FULL.fragments.tsv.gz"`` -> ``"OHS_s3atac_1"``).
    """
    file_names = [os.path.basename(p) for p in sorted(glob.glob(pattern))]
    sample_ids = pd.Index([name.split(".")[0] for name in file_names], name="sample")
    return pd.Series(file_names, index=sample_ids, name="path", dtype=object)


# The full fragments files define the rows of the table. Every other column is
# aligned on the sample id, so a sample lacking a given file gets NaN there.
df = _basenames_by_sample("full_fragments/*.tsv.gz").to_frame(name="path")
df["tbi"] = _basenames_by_sample("full_fragments/*tbi")
df["fragments_fixedcells"] = _basenames_by_sample(
    "fixedcells_fragments/*fragments.tsv.gz"
)
df["tbi_fixedcells"] = _basenames_by_sample("fixedcells_fragments/*tbi")

# Technologies present among the full fragments files (second "_"-separated
# token of each file name).
techs = {file_name.split("_")[1] for file_name in df["path"]}

In [423]:
# Exclude the CNA ddSEQ samples (aliased "DONOTUSE") from the fragments table.
df = df.drop(index=["CNA_ddseq_1", "CNA_ddseq_2"])

# Check individual techs

In [407]:
# Technology to inspect: compared against the second "_"-separated token of the
# sample ids and used to filter every table and checksum listing below.
tech = "s3atac"

In [408]:
# List the sample ids that belong to the selected technology, one per line.
samples_of_tech = df_pivot.loc[df_pivot["tech"] == tech].index
for sample_id in samples_of_tech:
    print(sample_id)

OHS_s3atac_1
OHS_s3atac_2


In [409]:
# Print each sample id with spaces instead of underscores, followed by its
# short alias in parentheses.
for sample_id in df_pivot.loc[df_pivot["tech"] == tech].index:
    spaced_id = sample_id.replace("_", " ")
    short_alias = sample_id_short_alias_dict[sample_id]
    print(f"{spaced_id} ({short_alias})")

OHS s3atac 1 (s3 O1)
OHS s3atac 2 (s3 O2)


In [410]:
# Show the column labels of the pivoted table: (field, read) tuples, because the
# pivot produced MultiIndex columns. Read them from df_pivot, since df_test is
# not defined until the following cell and filtering rows leaves the columns
# unchanged.
list(df_pivot.columns)

[('path', 'R1'), ('path', 'R2'), ('path', 'R3'), ('tech', '')]

In [411]:
# Rows of the pivoted FASTQ table for the selected technology.
df_test = df_pivot.loc[df_pivot["tech"] == tech]

# Display the file-name columns of the three reads side by side.
read_columns = [("path", read_label) for read_label in ("R1", "R2", "R3")]
df_test[read_columns]

Unnamed: 0_level_0,path,path,path
read,R1,R2,R3
sample,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
OHS_s3atac_1,OHS_s3atac_1__R1.FULL.fastq.gz,OHS_s3atac_1__R2.FULL.fastq.gz,OHS_s3atac_1__R3.FULL.fastq.gz
OHS_s3atac_2,OHS_s3atac_2__R1.FULL.fastq.gz,OHS_s3atac_2__R2.FULL.fastq.gz,OHS_s3atac_2__R3.FULL.fastq.gz


# Now the fragments files

In [424]:
# Technology of each sample: the second "_"-separated token of its id
# (only the first two separators need to be split to reach it).
df["tech"] = [sample_id.split("_", 2)[1] for sample_id in df.index]

In [425]:
# Fragments files and their .tbi indexes for the selected technology,
# selected with a single row/column lookup.
df.loc[
    df["tech"] == tech,
    ["path", "tbi", "fragments_fixedcells", "tbi_fixedcells"],
]

Unnamed: 0_level_0,path,tbi,fragments_fixedcells,tbi_fixedcells
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OHS_s3atac_1,OHS_s3atac_1.FULL.fragments.tsv.gz,OHS_s3atac_1.FULL.fragments.tsv.gz.tbi,OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz,OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz.tbi
OHS_s3atac_2,OHS_s3atac_2.FULL.fragments.tsv.gz,OHS_s3atac_2.FULL.fragments.tsv.gz.tbi,OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz,OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz.tbi


In [426]:
# Each line of the checksum file is parsed as "<md5>  <path>" (two-space
# separator). A multi-character separator is only handled by the python parser,
# so request it explicitly rather than relying on the automatic fallback, which
# emits a ParserWarning.
md5sums_df = pd.read_csv(
    "full_fastq/md5sum.txt",
    sep="  ",
    header=None,
    engine="python",
)
md5sums_df.columns = ["md5sum", "path"]

# Index the table by bare file name. The index keeps the name "path" even after
# the column itself is renamed below.
md5sums_df["path"] = [x.split("/")[-1] for x in md5sums_df["path"]]
md5sums_df.index = md5sums_df["path"]

md5sums_df.columns = ["md5sum", "path_original"]

md5sums_df = md5sums_df.sort_values(by="path_original")

# Match on the exact technology token (second "_"-separated field of the file
# name). A plain substring test would make e.g. "10xv1" also select the "10xv11"
# files. Names without an underscore yield NaN here and are simply not selected.
file_techs = md5sums_df.index.str.split("_").str[1]
with pd.option_context(
    "display.max_rows",
    None,
):
    print(md5sums_df.loc[file_techs == tech, "md5sum"])

path
OHS_s3atac_1__R1.FULL.fastq.gz    d2fc826e2621c4b221903abb82ef4890
OHS_s3atac_1__R2.FULL.fastq.gz    3226e16c646774acbde3045640326272
OHS_s3atac_1__R3.FULL.fastq.gz    a78c7807e3340f1735a81b7c500b3f43
OHS_s3atac_2__R1.FULL.fastq.gz    de7cc81883f0751c8cd8ad8b67b8225c
OHS_s3atac_2__R2.FULL.fastq.gz    c7bb005955487aa4f10fa73660e70beb
OHS_s3atac_2__R3.FULL.fastq.gz    c87bfd628b28de1e670032517a9e89d8
Name: md5sum, dtype: object


  md5sums_df = pd.read_csv(


In [427]:
# Print the checksums of the selected technology for both fragments directories,
# full files first. After the loop, md5sums_df holds the fixedcells table.
for md5_file in ("full_fragments/md5sum.txt", "fixedcells_fragments/md5sum.txt"):
    # Each line is parsed as "<md5>  <path>" (two-space separator). A
    # multi-character separator is only handled by the python parser, so request
    # it explicitly rather than relying on the automatic fallback, which emits a
    # ParserWarning.
    md5sums_df = pd.read_csv(
        md5_file,
        sep="  ",
        header=None,
        engine="python",
    )
    md5sums_df.columns = ["md5sum", "path"]

    # Index the table by bare file name. The index keeps the name "path" even
    # after the column itself is renamed below.
    md5sums_df["path"] = [x.split("/")[-1] for x in md5sums_df["path"]]
    md5sums_df.index = md5sums_df["path"]

    md5sums_df.columns = ["md5sum", "path_original"]

    md5sums_df = md5sums_df.sort_values(by="path_original")

    # Match on the exact technology token (second "_"-separated field of the
    # file name). A plain substring test would make e.g. "10xv1" also select the
    # "10xv11" files. Names without an underscore yield NaN and are not selected.
    file_techs = md5sums_df.index.str.split("_").str[1]
    with pd.option_context(
        "display.max_rows",
        None,
    ):
        print(md5sums_df.loc[file_techs == tech, "md5sum"])

path
OHS_s3atac_1.FULL.fragments.tsv.gz        c3ca12f129dd1049caef383634f42311
OHS_s3atac_1.FULL.fragments.tsv.gz.tbi    59d784f0796b23c8c283952b54eb0fcf
OHS_s3atac_2.FULL.fragments.tsv.gz        90d92d65a4c2a1e7b98cd00e76df54a5
OHS_s3atac_2.FULL.fragments.tsv.gz.tbi    6c39dd1550d4543943fc084cab5340aa
Name: md5sum, dtype: object
path
OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz        56517410cbac8ec591532992280641b2
OHS_s3atac_1.FIXEDCELLS.fragments.tsv.gz.tbi    94a6fcf608ced24123bf6b6f82d5f44e
OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz        fa23ef9afd26197186481601727b6623
OHS_s3atac_2.FIXEDCELLS.fragments.tsv.gz.tbi    f1609bf0408a5993234d0a68eabb3de3
Name: md5sum, dtype: object


  md5sums_df = pd.read_csv(
  md5sums_df = pd.read_csv(
