In [1]:
import polars as pl

# Display settings: show long strings and wide tables untruncated, up to 36 rows.
pl.Config.set_fmt_str_lengths(1000)
pl.Config.set_tbl_width_chars(1000)
pl.Config.set_tbl_rows(36)

# Load the tab-separated raw-file metadata export and reduce it to one row per
# read pair:
#   * drop_nulls() discards every row that has a null in any column,
#   * only rows whose "Paired End" value is 1 are kept,
#   * the accession columns are renamed so both mates are explicit
#     (presumably "Related File" holds the accession of the end-2 mate — confirm
#     against the metadata export).
df = (
    pl.read_csv(
        "4DNESN49VY8X_raw_files_2024-08-01-16h-46m.tsv",
        separator="\t",
        skip_rows=0,
    )
    .drop_nulls()
    .filter(pl.col("Paired End") == 1)
    .rename(
        {
            "File Accession": "File Accession End 1",
            "Related File": "File Accession End 2",
        }
    )
)

In [4]:
import subprocess
from pathlib import Path

# Rename the downsampled FASTQ files to descriptive names and build the sample sheet.
#
# For every row of `df` this cell:
#   1. derives a condition label and a biological replicate number from the
#      "Condition" and "Replicate Info" columns,
#   2. numbers technical replicates in row order within each (condition, biorep),
#   3. renames downsampled/<accession>.fastq.gz to
#      downsampled/<condition>_BR<biorep>_TR<techrep>_R<end>.fastq.gz when the file
#      still exists under its accession name (so re-running the cell is harmless),
#   4. records one sample-sheet row.
# The finished sheet is written to samples.tsv and displayed.

techreps = {}  # (condition, biorep) -> technical replicates seen so far
sample_columns = {
    "condition": [],
    "biorep": [],
    "techrep": [],
    "fastq1": [],
    "fastq2": [],
    "assembly": [],
    "aligner": [],
    "index_dir": [],
    "index_prefix": [],
    "enzymes": [],
}

# Files are renamed relative to the current working directory, while the sample
# sheet records paths under `base_dir`.
# NOTE(review): presumably the notebook runs inside vignettes/systematic and the
# sheet is consumed from the repository root — confirm.
rename_dir = Path("downsampled")
base_dir = "vignettes/systematic/downsampled"

for row in df.iter_rows(named=True):
    # Condition label: drop the last two whitespace-separated tokens, join the
    # remaining ones with underscores and strip commas.
    condition = "_".join(row["Condition"].split()[:-2]).replace(",", "")
    # Biological replicate: the second whitespace-separated token of
    # "Replicate Info" once commas are removed.
    biorep = int(row["Replicate Info"].replace(",", "").split()[1])

    key = (condition, biorep)
    techrep = techreps.get(key, 0) + 1
    techreps[key] = techrep

    stem = f"{condition}_BR{biorep}_TR{techrep}"

    # Rename both mates with pathlib instead of a shell `mv`: no quoting problems
    # with metadata-derived names, and a failed rename raises OSError instead of
    # being silently ignored. Path.replace overwrites an existing target, as `mv` did.
    mates = (row["File Accession End 1"], row["File Accession End 2"])
    for end, accession in enumerate(mates, start=1):
        old_path = rename_dir / f"{accession}.fastq.gz"
        new_path = rename_dir / f"{stem}_R{end}.fastq.gz"
        if old_path.exists():
            old_path.replace(new_path)

    # "A and B" -> "A B": enzymes are stored as a space-separated list.
    enzymes = row["Assay Details"].replace(" and ", " ")

    sample_columns["condition"].append(condition)
    sample_columns["biorep"].append(biorep)
    sample_columns["techrep"].append(techrep)
    sample_columns["fastq1"].append(f"{base_dir}/{stem}_R1.fastq.gz")
    sample_columns["fastq2"].append(f"{base_dir}/{stem}_R2.fastq.gz")
    sample_columns["assembly"].append("hg38")
    sample_columns["aligner"].append("bwa")
    sample_columns["index_dir"].append("resources/bwa")
    sample_columns["index_prefix"].append("GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta")
    sample_columns["enzymes"].append(enzymes)

samples = pl.DataFrame(sample_columns)
samples.write_csv("samples.tsv", separator="\t")
samples

condition,biorep,techrep,fastq1,fastq2,assembly,aligner,index_dir,index_prefix,enzymes
str,i64,i64,str,str,str,str,str,str,str
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",1,1,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR1_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR1_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",1,2,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR2_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR2_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",1,3,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR3_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR3_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",1,4,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR4_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR4_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",1,5,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR5_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR5_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",1,6,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR6_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR6_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",1,7,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR7_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR7_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",1,8,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR8_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR8_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",1,9,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR9_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR1_TR9_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
"""Formaldehyde+DSG_DdeI_and_DpnII_HFFc6""",2,1,"""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR2_TR1_R1.fastq.gz""","""vignettes/systematic/downsampled/Formaldehyde+DSG_DdeI_and_DpnII_HFFc6_BR2_TR1_R2.fastq.gz""","""hg38""","""bwa""","""resources/bwa""","""GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta""","""DdeI DpnII"""
