# Writing a metadata file

In [1]:
import os
import glob
import pandas as pd

In [2]:
# Collect every entry matching the glob pattern, in sorted order.
filepaths = glob.glob('../1_data_repository/libds_fastq/*')
filepaths.sort()
# Keep only the last '/'-separated component of each path (the file name).
filenames = [path.rsplit('/', 1)[-1] for path in filepaths]
# A sample name is the part of the file name before the first '__';
# de-duplicate via a set and return them sorted.
samplenames = sorted({name.split('__')[0] for name in filenames})

In [3]:
# One-column frame of sample names, indexed by that same column
# (drop=False keeps 'sample_name' available as a regular column too).
metadata = (
    pd.DataFrame(samplenames, columns=['sample_name'])
    .set_index('sample_name', drop=False)
)

In [4]:
# Template dict: every sample name mapped to the same placeholder value,
# displayed so it can be copied and edited by hand.
tech_dict_template = dict.fromkeys(metadata['sample_name'], 'hydrop_2x384')
tech_dict_template

{'BIO_ddseq_1': 'hydrop_2x384',
 'BIO_ddseq_2': 'hydrop_2x384',
 'BIO_ddseq_3': 'hydrop_2x384',
 'BIO_ddseq_4': 'hydrop_2x384',
 'BRO_mtscatac_1': 'hydrop_2x384',
 'BRO_mtscatac_2': 'hydrop_2x384',
 'CNA_10xmultiome_1': 'hydrop_2x384',
 'CNA_10xmultiome_2': 'hydrop_2x384',
 'CNA_10xv11_1': 'hydrop_2x384',
 'CNA_10xv11_2': 'hydrop_2x384',
 'CNA_10xv11_3': 'hydrop_2x384',
 'CNA_10xv11_4': 'hydrop_2x384',
 'CNA_10xv11_5': 'hydrop_2x384',
 'CNA_10xv2_1': 'hydrop_2x384',
 'CNA_10xv2_2': 'hydrop_2x384',
 'CNA_hydrop_1': 'hydrop_2x384',
 'CNA_hydrop_2': 'hydrop_2x384',
 'CNA_hydrop_3': 'hydrop_2x384',
 'CNA_mtscatac_1': 'hydrop_2x384',
 'CNA_mtscatac_2': 'hydrop_2x384',
 'EPF_hydrop_1': 'hydrop_2x384',
 'EPF_hydrop_2': 'hydrop_2x384',
 'EPF_hydrop_3': 'hydrop_2x384',
 'EPF_hydrop_4': 'hydrop_2x384',
 'HAR_ddseq_1': 'hydrop_2x384',
 'HAR_ddseq_2': 'hydrop_2x384',
 'MDC_mtscatac_1': 'hydrop_2x384',
 'MDC_mtscatac_2': 'hydrop_2x384',
 'OHS_s3atac_1': 'hydrop_2x384',
 'OHS_s3atac_2': 'hydrop_2x38

Copy the dict printed above and replace the placeholder value "hydrop_2x384" with each sample's actual technology.

In [5]:
# Hand-edited mapping: sample name -> value written to the `technology`
# column of the metadata TSV by the writer cell further down.
# Samples mapped to 'biorad' get an empty barcode-path column in that cell.
# Insertion order matters: the writer cell iterates this dict (via
# tech_dict_sub), so it determines the row order of the output file.
tech_dict = {
    'BIO_ddseq_1': 'biorad',
    'BIO_ddseq_2': 'biorad',
    'BIO_ddseq_3': 'biorad',
    'BIO_ddseq_4': 'biorad',
    'BRO_mtscatac_1': 'atac_revcomp',
    'BRO_mtscatac_2': 'atac_revcomp',
    'CNA_10xmultiome_1': 'multiome_revcomp',
    'CNA_10xmultiome_2': 'multiome_revcomp',
    'CNA_10xv11_1': 'atac',
    'CNA_10xv11_2': 'atac',
    'CNA_10xv11_3': 'atac_revcomp',
    'CNA_10xv11_4': 'atac_revcomp',
    'CNA_10xv11_5': 'atac_revcomp',
    'CNA_10xv2_1': 'atac_revcomp',
    'CNA_10xv2_2': 'atac_revcomp',
    'CNA_hydrop_1': 'hydrop_2x384',
    'CNA_hydrop_2': 'hydrop_2x384',
    'CNA_hydrop_3': 'hydrop_2x384',
    'CNA_mtscatac_1': 'atac_revcomp',
    'CNA_mtscatac_2': 'atac_revcomp',
    'EPF_hydrop_1': 'hydrop_2x384',
    'EPF_hydrop_2': 'hydrop_2x384',
    'EPF_hydrop_3': 'hydrop_2x384',
    'EPF_hydrop_4': 'hydrop_2x384',
    'HAR_ddseq_1': 'biorad',
    'HAR_ddseq_2': 'biorad',
    'MDC_mtscatac_1': 'atac_revcomp',
    'MDC_mtscatac_2': 'atac_revcomp',
    'SAN_10xmultiome_1': 'multiome',
    'SAN_10xmultiome_2': 'multiome',
    'STA_10xv11_1': 'atac_revcomp',
    'STA_10xv11_2': 'atac_revcomp',
    'TXG_10xv11_1': 'atac_revcomp',
    'TXG_10xv2_1': 'atac_revcomp',
    'TXG_10xv2_2': 'atac_revcomp',
    'UCS_ddseq_1': 'biorad',
    'UCS_ddseq_2': 'biorad',
    'VIB_10xmultiome_1': 'multiome_revcomp',
    'VIB_10xmultiome_2': 'multiome_revcomp',
    'VIB_10xv1_1': 'atac',
    'VIB_10xv1_2': 'atac',
    'VIB_10xv2_1': 'atac_revcomp',
    'VIB_10xv2_2': 'atac_revcomp',
    'VIB_hydrop_11': 'hydrop_2x384',
    'VIB_hydrop_12': 'hydrop_2x384',
    'VIB_hydrop_21': 'hydrop_2x384',
    'VIB_hydrop_22': 'hydrop_2x384',
    # NOTE(review): unlike every other entry, the two OHS_s3atac samples use
    # their own sample name as the technology value, and they are listed last
    # rather than in alphabetical position. Presumably intentional — confirm.
    'OHS_s3atac_1': 'OHS_s3atac_1',
    'OHS_s3atac_2': 'OHS_s3atac_2'
}

In [6]:
# Sanity check: both lines print `set()` when the hand-edited dict and the
# generated template cover exactly the same sample names.
print(set(tech_dict).difference(tech_dict_template))  # only in tech_dict
print(set(tech_dict_template).difference(tech_dict))  # only in the template

set()
set()


In [7]:
# Restrict tech_dict to the samples present in the template, keeping the
# template's key order.
tech_dict_sub = {name: tech_dict[name] for name in tech_dict_template}

In [8]:
# NOTE(review): this rebinds tech_dict_sub to tech_dict itself (the same
# object, not a copy), replacing whatever dict tech_dict_sub referred to
# before this cell ran. Key order is therefore tech_dict's insertion order.
tech_dict_sub = tech_dict

In [9]:
# Display the sample names in iteration order; the writer cell below loops
# over these same keys.
tech_dict_sub.keys()

dict_keys(['BIO_ddseq_1', 'BIO_ddseq_2', 'BIO_ddseq_3', 'BIO_ddseq_4', 'BRO_mtscatac_1', 'BRO_mtscatac_2', 'CNA_10xmultiome_1', 'CNA_10xmultiome_2', 'CNA_10xv11_1', 'CNA_10xv11_2', 'CNA_10xv11_3', 'CNA_10xv11_4', 'CNA_10xv11_5', 'CNA_10xv2_1', 'CNA_10xv2_2', 'CNA_hydrop_1', 'CNA_hydrop_2', 'CNA_hydrop_3', 'CNA_mtscatac_1', 'CNA_mtscatac_2', 'EPF_hydrop_1', 'EPF_hydrop_2', 'EPF_hydrop_3', 'EPF_hydrop_4', 'HAR_ddseq_1', 'HAR_ddseq_2', 'MDC_mtscatac_1', 'MDC_mtscatac_2', 'SAN_10xmultiome_1', 'SAN_10xmultiome_2', 'STA_10xv11_1', 'STA_10xv11_2', 'TXG_10xv11_1', 'TXG_10xv2_1', 'TXG_10xv2_2', 'UCS_ddseq_1', 'UCS_ddseq_2', 'VIB_10xmultiome_1', 'VIB_10xmultiome_2', 'VIB_10xv1_1', 'VIB_10xv1_2', 'VIB_10xv2_1', 'VIB_10xv2_2', 'VIB_hydrop_11', 'VIB_hydrop_12', 'VIB_hydrop_21', 'VIB_hydrop_22', 'OHS_s3atac_1', 'OHS_s3atac_2'])

In [12]:
# Label of the downsample level; it appears in the repository path, the
# output file name, the sample-name suffix and every FASTQ file name.
depth_str = '35k'
fastq_repo_path = f'/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series/{depth_str}/{depth_str}_fastq'

# Write one tab-separated row per sample: name, technology, then the
# PE1 / barcode / PE2 FASTQ paths.
with open(f'metadata.{depth_str}.tsv', 'w') as f:
    f.write('sample_name\ttechnology\tfastq_PE1_path\tfastq_barcode_path\tfastq_PE2_path\n')
    # Read the technology from the same dict that is being iterated, so the
    # rows stay consistent even if tech_dict_sub is a filtered copy of tech_dict.
    for sample, technology in tech_dict_sub.items():
        fastq_prefix = f'{fastq_repo_path}/{sample}__'
        fastq_suffix = f'.{depth_str}.fastq.gz'
        if technology == 'biorad':
            # 'biorad' rows: the barcode column is left empty and the R2 file
            # goes in the PE2 column (presumably no separate barcode read — confirm).
            barcode_path = ''
            pe2_path = f'{fastq_prefix}R2{fastq_suffix}'
        else:
            # All other technologies: R2 is the barcode read, R3 is PE2.
            barcode_path = f'{fastq_prefix}R2{fastq_suffix}'
            pe2_path = f'{fastq_prefix}R3{fastq_suffix}'
        row = [
            f'{sample}.{depth_str}',
            technology,
            f'{fastq_prefix}R1{fastq_suffix}',
            barcode_path,
            pe2_path,
        ]
        f.write('\t'.join(row) + '\n')

In [14]:
!cat metadata.35k.tsv

sample_name	technology	fastq_PE1_path	fastq_barcode_path	fastq_PE2_path
BIO_ddseq_1.35k	biorad	/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series/35k/35k_fastq/BIO_ddseq_1__R1.35k.fastq.gz		/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series/35k/35k_fastq/BIO_ddseq_1__R2.35k.fastq.gz
BIO_ddseq_2.35k	biorad	/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series/35k/35k_fastq/BIO_ddseq_2__R1.35k.fastq.gz		/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series/35k/35k_fastq/BIO_ddseq_2__R2.35k.fastq.gz
BIO_ddseq_3.35k	biorad	/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series/35k/35k_fastq/BIO_ddseq_3__R1.35k.fastq.gz		/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series/35k/35k_fastq/BIO_ddseq_3__R2.35k.fastq.gz
BIO_ddseq_4.35k	biorad	/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series/35k/35k_fastq/BIO_ddseq_4__R1.35k.fastq.gz		/lustre1/project/stg