Write the sample metadata file (`metadata.tsv`) for the VSN scATAC-seq preprocessing pipeline (https://vsn-pipelines.readthedocs.io/en/latest/scatac-seq.html)

In [1]:
import os
import glob
import pandas as pd

In [2]:
# Collect every entry in the FASTQ repository, in lexicographic order.
fastq_glob = '../1_data_repository/full_fastq/*'
filepaths = glob.glob(fastq_glob)
filepaths.sort()

In [3]:
# Strip the directory part of each path. os.path.basename is used instead of
# splitting on '/' so the result is also correct when glob returns paths with
# the platform's native separator.
filenames = [os.path.basename(path) for path in filepaths]

In [4]:
# A sample name is everything before the first '__' in a file name
# (e.g. '<sample>__R1.FULL.fastq.gz'); de-duplicate and sort.
samplenames = sorted({name.split('__')[0] for name in filenames})

In [5]:
# One row per sample; the sample name is kept as a column and also used as the index.
sample_frame = pd.DataFrame(samplenames, columns=['sample_name'])
metadata = sample_frame.set_index('sample_name', drop=False)

In [6]:
# Template mapping: every discovered sample gets the same placeholder
# technology, to be corrected by hand in the curated dict.
tech_dict_template = dict.fromkeys(metadata['sample_name'], 'hydrop_2x384')
tech_dict_template

{'BIO_ddseq_1': 'hydrop_2x384',
 'BIO_ddseq_2': 'hydrop_2x384',
 'BIO_ddseq_3': 'hydrop_2x384',
 'BIO_ddseq_4': 'hydrop_2x384',
 'BRO_mtscatac_1': 'hydrop_2x384',
 'BRO_mtscatac_2': 'hydrop_2x384',
 'CNA_10xmultiome_1': 'hydrop_2x384',
 'CNA_10xmultiome_2': 'hydrop_2x384',
 'CNA_10xv11_1': 'hydrop_2x384',
 'CNA_10xv11_2': 'hydrop_2x384',
 'CNA_10xv11_3': 'hydrop_2x384',
 'CNA_10xv11_4': 'hydrop_2x384',
 'CNA_10xv11_5': 'hydrop_2x384',
 'CNA_10xv2_1': 'hydrop_2x384',
 'CNA_10xv2_2': 'hydrop_2x384',
 'CNA_hydrop_1': 'hydrop_2x384',
 'CNA_hydrop_2': 'hydrop_2x384',
 'CNA_hydrop_3': 'hydrop_2x384',
 'CNA_mtscatac_1': 'hydrop_2x384',
 'CNA_mtscatac_2': 'hydrop_2x384',
 'EPF_hydrop_1': 'hydrop_2x384',
 'EPF_hydrop_2': 'hydrop_2x384',
 'EPF_hydrop_3': 'hydrop_2x384',
 'EPF_hydrop_4': 'hydrop_2x384',
 'HAR_ddseq_1': 'hydrop_2x384',
 'HAR_ddseq_2': 'hydrop_2x384',
 'MDC_mtscatac_1': 'hydrop_2x384',
 'MDC_mtscatac_2': 'hydrop_2x384',
 'OHS_s3atac_1': 'hydrop_2x384',
 'OHS_s3atac_2': 'hydrop_2x38

Copy the dict printed above and replace the placeholder value ("hydrop_2x384") with each sample's actual technology

In [16]:
# Hand-curated mapping: sample name -> value written to the `technology`
# column of metadata.tsv. Samples whose value is 'biorad' are written with an
# empty barcode-FASTQ column; every other value gets R1/R2/R3 paths.
# Insertion order matters when the whole dict is used for writing: rows are
# emitted in dict iteration order.
tech_dict = {
    'BIO_ddseq_1': 'biorad',
    'BIO_ddseq_2': 'biorad',
    'BIO_ddseq_3': 'biorad',
    'BIO_ddseq_4': 'biorad',
    'BRO_mtscatac_1': 'atac_revcomp',
    'BRO_mtscatac_2': 'atac_revcomp',
    'CNA_10xmultiome_1': 'multiome_revcomp',
    'CNA_10xmultiome_2': 'multiome_revcomp',
    'CNA_10xv11_1': 'atac',
    'CNA_10xv11_2': 'atac',
    'CNA_10xv11_3': 'atac_revcomp',
    'CNA_10xv11_4': 'atac_revcomp',
    'CNA_10xv11_5': 'atac_revcomp',
    'CNA_10xv2_1': 'atac_revcomp',
    'CNA_10xv2_2': 'atac_revcomp',
    'CNA_ddseq_1': 'biorad',
    'CNA_ddseq_2': 'biorad',
    'CNA_hydrop_1': 'hydrop_2x384',
    'CNA_hydrop_2': 'hydrop_2x384',
    'CNA_hydrop_3': 'hydrop_2x384',
    'CNA_mtscatac_1': 'atac_revcomp',
    'CNA_mtscatac_2': 'atac_revcomp',
    'EPF_hydrop_1': 'hydrop_2x384',
    'EPF_hydrop_2': 'hydrop_2x384',
    'EPF_hydrop_3': 'hydrop_2x384',
    'EPF_hydrop_4': 'hydrop_2x384',
    'HAR_ddseq_1': 'biorad',
    'HAR_ddseq_2': 'biorad',
    'MDC_mtscatac_1': 'atac_revcomp',
    'MDC_mtscatac_2': 'atac_revcomp',
    'SAN_10xmultiome_1': 'multiome',
    'SAN_10xmultiome_2': 'multiome',
    'STA_10xv11_1': 'atac_revcomp',
    'STA_10xv11_2': 'atac_revcomp',
    'TXG_10xv11_1': 'atac_revcomp',
    'TXG_10xv2_1': 'atac_revcomp',
    'TXG_10xv2_2': 'atac_revcomp',
    'UCS_ddseq_1': 'biorad',
    'UCS_ddseq_2': 'biorad',
    'VIB_10xmultiome_1': 'multiome_revcomp',
    'VIB_10xmultiome_2': 'multiome_revcomp',
    'VIB_10xv11_1': 'atac',
    'VIB_10xv11_2': 'atac',
    'VIB_10xv1_1': 'atac',
    'VIB_10xv1_2': 'atac',
    'VIB_10xv2_1': 'atac_revcomp',
    'VIB_10xv2_2': 'atac_revcomp',
    'VIB_hydrop_11': 'hydrop_2x384',
    'VIB_hydrop_12': 'hydrop_2x384',
    'VIB_hydrop_21': 'hydrop_2x384',
    'VIB_hydrop_22': 'hydrop_2x384',
    'VIB_mtscatac_1': 'atac_revcomp',
    'VIB_mtscatac_2': 'atac_revcomp',
    # NOTE(review): for the two s3-ATAC samples the technology value is the
    # sample's own name rather than a shared label — presumably a per-sample
    # setting on the pipeline side; confirm this is intended.
    'OHS_s3atac_1': 'OHS_s3atac_1',
    'OHS_s3atac_2': 'OHS_s3atac_2'
}

In [17]:
# Echo the curated mapping so it can be checked by eye.
tech_dict

{'BIO_ddseq_1': 'biorad',
 'BIO_ddseq_2': 'biorad',
 'BIO_ddseq_3': 'biorad',
 'BIO_ddseq_4': 'biorad',
 'BRO_mtscatac_1': 'atac_revcomp',
 'BRO_mtscatac_2': 'atac_revcomp',
 'CNA_10xmultiome_1': 'multiome_revcomp',
 'CNA_10xmultiome_2': 'multiome_revcomp',
 'CNA_10xv11_1': 'atac',
 'CNA_10xv11_2': 'atac',
 'CNA_10xv11_3': 'atac_revcomp',
 'CNA_10xv11_4': 'atac_revcomp',
 'CNA_10xv11_5': 'atac_revcomp',
 'CNA_10xv2_1': 'atac_revcomp',
 'CNA_10xv2_2': 'atac_revcomp',
 'CNA_ddseq_1': 'biorad',
 'CNA_ddseq_2': 'biorad',
 'CNA_hydrop_1': 'hydrop_2x384',
 'CNA_hydrop_2': 'hydrop_2x384',
 'CNA_hydrop_3': 'hydrop_2x384',
 'CNA_mtscatac_1': 'atac_revcomp',
 'CNA_mtscatac_2': 'atac_revcomp',
 'EPF_hydrop_1': 'hydrop_2x384',
 'EPF_hydrop_2': 'hydrop_2x384',
 'EPF_hydrop_3': 'hydrop_2x384',
 'EPF_hydrop_4': 'hydrop_2x384',
 'HAR_ddseq_1': 'biorad',
 'HAR_ddseq_2': 'biorad',
 'MDC_mtscatac_1': 'atac_revcomp',
 'MDC_mtscatac_2': 'atac_revcomp',
 'SAN_10xmultiome_1': 'multiome',
 'SAN_10xmultiome_2'

In [21]:
# Samples whose metadata rows should be (re)generated on their own,
# kept in sorted order.
stragglers_list = ['UCS_ddseq_1', 'UCS_ddseq_2']
stragglers_list.sort()

In [22]:
# Restrict the curated mapping to the straggler samples, keeping their order.
# A straggler missing from `tech_dict` raises KeyError.
tech_dict_sub = {name: tech_dict[name] for name in stragglers_list}

In [23]:
# Cross-check the curated mapping against the samples discovered on disk.
curated_samples = set(tech_dict.keys())
discovered_samples = set(tech_dict_template.keys())
# First line: curated samples with no discovered files.
print(curated_samples - discovered_samples)
# Second line: discovered samples that have no curated technology yet.
print(discovered_samples - curated_samples)

{'VIB_10xv11_1', 'VIB_mtscatac_2', 'CNA_ddseq_2', 'CNA_ddseq_1', 'VIB_10xv11_2', 'VIB_mtscatac_1'}
set()


# Switch from the straggler subset to the full sample set: every sample in
# `tech_dict` will get a row in metadata.tsv. Skip this cell to regenerate the
# file for the stragglers only.
# A shallow copy is taken so that later edits to `tech_dict_sub` cannot
# silently modify the curated `tech_dict` (plain assignment would alias it).
# NOTE(review): this cell has no execution count and the saved metadata.tsv
# output lists only the UCS stragglers, so it apparently was not run for that
# output — confirm which of the two modes is intended.
tech_dict_sub = dict(tech_dict)

In [24]:
# Absolute location of the FASTQ repository; every path written to the
# metadata file is built from it.
fastq_repo_path = '/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/1_data_repository/full_fastq'

# Write one tab-separated row per sample in `tech_dict_sub`, in dict order.
# 'biorad' samples leave the barcode column empty and use R2 as PE2; all other
# technologies list R1 / R2 / R3 as PE1 / barcode / PE2.
with open('metadata.tsv', 'w') as tsv:
    tsv.write('sample_name\ttechnology\tfastq_PE1_path\tfastq_barcode_path\tfastq_PE2_path\n')
    for sample in tech_dict_sub:
        technology = tech_dict[sample]
        if technology == 'biorad':
            row = f'{sample}\t{technology}\t{fastq_repo_path}/{sample}__R1.FULL.fastq.gz\t\t{fastq_repo_path}/{sample}__R2.FULL.fastq.gz\n'
        else:
            row = f'{sample}\t{technology}\t{fastq_repo_path}/{sample}__R1.FULL.fastq.gz\t{fastq_repo_path}/{sample}__R2.FULL.fastq.gz\t{fastq_repo_path}/{sample}__R3.FULL.fastq.gz\n'
        tsv.write(row)

In [25]:
# Print the generated file to check its contents.
!cat metadata.tsv

sample_name	technology	fastq_PE1_path	fastq_barcode_path	fastq_PE2_path
UCS_ddseq_1	biorad	/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/1_data_repository/full_fastq/UCS_ddseq_1__R1.FULL.fastq.gz		/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/1_data_repository/full_fastq/UCS_ddseq_1__R2.FULL.fastq.gz
UCS_ddseq_2	biorad	/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/1_data_repository/full_fastq/UCS_ddseq_2__R1.FULL.fastq.gz		/dodrio/scratch/projects/starting_2022_023/benchmark/scatac_benchmark/1_data_repository/full_fastq/UCS_ddseq_2__R2.FULL.fastq.gz
