In [2]:
import pandas as pd
import os
import glob

In [3]:
# Root directory that is searched one level down for `*accessions.tsv` files
# (glob pattern `{basedir}/*/*accessions.tsv`).
basedir = '/lustre/scratch127/cellgen/cellgeni/reprocessing-datasets-project/irods'

In [4]:
# Index every sample found in any `*accessions.tsv` file one directory below
# `basedir`. Each value is [first column, len() of the third column, source
# file path]; a sample that shows up again later overwrites the earlier entry.
# NOTE(review): len(...) here is the character length of the third column, not
# the number of comma-separated items in it -- confirm this is what is wanted.
count_dict = {}
for filepath in glob.glob(f'{basedir}/*/*accessions.tsv'):
    with open(filepath, 'r') as file:
        # strip trailing whitespace (incl. newline) and split into columns
        lines = [raw.rstrip().split('\t') for raw in file]
    # keyed by the second column (sample)
    count = {fields[1]: [fields[0], len(fields[2]), filepath] for fields in lines}
    count_dict.update(count)

In [40]:
# Directory of the dataset to process; its basename is later used as the dataset
# name when building the `.accessions.tsv` / `.solo_qc.tsv` file paths.
source_dir = '/lustre/scratch127/cellgen/cellgeni/reprocessing-datasets-project/irods/E-MTAB-8007'
# NOTE(review): in the visible cells target_dir only has its trailing '/' stripped
# and is never read afterwards -- presumably the archive destination; confirm.
target_dir = '/archive/cellgeni/datasets'

In [53]:
def accessions_row_to_meta(row):
    """Build the metadata dict for one already-split accessions row.

    ``row[2]`` and ``row[3]`` are comma-separated strings; they are split on
    ``','`` and returned as lists under ``'experiment'`` and ``'run'``.
    """
    experiments = row[2].split(',')
    runs = row[3].split(',')
    return {'experiment': experiments, 'run': runs}

def solo_qc_row_to_meta(header, row):
    """Pair column names with one row's values, leaving out the first column.

    The first item of both ``header`` and ``row`` is dropped; the remaining
    items are matched positionally. If the two differ in length, the surplus
    items of the longer one are ignored.
    """
    columns, values = header[1:], row[1:]
    return {column: value for column, value in zip(columns, values)}


def get_accessions_meta(accessions_file, sep='\t'):
    """Read an accessions table into a ``{sample: metadata}`` dict.

    Each non-blank line is stripped of trailing whitespace and split on
    ``sep``. The second field is used as the sample key and the value is
    whatever ``accessions_row_to_meta`` returns for the split row. No header
    line is skipped. If a sample occurs more than once, the last row wins.

    Blank (or whitespace-only) lines are ignored; previously they were split
    into ``['']`` and raised ``IndexError`` when the sample field was read.

    Args:
        accessions_file: path of the table to read.
        sep: field separator (tab by default).

    Returns:
        dict mapping sample name to its metadata dict.
    """
    samples = {}
    with open(accessions_file, 'r') as file:
        # iterate lazily instead of materialising the file with readlines()
        for raw_line in file:
            stripped = raw_line.rstrip()
            if not stripped:
                # nothing to parse on an empty line
                continue
            row = stripped.split(sep)
            samples[row[1]] = accessions_row_to_meta(row)
    return samples


def get_solo_qc_meta(solo_qc_file, sep='\t'):
    """Read a solo QC table into a ``{sample: metadata}`` dict.

    The first line is the header. Every following non-blank line is stripped
    of trailing whitespace and split on ``sep``; its first field is the sample
    key and the value is whatever ``solo_qc_row_to_meta`` returns for the
    header and the split row. If a sample occurs more than once, the last row
    wins.

    Blank (or whitespace-only) lines are ignored; previously they produced a
    spurious entry under the empty-string key with empty metadata.

    Args:
        solo_qc_file: path of the table to read.
        sep: field separator (tab by default).

    Returns:
        dict mapping sample name to its metadata dict.
    """
    meta = {}
    with open(solo_qc_file, 'r') as file:
        header = file.readline().rstrip().split(sep)
        # the remaining lines are data rows; iterate lazily
        for raw_line in file:
            stripped = raw_line.rstrip()
            if not stripped:
                # an empty line carries no sample
                continue
            row = stripped.split(sep)
            meta[row[0]] = solo_qc_row_to_meta(header, row)
    return meta


def write_meta(meta, output_dir, target_keys, key_convert):
    """Write one ``<sample>.tsv`` key/value file per sample into ``output_dir``.

    For every sample only the keys listed in ``target_keys`` are kept. Each
    kept key is renamed through ``key_convert`` (keys absent from it keep
    their name) and then lower-cased. Every value becomes one
    ``key<TAB>value`` line; a list value becomes one line per element.

    Args:
        meta: iterable of per-sample dicts; each needs a ``'sample'`` entry,
            which provides the file name.
        output_dir: directory that receives the files. It is created when it
            does not exist yet; an empty string means the current directory.
        target_keys: container of the (original, un-renamed) keys to keep.
        key_convert: mapping from original key to output key name.

    Raises:
        KeyError: if a sample dict has no ``'sample'`` entry.
    """
    # Create the destination up front so the function does not rely on the
    # caller having done it. os.makedirs('') would fail, hence the guard.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for sample_meta in meta:
        # the sample name determines the output file name
        sample = sample_meta['sample']
        # drop unwanted keys, rename where requested, lower-case the result
        filtered_meta = {
            key_convert.get(key, key).lower(): value
            for key, value in sample_meta.items()
            if key in target_keys
        }
        # one line per value; list values are expanded into several lines
        lines = []
        for key, values in filtered_meta.items():
            for val in (values if isinstance(values, list) else [values]):
                lines.append(f'{key}\t{val}\n')
        filepath = os.path.join(output_dir, f'{sample}.tsv')
        with open(filepath, 'w') as file:
            file.writelines(lines)

In [54]:
import csv

# Keys (by their original names) that are written to the per-sample files.
target_keys = ['sample', 'experiment', 'run', 'Rd_all', 'WL', 'Species', 'Paired', 'Strand']
# Output names for the keys that are renamed on the way out.
key_convert = dict(Rd_all='total_reads', WL='whitelist')

sep = '\t'
output_dir = 'metadata'

# Remove trailing slashes so that basename() yields the dataset directory name.
source_dir = source_dir.rstrip('/')
target_dir = target_dir.rstrip('/')
dataset = os.path.basename(source_dir)

# Both input tables sit in the dataset directory and are named after it.
accessions_file = os.path.join(source_dir, f'{dataset}.accessions.tsv')
solo_qc_file = os.path.join(source_dir, f'{dataset}.solo_qc.tsv')

accessions_meta = get_accessions_meta(accessions_file)
solo_qc_meta = get_solo_qc_meta(solo_qc_file)

# One merged record per sample of the solo QC table. Each of those samples must
# also be present in accessions_meta (KeyError otherwise), and the two tables
# must not share a key name (dict(**a, **b) raises TypeError on duplicates).
meta = [
    dict(sample=sample_name, **accessions_meta[sample_name], **solo_qc_meta[sample_name])
    for sample_name in solo_qc_meta
]

os.makedirs(output_dir, exist_ok=True)
write_meta(meta, output_dir, target_keys, key_convert)