In [35]:
import os
import glob
import sys

import pandas as pd
from itertools import chain

# Validation Functions

In [36]:
from functools import wraps

def validate_checklist_values(func):
    @wraps(func)
    def wrapper(checklist, *args, **kwargs):
        # Ensure the first argument is a dictionary
        if isinstance(checklist, dict) and all(value in (True, None) for value in checklist.values()):
            func(checklist, *args, **kwargs)
    return wrapper

In [37]:
def check_if_dataset(basedir, dataset):
    dataset_path = os.path.join(basedir, dataset)
    is_dataset_name = lambda dataset_name: ("GSE" in dataset_name) or ('E-MTAB' in dataset_name) or ('PRJNA' in dataset_name) or ('SDY' in dataset_name)
    return is_dataset_name(dataset) and os.path.isdir(dataset_path)

## Check if metafiles exist

In [38]:
def make_full_path(basedir, dataset, filename):
    return os.path.join(basedir, dataset, filename)

def exist_nonempty(path, type='file'):
    if type == 'file':
        return os.path.isfile(path) and os.path.getsize(path) > 0
    elif type == 'dir':
        return os.path.isdir(path) and os.listdir(path)
    else:
        raise ValueError

@validate_checklist_values
def check_metafiles_exist(checklist, basedir, datasetname, metafile_suffixes):
    # draft output dict 
    checklist.update({"meta_exist": True, "meta_lost": None})
    # make paths from suffixes
    meta_paths = [make_full_path(basedir, datasetname, f'{datasetname}.{meta_suffix}') for meta_suffix in metafile_suffixes]
    # find lost files
    lost_files = [file for file in meta_paths if not exist_nonempty(file, type='file')]
    if lost_files:
        checklist['meta_exist'] = False
        checklist['meta_lost'] = ','.join(lost_files)


def check_db_meta_exist(checklist, basedir, datasetname, db_meta_suffixes):
    # make paths from suffixes
    db_meta_paths = [make_full_path(basedir, datasetname, f'{datasetname}{meta_suffix}') for meta_suffix in db_meta_suffixes]
    # check if exist
    exist_list = [exist_nonempty(file, type='file') for file in db_meta_paths]
    checklist['db_meta_exist'] = any(exist_list)

## Check if metafiles contain all samples

In [39]:
def read_sample_x_run(filepath):
    process_line = lambda line: line.rstrip().split('\t', 1)
    with open(filepath, 'r') as file:
        splited_lines = list(map(process_line, file.readlines()))
        sample_to_run = {line[0]:(line[1].split(',') if len(line) == 2 else None) for line in splited_lines}
    return sample_to_run


def get_first_column(filepath):
    first_col = lambda line: line.rstrip().split('\t', 1)[0]
    with open(filepath, 'r') as file:
        first_column_list = [first_col(line) for line in file.readlines()]
    return first_column_list


def validate_file(basedir, dataset, filename, correct_list):
    filepath = os.path.join(basedir, dataset, filename)
    list_from_file = get_first_column(filepath)
    return set(list_from_file) == set(correct_list)

def check_sample_x_run_file(checklist, filepath):
    # read sample x run file
    sample_to_run = read_sample_x_run(filepath)
    # check if all samples have runs
    samples_with_lost_runs = [key for key, value in sample_to_run.items() if value is None]
    checklist['all samples have runs'] = not samples_with_lost_runs
    checklist['missing_runs_smaples'] = ','.join(samples_with_lost_runs) if samples_with_lost_runs else None
    return sample_to_run
    
        
@validate_checklist_values
def check_metafiles(checklist, basedir, dataset, sample_to_run):
    # get sample to run dict
    samples = list(sample_to_run.keys())
    runs = list(chain.from_iterable(sample_to_run.values()))
    
    # validate files
    checklist['all runs in run.list'] = validate_file(basedir, dataset, f'{dataset}.run.list', runs)
    checklist['all samples in sample.list'] = validate_file(basedir, dataset, f'{dataset}.sample.list', samples)
    checklist['all runs in parsed.list'] = validate_file(basedir, dataset, f'{dataset}.parsed.tsv', runs)
    

## Check if all fastqs are there

In [40]:
@validate_checklist_values
def validate_fastqs(checklist, basedir, dataset, sample_to_runs):
    samples = set(sample_to_runs.keys())
    # get fastqdir path
    fastqdir_path = os.path.join(basedir, dataset, 'fastqs')
    # check if dir exists
    if exist_nonempty(fastqdir_path, type='dir'):
        checklist['fastqdir_nonemptyexist'] = True
        lost_samples = samples.difference(os.listdir(fastqdir_path))
        # check if all files are present 
        if not lost_samples:
            checklist['all_fastq_samples'] = True
            # check if all runs are present
            fastqnum_persample = {sample: len(os.listdir(os.path.join(fastqdir_path, sample))) for sample in samples}
            lostrun_samples_list = [sample for sample in samples if fastqnum_persample[sample] / 2 != len(sample_to_runs[sample])]
            if not lostrun_samples_list:
                checklist['all_fastq_runs_exist'] = True
            else:
                # write missing sample with missing runs
                checklist['all_fastq_runs_exist'] = False
                checklist['missing_runs_fastq_samples'] = lostrun_samples_list
        else:
            # write missing samples to checklist
            checklist['all_fastq_samples'] = False
            checklist['missing_fastq_samples'] = ','.join(lost_samples)
            
    else:
        checklist['fastqdir_nonemptyexist'] = False

## Validate starsolo output

In [41]:
def validate_starsolo(checklist, basedir, dataset, sample_to_runs):
    samples = set(sample_to_runs.keys())
    # check if all starsolo sample dirs exists and not empty
    not_ok_dirs = [sample for sample in samples if not exist_nonempty(os.path.join(basedir, dataset, sample), type='dir')]
    if not not_ok_dirs:
        checklist['starsolo_allnonemptyexist'] = True
        # check there is output dir and no tmp dir
        no_output_dirs = [sample for sample in samples if not exist_nonempty(os.path.join(basedir, dataset, sample, 'output'), type='dir')]
        no_final_log_file = [sample for sample in samples if not exist_nonempty(os.path.join(basedir, dataset, sample, 'Log.final.out'), type='file')]
        tmp_exists = [sample for sample in samples if exist_nonempty(os.path.join(basedir, dataset, sample, '_STARtmp'), type='dir')]
        
        checklist['starsolo_existOutput'] = not bool(no_output_dirs)
        checklist['starsolo_emptyOutput_samples'] = ','.join(no_output_dirs) if no_output_dirs else None
        checklist['starsolo_existFinalLog'] = not bool(no_final_log_file)
        checklist['starsolo_noFinalLog_samples'] = ','.join(no_final_log_file) if no_final_log_file else None
        checklist['starsolo_noTmp'] = not bool(tmp_exists)
        checklist['starsolo_existTmp_samples'] = ','.join(tmp_exists) if tmp_exists else None
    else:
        checklist['starsolo_allnonemptyexist'] = False
        checklist['missing_starsolo_samples'] = ','.join(not_ok_dirs)

## Validate starsolo qc file

In [42]:
def validate_solo_qc(checklist, basedir, dataset, sample_to_runs):
    samples = set(sample_to_runs.keys())
    solo_qc_path = os.path.join(basedir, dataset, f'{dataset}.solo_qc.tsv')
    # check if file exists
    checklist['solo_qc_exists'] = exist_nonempty(solo_qc_path, type='file')
    if checklist['solo_qc_exists']:
        # check if file is empty
        with open(solo_qc_path, 'r') as file:
            lines = file.readlines()
            checklist['solo_qc_nonempty'] = bool(len(lines) - 1)
        # check if all samples are in the file
        if checklist['solo_qc_nonempty']:
            samples_in_file = get_first_column(solo_qc_path)[1:]
            lost_samples = samples.difference(samples_in_file)
            checklist['solo_qc_all_samples'] = not bool(lost_samples)
            checklist['missing_solo_qc_samples'] = ','.join(lost_samples) if lost_samples else None
            

In [43]:
def validate_basedir(basedir, checklist_columns, metafile_suffixes, db_metafile_suffixes):
    # get valid dataset names
    datasets = [dataset for dataset in os.listdir(basedir) if check_if_dataset(basedir, dataset)]

    checklist_dict = dict()
    for dataset in datasets:
        checklist = {col:None for col in checklist_columns}

        # check that all metadata files exist
        check_metafiles_exist(checklist, basedir, dataset, metafile_suffixes)

        # read sample_to_run file
        if isinstance(checklist, dict) and all(value in (True, None) for value in checklist.values()):
            sample_x_run_path = os.path.join(basedir, dataset, f'{dataset}.sample_x_run.tsv')
            sample_to_run = check_sample_x_run_file(checklist, sample_x_run_path)

            # check that all metadata is consistent and all directories and files exist
            check_metafiles(checklist, basedir, dataset, sample_to_run)

            # validate directories
            validate_fastqs(checklist, basedir, dataset, sample_to_run)

            # validate starsolo output
            validate_starsolo(checklist, basedir, dataset, sample_to_run)

            # validate solo_qc.tsv
            validate_solo_qc(checklist, basedir, dataset, sample_to_run)
            
            # check that there exists db metadata files
            check_db_meta_exist(checklist, basedir, dataset, db_metafile_suffixes)
            
            checklist_dict[dataset] = checklist
    return pd.DataFrame(checklist_dict).T.sort_index()

# Check all directories

In [44]:
ok_datasets = dict()

In [45]:
metafile_suffixes = ['run.list', 'sample.list', 'sample_x_run.tsv', 'parsed.tsv']
db_metafile_suffixes = ['.idf.txt', '.sdrf.txt', '_family.soft']

informative_columns = [
    'meta_exist',
    'db_meta_exist',
    'all samples have runs',
    'all runs in run.list',
    'all samples in sample.list',
    'all runs in parsed.list',
    'fastqdir_nonemptyexist',
    'all_fastq_samples',
    'all_fastq_runs_exist',
    'starsolo_allnonemptyexist',
    'starsolo_existOutput',
    'starsolo_existFinalLog',
    'starsolo_noTmp',
    'solo_qc_exists',
    'solo_qc_nonempty',
    'solo_qc_all_samples'
]

additional_columns = [
    'meta_lost',
    'missing_runs_fastq_samples',
    'missing_fastq_samples',
    'missing_starsolo_samples',
    'starsolo_emptyOutput_samples',
    'starsolo_noFinalLog_samples',
    'starsolo_existTmp_samples',
    'missing_solo_qc_samples'
]

must_be_true_columns = [
    'meta_exist',
    'missing_runs_smaples',
    'all runs in run.list',
    'all samples in sample.list',
    'all runs in parsed.list',
    'starsolo_allnonemptyexist',
    'starsolo_existOutput',
    'starsolo_existFinalLog',
    'starsolo_noTmp',
    'solo_qc_exists',
    'solo_qc_nonempty',
    'solo_qc_all_samples'
]

In [46]:
checklist_columns = informative_columns + additional_columns

## 0_Current
Let's start with `/lustre/scratch127/cellgen/cellgeni/aljes/reprocess/reprocessing-datasets-project/0_Current`. We need to chech that:
- All files in metadata actually exist
- All metadata files exist: `run.list.tsv`, `sample_x_run.tsv`, `solo_qc.tsv`, `parsed.tsv`
- QC are not empty

In [47]:
basedir = "/lustre/scratch127/cellgen/cellgeni/aljes/reprocess/reprocessing-datasets-project/0_Current"

In [48]:
checklist_df = validate_basedir(basedir, checklist_columns, metafile_suffixes, db_metafile_suffixes)
checklist_df.head()

Unnamed: 0,meta_exist,db_meta_exist,all samples have runs,all runs in run.list,all samples in sample.list,all runs in parsed.list,fastqdir_nonemptyexist,all_fastq_samples,all_fastq_runs_exist,starsolo_allnonemptyexist,...,solo_qc_all_samples,meta_lost,missing_runs_fastq_samples,missing_fastq_samples,missing_starsolo_samples,starsolo_emptyOutput_samples,starsolo_noFinalLog_samples,starsolo_existTmp_samples,missing_solo_qc_samples,missing_runs_smaples
GSE112903,True,True,True,True,True,True,False,,,False,...,,,,,"GSM3090973,GSM3090974",,,,,
GSE114802,True,True,True,True,True,True,True,True,True,True,...,True,,,,,,,,,
GSE115982,True,True,True,True,True,True,True,True,True,True,...,True,,,,,,,,,
GSE116113,True,True,True,True,True,True,True,True,True,True,...,True,,,,,,,,,
GSE117211,True,True,True,True,True,True,True,True,True,True,...,True,,,,,,,,,


In [49]:
checklist_df.to_csv('0_current_validation.csv')

In [50]:
ok_datasets['0_current'] = checklist_df[checklist_df[must_be_true_columns].all(axis=1)].index.tolist()
print(f"ok_datasets={len(ok_datasets['0_current'])}, all_datasets={checklist_df.shape[0]}")

ok_datasets=234, all_datasets=325


In [51]:
((checklist_df[must_be_true_columns].all(axis=1)) & (checklist_df.db_meta_exist)).sum()

np.int64(234)

In [52]:
with open('0_current_validation.txt', 'w') as f:
    for line in ok_datasets['0_current']:
        f.write(f"{line}\n")

## 0_Current/0_SUCCESS

In [53]:
basedir = "/lustre/scratch127/cellgen/cellgeni/aljes/reprocess/reprocessing-datasets-project/0_Current/0_SUCCESS"

In [54]:
checklist_df = validate_basedir(basedir, checklist_columns, metafile_suffixes, db_metafile_suffixes)
checklist_df.head()

Unnamed: 0,meta_exist,db_meta_exist,all samples have runs,all runs in run.list,all samples in sample.list,all runs in parsed.list,fastqdir_nonemptyexist,all_fastq_samples,all_fastq_runs_exist,starsolo_allnonemptyexist,...,solo_qc_all_samples,meta_lost,missing_runs_fastq_samples,missing_fastq_samples,missing_starsolo_samples,starsolo_emptyOutput_samples,starsolo_noFinalLog_samples,starsolo_existTmp_samples,missing_solo_qc_samples,missing_runs_smaples
GSE103544,True,True,True,True,True,True,False,,,True,...,True,,,,,,,,,
GSE103574,True,True,True,True,True,True,False,,,True,...,True,,,,,,,,,
GSE104556,True,True,True,True,True,True,False,,,True,...,True,,,,,,,,,
GSE106543,True,True,True,True,True,True,False,,,True,...,True,,,,,,,,,
GSE108067,True,True,True,True,True,True,False,,,True,...,True,,,,,,,,,


In [55]:
checklist_df.to_csv('0_success_validation.csv')

In [56]:
ok_datasets['0_success'] = checklist_df[checklist_df[must_be_true_columns].all(axis=1)].index.tolist()
print(f"ok_datasets={len(ok_datasets['0_success'])}, all_datasets={checklist_df.shape[0]}")

ok_datasets=858, all_datasets=859


In [57]:
((checklist_df[must_be_true_columns].all(axis=1)) & (checklist_df.db_meta_exist)).sum()

np.int64(858)

In [58]:
with open('0_success_validation.txt', 'w') as f:
    for line in ok_datasets['0_success']:
        f.write(f"{line}\n")

## Main dir

In [59]:
basedir = "/lustre/scratch127/cellgen/cellgeni/aljes/reprocess/reprocessing-datasets-project"

In [60]:
checklist_df = validate_basedir(basedir, checklist_columns, metafile_suffixes, db_metafile_suffixes)
checklist_df.head()

Unnamed: 0,meta_exist,db_meta_exist,all samples have runs,all runs in run.list,all samples in sample.list,all runs in parsed.list,fastqdir_nonemptyexist,all_fastq_samples,all_fastq_runs_exist,starsolo_allnonemptyexist,...,solo_qc_all_samples,meta_lost,missing_runs_fastq_samples,missing_fastq_samples,missing_starsolo_samples,starsolo_emptyOutput_samples,starsolo_noFinalLog_samples,starsolo_existTmp_samples,missing_solo_qc_samples,missing_runs_smaples
E-MTAB-10018,True,True,True,True,True,True,False,,,True,...,True,,,,,,,,,
E-MTAB-10042,True,True,True,True,True,True,False,,,True,...,True,,,,,,,,,
E-MTAB-10060,True,True,True,True,True,True,False,,,True,...,True,,,,,,,,,
E-MTAB-10143,True,True,True,True,True,True,False,,,False,...,False,,,,"ERS11457215,ERS11457212,ERS11457195,ERS1145721...",,,,"ERS11457215,ERS11457210,ERS11457192,ERS1145719...",
E-MTAB-10169,True,True,True,True,True,True,False,,,False,...,False,,,,"ERS5913312,ERS5913318,ERS5913311,ERS5913322,ER...",,,,"ERS5913312,ERS5913318,ERS5913324,ERS5913311,ER...",


In [61]:
checklist_df.to_csv('main_dir_validation.csv')

In [62]:
ok_datasets['main_dir'] = checklist_df[checklist_df[must_be_true_columns].all(axis=1)].index.tolist()
print(f"ok_datasets={len(ok_datasets['main_dir'])}, all_datasets={checklist_df.shape[0]}")

ok_datasets=201, all_datasets=278


In [63]:
checklist_df[((checklist_df[must_be_true_columns].all(axis=1)) & (checklist_df.db_meta_exist != True))]

Unnamed: 0,meta_exist,db_meta_exist,all samples have runs,all runs in run.list,all samples in sample.list,all runs in parsed.list,fastqdir_nonemptyexist,all_fastq_samples,all_fastq_runs_exist,starsolo_allnonemptyexist,...,solo_qc_all_samples,meta_lost,missing_runs_fastq_samples,missing_fastq_samples,missing_starsolo_samples,starsolo_emptyOutput_samples,starsolo_noFinalLog_samples,starsolo_existTmp_samples,missing_solo_qc_samples,missing_runs_smaples
PRJNA826269,True,False,True,True,True,True,False,,,True,...,True,,,,,,,,,


In [72]:
set(ok_datasets['0_current']).intersection(ok_datasets['0_success'])

{'GSE115982'}

In [66]:
ok_datasets.keys()

dict_keys(['0_current', '0_success', 'main_dir'])

In [64]:
with open('main_dir_validation.txt', 'w') as f:
    for line in ok_datasets['main_dir']:
        f.write(f"{line}\n")