## Digest data sets to simplified json objects for GEO submission

This notebook exports ExperimentSets, together with their Experiments and Biosamples, as JSON files compatible with GEO submission. Important metadata from other item types is recorded in the relevant file (e.g. Biosource info is condensed into the exported Biosample JSON).

* Part 0. Initialize all functions.
* Part 1. List all sets to export. **IMPORTANT**: make sure that the sets do not have **restricted files** (e.g. from HeLa cells).
* Part 2. Generate simplified dictionaries for each ExpSet, Experiment and Biosample.
* Part 3. Save dictionaries as json files.
* Part 4. Patch items with date of export for external submission.

In [None]:
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from dcicutils import ff_utils
from functions.notebook_functions import get_key
from functions.geo_minimization import *

# Access key, looked up by name with get_key; passed to every ff_utils call below.
my_auth = get_key('andrea_data')
# Base URL prepended to item @ids when building links in the exported json.
URL = 'https://data.4dnucleome.org'
# Database name recorded together with the date of export.
DB = 'GEO'
# File formats (file_format display_title) of the files collected in Part 1.
FORMATS = ['fastq', 'mcool', 'pairs']

### Part 0

In [None]:
def boildown_protocol(protocol_object):
    '''Condense a protocol (or document) item into a small dictionary.

    If the object passed in has no 'attachment', the item is first retrieved
    with get_metadata using its uuid. The result holds 'protocol_type',
    'description' and 'url' when present, plus a 'download' link built from
    the attachment. Keys follow the order in which they appear in the item.
    '''
    if protocol_object.get('attachment') is None:
        # no attachment in the object at hand: retrieve the item by uuid
        protocol_object = ff_utils.get_metadata(protocol_object['uuid'], key=my_auth)
    copied_as_is = ('protocol_type', 'description', 'url')
    summary = {}
    for field, content in protocol_object.items():
        if field == 'attachment':
            # link to the attached file: portal URL + item @id + attachment href
            summary['download'] = URL + protocol_object['@id'] + content['href']
        elif field in copied_as_is:
            summary[field] = content
    return summary


def boildown_protocols(protocols):
    '''Condense each protocol (or document) of a list with boildown_protocol.

    Returns a list of dictionaries, in the same order as the input.
    '''
    condensed = []
    for item in protocols:
        condensed.append(boildown_protocol(item))
    return condensed


def boildown_experimental_protocol(protocol, protocol_variations):
    '''Put the experimental protocol and its variations in a single list.

    The protocol (if any) comes first, followed by the protocol variations
    (if any). Returns the list condensed by boildown_protocols.
    '''
    combined = [protocol] if protocol else []
    combined += protocol_variations or []
    return boildown_protocols(combined)


def boildown_cell_culture_details(cell_culture_details, biosources):
    '''Collect and condense all cell culture protocols of a Biosample.

    Gets 'SOP_cell_line' from each Biosource and 'protocols_additional' and
    'authentication_protocols' from each BiosampleCellCulture (BCC). Every
    Biosource and BCC is retrieved with get_metadata.

    cell_culture_details: list of BCC objects (each with a 'uuid'), or None
        when the Biosample has no cell culture details.
    biosources: list of Biosource objects (each with an '@id').

    Returns a list of dictionaries, as produced by boildown_protocols:
    SOPs first, then the protocols of each BCC.
    '''
    protocols = []
    for bio in biosources:
        biosource = ff_utils.get_metadata(bio['@id'], key=my_auth)
        if biosource.get('SOP_cell_line'):
            protocols.append(biosource['SOP_cell_line'])
    # the caller passes biosample.get('cell_culture_details'), which is None
    # when the Biosample has no cell culture details
    for cell_culture in cell_culture_details or []:
        bcc = ff_utils.get_metadata(cell_culture['uuid'], key=my_auth)
        protocols.extend(bcc.get('protocols_additional', []))
        protocols.extend(bcc.get('authentication_protocols', []))
    return boildown_protocols(protocols)

In [None]:
# Summarize files. Uses get_metadata, works for raw and processed files.

# Entries of the file quality metric that are copied to the file summary
file_quality_metric_interesting_values = ['Sequence length', 'Total Sequences']

# File properties that are copied to the file summary.
# Currently disabled: 'file_type_detailed' (has also file_format['display_title']),
# 'href' and 'content_md5sum'.
file_interesting_values = [
    'paired_end', 'accession', 'display_title', 'file_type',
    'file_size', 'file_classification',
    # the next three are not embedded in expset - are they necessary?
    'filename', 'instrument', 'public_release',
    'genome_assembly', 'md5sum',
]


def boildown_files(files_list):
    '''Takes list of raw files and produces list of runs.
    Each run is a pair of files (if paired end) or a single file (if single end).

    Every file is first condensed with boildown_file. A file without
    'paired_end' is a run on its own. A file with paired_end '1' opens a run
    and is paired with the file whose accession equals its 'related_files'
    value. Files with any other paired_end value do not open a run (they are
    only added as the mate of a paired_end '1' file).

    Raises ValueError if the mate of a paired_end '1' file is not in the list.
    '''
    files = [boildown_file(f) for f in files_list]
    runs = []
    for a_file in files:
        paired_end = a_file.get('paired_end')
        if paired_end is None:
            runs.append([a_file])
        elif paired_end == '1':
            mate_accession = a_file.get('related_files')
            # the mate is looked up afresh for each run, so that a file left
            # over from a previous iteration can never be paired by mistake
            mate = next(
                (other for other in files if other['accession'] == mate_accession),
                None)
            if mate is None:
                raise ValueError('No paired end mate ({}) found for file {}'.format(
                    mate_accession, a_file.get('accession')))
            runs.append([a_file, mate])
    return runs


def boildown_file(file_object):
    '''Retrieve a file by accession (get_metadata) and condense it to a flat dictionary.

    Copied to the output, in the order of the keys of the retrieved item:
    - properties listed in file_interesting_values, as strings (lists are
      joined with ', ', empty lists are skipped);
    - 'file_format', condensed with boildown_title;
    - the quality metric entries listed in file_quality_metric_interesting_values;
    - 'related_files', condensed with boildown_related_files;
    - 'workflow_run' (link) and 'derived_from' (titles of the input files),
      taken from the first of 'workflow_run_outputs' if there is any.
    '''
    summary = {}
    item = ff_utils.get_metadata(file_object['accession'], key=my_auth)
    for field, content in item.items():
        if field in file_interesting_values:
            if not isinstance(content, list):
                summary[field] = str(content)
            elif content:
                summary[field] = ', '.join(content)
        elif field == 'file_format':
            summary[field] = boildown_title(content)
        elif field == 'quality_metric':
            for metric, measure in content.items():
                if metric in file_quality_metric_interesting_values:
                    summary[metric] = str(measure)
        elif field == 'related_files':
            summary[field] = boildown_related_files(content)
        elif field == 'workflow_run_outputs' and len(content) > 0:
            # file derives only from the first wfr in the list
            first_run = content[0]
            summary['workflow_run'] = URL + first_run['@id']
            input_titles = [
                entry['value']['display_title'] for entry in first_run['input_files']
            ]
            summary['derived_from'] = ", ".join(input_titles)
    return summary


def boildown_processed_files(processed_files_list):
    '''Condense the processed files of the HiC pipeline.

    Only files in 'pairs' or 'mcool' format are kept. Each one is condensed
    with boildown_file and gets a 'data_processing' link to the description
    of the Hi-C processing pipeline. Returns a list of dictionaries.
    '''
    pipeline_page = 'https://data.4dnucleome.org/resources/data-analysis/hi_c-processing-pipeline'
    kept_formats = ('pairs', 'mcool')
    summaries = []
    for processed in processed_files_list:
        if processed['file_format']['display_title'] not in kept_formats:
            continue
        summary = boildown_file(processed)
        summary['data_processing'] = pipeline_page
        summaries.append(summary)
    return summaries

In [None]:
# ExperimentSet properties to export, each with the function that condenses it.
# The properties are exported in the order given here.
expset_function_dispatch = dict(
    accession=same,
    lab=boildown_title,
    contributing_labs=boildown_list_to_titles,
    submitted_by=boildown_title,
    award=boildown_award,
    description=same,
    dataset_label=same,
    condition=same,
    last_modified=boildown_date_modified,
    public_release=same,
    experiments_in_set=boildown_experiments_in_set,
    documents=boildown_protocols,  # get_metadata for each protocol
    number_of_experiments=same,
    replicate_exps=boildown_replicate_exps,
    processed_files=boildown_processed_files,
    # use instead of dbxrefs because it is validated
    external_references=boildown_external_references,
    produced_in_pub=boildown_publication,
)


def simplify_expset(expset_accession, my_auth):
    '''Retrieve an ExperimentSet and return its simplified dictionary.

    The properties to export are those declared in expset_function_dispatch
    and they are added in the same order. A property that is missing or empty
    in the ExperimentSet is added as an empty string, just to have an
    overview of all keys. When a function returns a dictionary, its items are
    added directly to the output instead of being nested under the property.
    If the output has no 'produced_in_pub', the content of a section
    retrieved by uuid is reported in its place.
    '''
    expset_object = ff_utils.get_metadata(expset_accession, my_auth)
    expset_dict = {}
    for key, boildown in expset_function_dispatch.items():
        value = expset_object.get(key)
        if not value:
            expset_dict[key] = ''
            continue
        result = boildown(value)
        if isinstance(result, dict):
            expset_dict.update(result)
        else:
            expset_dict[key] = result
    if not expset_dict.get('produced_in_pub'):
        # no publication in the output: use the content of this section instead
        fallback_section = ff_utils.get_metadata("621e8359-3885-40ce-965d-91894aa7b758", key=my_auth)
        expset_dict['produced_in_pub'] = fallback_section['content']
    return expset_dict

In [None]:
# specific for ExperimentHiC
# Experiment properties that have no entry in experiment_function_dispatch
experiment_ignore = [
    'filesets', 'notes', 'aliases', '@type',
    'badges', 'date_created', 'experiment_relation', 'dbxrefs',
    'references', 'follows_sop', 'library_preparation_date',
    'other_processed_files', 'project_release', 'publications_of_exp',
    'quality_metric_flags', 'reference_files', 'sop_mapping',
    'static_content', 'static_headers', 'status', 'tags',
    'schema_version', '@id', 'uuid', 'principals_allowed',
    '@context', 'actions', 'aggregated-items', 'validation-errors',
    'authentication_docs',
    # combined with biosample quantity
    'biosample_quantity_units',
    'alternate_accessions', 'produced_in_pub', 'experiment_sets',
]

# Experiment properties to export, each with the function that condenses it.
# The properties are exported in the order given here.
experiment_function_dispatch = dict(
    accession=same,
    lab=boildown_title,
    contributing_labs=boildown_list_to_titles,
    submitted_by=boildown_title,
    award=boildown_award,
    display_title=boildown_exp_display_title,
    description=same,
    last_modified=boildown_date_modified,
    public_release=same,
    experiment_type=boildown_title,
    # replicate numbers have no function: simplify_experiment fills them in
    biological_replicate_number='',
    technical_replicate_number='',
    # 'combined' key (enzyme or target) + value
    experiment_categorizer=boildown_exp_categorizer,
    biosample=boildown_title,  # boildown_biosample,
    biosample_quantity=boildown_biosample_quantity,
    experiment_summary=same,
    digestion_enzyme=boildown_title,
    digestion_temperature=same,
    digestion_time=same,
    ligation_temperature=same,
    ligation_time=same,
    ligation_volume=same,
    tagging_method=same,
    average_fragment_size=same,
    biotin_removed=same,
    crosslinking_method=same,
    crosslinking_temperature=same,
    crosslinking_time=same,
    enzyme_lot_number=same,
    fragment_size_range=same,
    fragment_size_selection_method=same,
    fragmentation_method=same,
    library_prep_kit=same,
    pcr_cycles=same,
    # includes protocol_variation (hence no 'protocol_variation' entry here)
    protocol=boildown_experimental_protocol,
    documents=boildown_protocols,  # get_metadata for each protocol
    external_references=boildown_external_references,
    files=boildown_files,  # get_metadata for each file
    # NOTE(review): this entry was annotated "do not want these files",
    # yet it is exported - confirm what is intended
    processed_files=boildown_processed_files,
)

def simplify_experiment(experiment_accession, my_auth, biorep, tecrep):
    '''Retrieve an Experiment and return its simplified dictionary.

    Keys are explicitly declared in the function dispatch dictionary and are
    added in the same order. Keys that are missing or empty in the Experiment
    are added as empty strings, just to have an overview of all keys.

    experiment_accession: identifier passed to get_metadata.
    my_auth: access key passed to get_metadata.
    biorep, tecrep: biological and technical replicate numbers, reported as
        'biological_replicate_number' and 'technical_replicate_number'.

    Exceptions to the plain dispatch: 'biosample_quantity' is combined with
    'biosample_quantity_units'; 'protocol' is combined with
    'protocol_variation' and reported as 'experimental_protocol'.
    '''
    experiment_object = ff_utils.get_metadata(experiment_accession, my_auth)
    experiment_dict = {}
    for key, boildown in experiment_function_dispatch.items():
        # start exceptions
        if key == 'biosample_quantity':
            if experiment_object.get(key) is None:
                # no quantity in this Experiment: leave the field empty, as
                # for any other missing key
                experiment_dict[key] = ''
            else:
                # NOTE(review): units are assumed to be present whenever the
                # quantity is - confirm against the schema
                experiment_dict[key] = boildown(
                    experiment_object[key], experiment_object['biosample_quantity_units'])
        elif key == 'biological_replicate_number':
            experiment_dict[key] = biorep
        elif key == 'technical_replicate_number':
            experiment_dict[key] = tecrep
        elif key == 'protocol':
            experiment_dict['experimental_protocol'] = boildown(
                experiment_object.get(key), experiment_object.get('protocol_variation'))
        # end exceptions

        elif experiment_object.get(key):
            # dictionaries returned here are kept nested under their key
            experiment_dict[key] = boildown(experiment_object[key])
        else:  # add empty fields, just to have an overview of all keys
            experiment_dict[key] = ''

    return experiment_dict

In [None]:
def boildown_individual(individual_embedded):
    '''Retrieve an Individual by @id (get_metadata) and condense it.

    Returns a dictionary with the properties of interest found in the item
    (age, sex, life stage, strain, ethnicity, health status) plus
    'organism_id', obtained with get_organism_id when the item has an
    'organism'. Keys follow the order of the retrieved item.
    '''
    wanted = (
        'age', 'age_units', 'sex', 'life_stage', 'mouse_life_stage',
        'mouse_strain', 'ethnicity', 'health_status',
    )
    individual = ff_utils.get_metadata(individual_embedded['@id'], key=my_auth)
    summary = {}
    for field, content in individual.items():
        if field in wanted:
            summary[field] = content
        elif field == 'organism':
            summary['organism_id'] = get_organism_id(individual)
    return summary


# Biosource properties to export, each with the function that condenses it.
# Not included: 'display_title' (do not need that) and 'SOP_cell_line'
# (already retrieved from Biosample).
biosource_function_dispatch = dict(
    biosource_type=same,
    biosource_vendor=boildown_title,
    individual=boildown_individual,
)


def minimize_biosource(biosources_list):
    '''Condense the list of Biosources of a Biosample.

    With more than one Biosource, only their accessions are returned, as a
    single comma-separated string. Otherwise the first Biosource is condensed
    to a dictionary, following biosource_function_dispatch: a property that is
    missing or empty is added as an empty string, and the items of a
    dictionary result (here only individual) are added directly to the output.
    '''
    if len(biosources_list) > 1:
        return ', '.join(entry['accession'] for entry in biosources_list)
    # most cases have only 1 biosource
    only_biosource = biosources_list[0]
    summary = {}
    for key, boildown in biosource_function_dispatch.items():
        content = only_biosource.get(key)
        if not content:
            # add empty fields, just to have an overview of all keys
            summary[key] = ''
            continue
        outcome = boildown(content)
        if isinstance(outcome, dict):
            summary.update(outcome)
        else:
            summary[key] = outcome
    return summary

In [None]:
# Biosample properties to export, each with the function that condenses it.
# The properties are exported in the order given here.
biosample_function_dispatch = dict(
    accession=same,
    # calcprop (biosource_name), OK also with multiple biosources
    biosource_summary=same,
    # calcprop (biosource_type or mixed biosources if many)
    biosample_type=same,
    # OK also with multiple biosources
    tissue_organ_info=boildown_tissue_organ_info,
    description=same,  # is it needed? not embedded in expset
    modifications_summary=same,  # calcprop
    treatments_summary=same,  # calcprop
    biosource=minimize_biosource,  # dict
    biosample_protocols=boildown_protocols,  # get_metadata for each protocol
    # get_metadata for each protocol and also SOP
    cell_culture_details=boildown_cell_culture_details,
    last_modified=boildown_date_modified,
    documents=boildown_protocols,
    external_references=boildown_external_references,  # dbxrefs. is it needed?
)


def simplify_biosample(biosample_accession, my_auth):
    '''Retrieve a Biosample and return its simplified dictionary.

    The properties to export are those declared in biosample_function_dispatch
    and they are added in the same order. A property that is missing or empty
    in the Biosample is added as an empty string, just to have an overview of
    all keys. When a function returns a dictionary, its items are added
    directly to the output. Exception: 'cell_culture_details' is processed
    together with the biosources and reported as 'cell_culture_protocols'.
    '''
    biosample_object = ff_utils.get_metadata(biosample_accession, my_auth)
    biosample_dict = {}
    for key, boildown in biosample_function_dispatch.items():
        if key == 'cell_culture_details':
            # exception: this function needs also the list of biosources
            biosample_dict['cell_culture_protocols'] = boildown(
                biosample_object.get(key), biosample_object['biosource'])
            continue
        value = biosample_object.get(key)
        if not value:
            biosample_dict[key] = ''
            continue
        result = boildown(value)
        if isinstance(result, dict):
            biosample_dict.update(result)
        else:
            biosample_dict[key] = result

    return biosample_dict

### Part 1

In [None]:
# List ExpSets (or a search query) to export for GEO submission

# Accessions of the sets to export, e.g. ['4DNESACCCCCC', '4DNESACCCCCC']
sets_list = ['4DNESRJ8KV4Q']
# Search query, used only if sets_list is empty. Examples:
# '/search/?type=ExperimentSetReplicate&condition=Enzyme%20DpnII%20-%20in%20situ%20Hi-C%20on%20cells%20cultured%20prior%20to%204DN%20SOP'
# '/search/?type=ExperimentSetReplicate&award.project=4DN&status=pre-release&dataset_label=Micro-C+on+JM8.N4+Cells'
search_url = ''

if sets_list:
    # one get_metadata request for each listed set
    sets_to_submit = [ff_utils.get_metadata(set_id, key=my_auth) for set_id in sets_list]
elif search_url:
    sets_to_submit = ff_utils.search_metadata(search_url, my_auth)
else:
    sets_to_submit = []

print(len(sets_to_submit), 'Experiment Sets to export for GEO submission')

In [None]:
# Get all the items
store = {}  # maps the @id of each collected item to the item object

def _get_if_not_in_store(item):
    '''Retrieve the item by its @id (get_metadata) and keep it in store.

    Nothing is requested if the @id is already in store. Returns None.
    '''
    at_id = item['@id']
    if at_id in store:
        return
    store[at_id] = ff_utils.get_metadata(at_id, key=my_auth)

# Walk through each set and collect in store all the related items.
time1 = time.time()
for expset in sets_to_submit:
    store[expset['@id']] = expset

    # Experiment, Biosample
    for experiment in expset.get('experiments_in_set') or []:
        _get_if_not_in_store(experiment)
        _get_if_not_in_store(experiment['biosample'])

        # BiosampleCellCulture, Protocols
        for cell_culture in experiment['biosample'].get('cell_culture_details') or []:
            _get_if_not_in_store(cell_culture)
            for protocol in (cell_culture.get('protocols_additional', [])
                             + cell_culture.get('authentication_protocols', [])):
                _get_if_not_in_store(protocol)

        # Biosource, Individual, Organism
        for biosource in experiment['biosample']['biosource']:
            _get_if_not_in_store(biosource)
            _get_if_not_in_store(biosource['individual'])
            # insert somewhere here some control on HeLa or other restricted files
            _get_if_not_in_store(biosource['individual']['organism'])
#             if biosource.get('SOP_cell_line'):  # already embedded in biosource
#                 _get_if_not_in_store(protocol)

        # Exp. Protocol + variations
        # An Experiment without protocol contributes nothing to the list.
        # 'protocol_variation' is the key that simplify_experiment reads too.
        experimental_protocols = []
        if experiment.get('protocol'):
            experimental_protocols.append(experiment['protocol'])
        experimental_protocols.extend(experiment.get('protocol_variation') or [])
        for protocol in experimental_protocols:
            _get_if_not_in_store(protocol)

        # Raw and Processed Files (in experiment)
        for file in experiment.get('files', []) + experiment.get('processed_files', []):
            if file['file_format']['display_title'] in FORMATS:
                _get_if_not_in_store(file)

    # Processed Files (in experiment set)
    for file in expset.get('processed_files', []):
        if file['file_format']['display_title'] in FORMATS:
            _get_if_not_in_store(file)

time2 = time.time()
print(round((time2-time1), 1), 'sec for collection')
for item in store:
    print(item)

### Part 2

In [None]:
# Export simplified dictionaries for each ExpSet, Experiment, Biosample

date_dict = {}  # date of export (and database) recorded for each exported item

def get_date_exported(database):
    '''Return the record of a metadata export (to be reported in the external_submissions).

    The record is a dictionary with 'date_exported', the current time in UTC
    in ISO format, and 'database', the name passed in.
    '''
    exported_at = datetime.now(tz=timezone.utc)
    return dict(date_exported=exported_at.isoformat(), database=database)

# Experiment Sets
expset_dicts = []
experiments_to_submit = {}
for expset in sets_to_submit:
    # sets_to_submit holds the item objects obtained in Part 1, not accessions
    accession = expset['accession']
    expset_dictionary = simplify_expset(accession, my_auth)
    expset_dicts.append(expset_dictionary)
    date_dict[accession] = get_date_exported(DB)

    # get Experiments and replicate structure from the exported ExpSet
    for rep in expset_dictionary['replicate_exps']:
        experiments_to_submit[rep['replicate']] = {'bio_rep_no': rep['biological_replicate_number'],
                                                   'tec_rep_no': rep['technical_replicate_number']}

# Experiments
exp_dicts = []
biosamples_to_submit = []
for accession, rep in experiments_to_submit.items():
    exp_dictionary = simplify_experiment(accession, my_auth, rep['bio_rep_no'], rep['tec_rep_no'])
    exp_dicts.append(exp_dictionary)
    date_dict[accession] = get_date_exported(DB)

    # get Biosamples from the exported Experiments
    biosamples_to_submit.append(exp_dictionary['biosample'])

# Biosamples
# Experiments can share a Biosample: export each one once, in order of first appearance
unique_biosamples = list(dict.fromkeys(biosamples_to_submit))
bio_dicts = []
for accession in unique_biosamples:
    bio_dictionary = simplify_biosample(accession, my_auth)
    bio_dicts.append(bio_dictionary)
    date_dict[accession] = get_date_exported(DB)

print('Exp Sets exported:\t', [expset['accession'] for expset in sets_to_submit])
print('Experiments exported:\t', list(experiments_to_submit))
print('Biosamples exported:\t', unique_biosamples)

### Part 3

In [None]:
# save json files

# Folder where one json file is written for each exported item
directory = Path("~/Documents/GEO/submissions").expanduser()
overwrite = False  # overwrites existing files if set to True

for dictionary in (expset_dicts + exp_dicts + bio_dicts):
    # the file is named after the accession of the item
    file_name = dictionary['accession'] + '.json'
    full_path = directory / file_name
    if overwrite or not full_path.exists():
        with full_path.open('w') as fp:
            json.dump(dictionary, fp, indent=4)
        print(file_name, 'file saved')
    else:
        print(file_name, 'already exists in the folder', directory)

### Part 4

In [None]:
# 4. patch export date for each item in external_submission
action = False  # the items are patched only if set to True, otherwise just printed

count = 0
for accession, patch_body in date_dict.items():
    if not action:
        # dry run: show what would be patched
        print(accession, patch_body)
        continue
    res = ff_utils.patch_metadata({"external_submission": patch_body}, accession, key=my_auth)
    if res['status'] != 'success':
        # stop at the first patch that does not succeed
        print(res)
        break
    count += 1
print('patched {} items'.format(count))