## Digest data sets to simplified json objects for GEO submission

This notebook exports ExperimentSets and their related Experiments and Biosamples as JSON files compatible with GEO submission. Embedded objects and complex data structures are simplified in the output, which consists of one JSON file per ExperimentSet, Experiment and Biosample (named after the item's accession).

* Part 0. Initialize all functions.
* Part 1. List all sets to export.
* Part 2. Generate simplified dictionaries for each ExpSet, Experiment and Biosample.
* Part 3. Save dictionaries as json files.
* Part 4. Patch items with date of export for external submission.

**NOTES**

Things to check manually:
* HeLa or other restricted files
* Multiple biosources

**ToDo**

* File provenance
* Add other experiment types
* Better handling of experiment type
* molecule

In [None]:
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from dcicutils import ff_utils
from functions.notebook_functions import get_key
from functions.geo_minimization import *

# Export settings
DB = 'GEO'  # database name recorded with each export date
FORMATS = ['mcool', 'pairs']  # processed file formats that are exported

# Access key used for every portal request in this notebook
my_auth = get_key('andrea_data')

### Part 0 - Initialize all functions

In [None]:
# Cache of fetched items: key is the item identifier (usually its @id),
# value is the metadata returned by ff_utils.get_metadata.
store = {}

def get_item_in_store(item):
    '''Return the metadata of an item, fetching it at most once.

    item can be an identifier (string) or a dictionary with an '@id' key.
    The metadata is requested with ff_utils.get_metadata on the first call
    and served from the store (dict) on the following ones.
    '''
    item_id = item['@id'] if isinstance(item, dict) else item
    try:
        return store[item_id]
    except KeyError:
        # first time this item is requested: fetch it and remember it
        store[item_id] = ff_utils.get_metadata(item_id, key=my_auth)
        return store[item_id]

In [None]:
# Static section whose 'content' is added to ExperimentSets that have no
# 'produced_in_pub' value (see simplify_expset).
DATA_USE_SECTION_UUID = "621e8359-3885-40ce-965d-91894aa7b758"
data_use_static_section = ff_utils.get_metadata(DATA_USE_SECTION_UUID, key=my_auth)
data_use_guidelines = data_use_static_section['content']

In [None]:
# Files (get from store)
def boildown_files(files_list):
    '''Takes list of raw files and produces list of runs.
    Each run is a pair of files (if paired end) or a single file (if single end).

    Every file is fetched from the store and simplified with boildown_file.
    A file without 'paired_end' becomes a single-file run. A file with
    paired_end '1' becomes a two-file run together with its mate, i.e. the
    file in the same list whose 'accession' equals the 'related_files' value
    of read 1. Files with any other 'paired_end' value are only reported as
    part of the run of their read 1.

    Raises ValueError if the mate of a read 1 file is not in files_list.
    '''
    files = [boildown_file(get_item_in_store(f)) for f in files_list]
    runs = []
    for a_file in files:
        paired_end = a_file.get('paired_end')
        if paired_end is None:
            runs.append([a_file])
        elif paired_end == '1':
            # NOTE(review): assumes boildown_file reduces 'related_files' to
            # the accession of the paired file - confirm in geo_minimization.
            mate_accession = a_file['related_files']
            # Look up the mate for THIS file only, so that a missing mate can
            # never be replaced by the mate found for a previous run.
            mate = next(
                (other for other in files if other['accession'] == mate_accession),
                None,
            )
            if mate is None:
                raise ValueError(
                    'Paired file {} not found for {}'.format(
                        mate_accession, a_file['accession']))
            runs.append([a_file, mate])
    return runs


def boildown_processed_files(processed_files_list):
    '''Simplify the processed files whose format is listed in FORMATS.

    Specific for the HiC pipeline: every exported file gets a
    'data_processing' key pointing to the Hi-C processing pipeline page.
    Files in other formats are skipped.
    '''
    pipeline_page = 'https://data.4dnucleome.org/resources/data-analysis/hi_c-processing-pipeline'
    wanted = (
        pf for pf in processed_files_list
        if pf['file_format']['display_title'] in FORMATS
    )
    simplified = []
    for processed_file in wanted:
        entry = boildown_file(get_item_in_store(processed_file))
        entry['data_processing'] = pipeline_page
        simplified.append(entry)
    return simplified

In [None]:
# Experiment Set
# ExperimentSet fields copied to the output without any processing
expset_simple_values = [
    'accession',
    'description',
    'dataset_label',
    'condition',
    'public_release',
    'number_of_experiments',
]

# ExperimentSet fields that are simplified by a helper function.
# simplify_expset calls each helper with the value of the field.
expset_function_dispatch = {
    '@id': boildown_at_id,
    'lab': boildown_title,
    'contributing_labs': boildown_list_to_titles,
    'submitted_by': boildown_title,
    'award': boildown_award,
    # disabled: 'last_modified': boildown_date_modified,
    'experiments_in_set': boildown_experiments_in_set,
    # disabled: 'documents': boildown_protocols,  (get_metadata for each protocol)
    'replicate_exps': boildown_replicate_exps,
    'processed_files': boildown_processed_files,
    # used instead of dbxrefs because it is validated
    'external_references': boildown_external_references,
    # returns also !Series_citation
    'produced_in_pub': boildown_publication,
}


def simplify_expset(expset_object):
    '''Build the simplified dictionary of an ExperimentSet.

    Only the keys declared in expset_simple_values (copied as they are) and
    in expset_function_dispatch (simplified by the matching helper) are
    exported; any other key is ignored. When the result has no
    'produced_in_pub' value, the data use guidelines are added.
    '''
    simplified = {}
    for field, raw_value in expset_object.items():
        if field in expset_simple_values:
            output_value = raw_value
        elif field in expset_function_dispatch:
            output_value = expset_function_dispatch[field](raw_value)
        else:
            continue  # field is not exported
        add_value_to_output_dict(field, output_value, simplified)
    if not simplified.get('produced_in_pub'):
        simplified['data_use_guidelines'] = data_use_guidelines
    return simplified

In [None]:
# Protocols
def boildown_experimental_protocol(experiment_object):
    ''' Return experimental_protocol (list), by combining protocol and protocol variations.

    Takes the whole experiment object. The 'protocol' item and each of the
    protocol variation items are fetched from the store and then simplified
    together with boildown_protocols.

    Returns a dictionary with the single key 'experimental_protocol'.
    '''
    protocols = []
    if experiment_object.get('protocol'):
        protocols.append(get_item_in_store(experiment_object['protocol']))
    # NOTE(review): this function used to read 'protocol_variations', while
    # experiment_function_dispatch uses 'protocol_variation'. Both spellings
    # are accepted here - confirm the field name against the schema.
    variations = (
        experiment_object.get('protocol_variations')
        or experiment_object.get('protocol_variation')
        or []
    )
    # Fetch each variation separately: get_item_in_store takes one item,
    # not a list of items.
    protocols.extend(get_item_in_store(variation) for variation in variations)
    protocols_list = boildown_protocols(protocols)
    return {'experimental_protocol': protocols_list}


def boildown_cell_culture_details(biosample_object):
    ''' Return cell_culture_protocols (list), by combining SOP_cell_line from
    each Biosource and 'protocols_additional' from each item in
    cell_culture_details. 'authentication_protocols' are currently not included.

    Takes the whole biosample object and returns a dictionary with the single
    key 'cell_culture_protocols'.
    '''
    collected = []
    # SOP of the cell line, one per biosource (when present)
    for bsr in biosample_object['biosource']:
        sop = get_item_in_store(bsr).get('SOP_cell_line')
        if sop:
            collected.append(get_item_in_store(sop))
    # additional protocols listed in the cell culture details
    for culture in biosample_object.get('cell_culture_details', []):
        for additional in culture.get('protocols_additional', []):
            collected.append(get_item_in_store(additional))
    return {'cell_culture_protocols': boildown_protocols(collected)}


def boildown_biosample_protocols(biosample_object):
    ''' Fetch every item in 'biosample_protocols' from the store and return
    the result of boildown_protocols on them. A biosample without
    'biosample_protocols' is treated as having none.'''
    fetched = []
    for protocol in biosample_object.get('biosample_protocols', []):
        fetched.append(get_item_in_store(protocol))
    return boildown_protocols(fetched)

In [None]:
# Experiment #### specific for ExperimentHiC
# Experiment fields copied to the output without any processing
experiment_simple_values = [
    'accession',
    'description',
    'public_release',
    'experiment_summary',
    'digestion_temperature',
    'digestion_time',
    'ligation_temperature',
    'ligation_time',
    'ligation_volume',
    'tagging_method',
    'average_fragment_size',
    'biotin_removed',
    'crosslinking_method',
    'crosslinking_temperature',
    'crosslinking_time',
    'enzyme_lot_number',
    'fragment_size_range',
    'fragment_size_selection_method',
    'fragmentation_method',
    'library_prep_kit',
    'pcr_cycles',
]

# Experiment fields that are simplified by a helper function
# (see simplify_experiment for what is passed to each helper).
experiment_function_dispatch = {
    '@id': boildown_at_id,
    'lab': boildown_title,
    'contributing_labs': boildown_list_to_titles,
    'submitted_by': boildown_title,
    'award': boildown_award,
    'display_title': boildown_exp_display_title,
    # disabled: 'last_modified': boildown_date_modified,
    'experiment_type': boildown_title,
    # 'combined' key (enzyme or target) + value
    'experiment_categorizer': boildown_exp_categorizer,
    'biosample': boildown_title,  # alternative: boildown_biosample
    'biosample_quantity': boildown_biosample_quantity,  # includes units
    'digestion_enzyme': boildown_title,
    # includes protocol_variation # !Sample_extract_protocol
    'protocol': boildown_experimental_protocol,
    # includes protocol # !Sample_extract_protocol
    'protocol_variation': boildown_experimental_protocol,
    # disabled: 'documents': boildown_protocols,  (get_metadata for each protocol)
    # disabled: 'external_references': boildown_external_references,
    'files': boildown_files,  # get_metadata for each file
    'processed_files': boildown_processed_files,
}

def simplify_experiment(experiment_object):
    ''' Keys are explicitly declared. Keys not present are ignored.

    Keys in experiment_simple_values are copied as they are. Keys in
    experiment_function_dispatch are simplified by the matching helper,
    which receives the value of the key, except for the keys listed in
    whole_object_keys, whose helpers receive the entire experiment.

    Returns the simplified experiment dictionary.
    '''
    # 'protocol' and 'protocol_variation' both map to
    # boildown_experimental_protocol, which reads its keys from the
    # experiment object, so it must not be called with the bare value.
    whole_object_keys = ('biosample_quantity', 'protocol', 'protocol_variation')
    experiment_dict = {}
    for key, value in experiment_object.items():
        if key in experiment_simple_values:
            add_value_to_output_dict(key, value, experiment_dict)
        elif key in whole_object_keys:  # pass the entire object
            result = experiment_function_dispatch[key](experiment_object)
            add_value_to_output_dict(key, result, experiment_dict)
        elif key in experiment_function_dispatch:  # pass only the value
            result = experiment_function_dispatch[key](value)
            add_value_to_output_dict(key, result, experiment_dict)
    return experiment_dict

In [None]:
# Individual fields copied to the output without any processing
individual_interesting_values = [
    'age',
    'age_units',
    'sex',
    'life_stage',
    'mouse_life_stage',
    'mouse_strain',
    'ethnicity',
    'health_status',
]

def boildown_individual(biosample_object):
    '''Simplify the individual of the first biosource of a biosample.

    The individual is fetched from the store. Keys listed in
    individual_interesting_values are copied, and 'organism' is fetched
    from the store and simplified with boildown_organism.
    '''
    first_biosource = biosample_object['biosource'][0]
    individual = get_item_in_store(first_biosource['individual'])
    summary = {}
    for field, content in individual.items():
        if field in individual_interesting_values:
            add_value_to_output_dict(field, content, summary)
            continue
        if field == 'organism':
            organism = boildown_organism(get_item_in_store(individual['organism']))
            add_value_to_output_dict(field, organism, summary)
    return summary


# Biosource fields that are simplified by a helper function
# (see minimize_biosource for what is passed to each helper).
biosource_function_dispatch = {
    'biosource_vendor': boildown_title,
    'cell_line': boildown_title,
    'individual': boildown_individual,
    # also retrieved from Biosample if cell_culture_details exists
    'SOP_cell_line': boildown_cell_culture_details,
}

def minimize_biosource(biosample_object):
    ''' Simplify the biosource of a biosample.

    With more than one Biosource, a comma separated string of their
    accessions is returned. With one Biosource (most cases), a dictionary is
    returned with the keys declared in biosource_function_dispatch.
    The helpers for 'individual' and 'SOP_cell_line' receive the whole
    biosample, the other helpers receive the value of the key.'''
    biosources = biosample_object['biosource']
    if len(biosources) > 1:
        accessions = [bsr['accession'] for bsr in biosources]
        return ', '.join(accessions)
    # most cases have only 1 biosource
    only_biosource = biosources[0]
    whole_object_keys = ('individual', 'SOP_cell_line')
    minimized = {}
    for field, content in only_biosource.items():
        if field in whole_object_keys:
            argument = biosample_object
        elif field in biosource_function_dispatch:
            argument = content
        else:
            continue  # field is not exported
        outcome = biosource_function_dispatch[field](argument)
        add_value_to_output_dict(field, outcome, minimized)
    return minimized

In [None]:
# Biosample fields copied to the output without any processing
biosample_simple_values = [
    'accession',
    'biosource_summary',
    'biosample_type',
    'description',
    'modifications_summary',
    'treatments_summary',
]

# Biosample fields that are simplified by a helper function
# (see simplify_biosample for what is passed to each helper).
biosample_function_dispatch = {
    '@id': boildown_at_id,
    'tissue_organ_info': boildown_tissue_organ_info,  # OK also with multiple biosources
    'biosource': minimize_biosource,
    'biosample_protocols': boildown_biosample_protocols,
    # returns cell_culture_protocols
    'cell_culture_details': boildown_cell_culture_details,
    # disabled: 'last_modified': boildown_date_modified,
    # disabled: 'documents': boildown_protocols,
    # disabled: 'external_references': boildown_external_references,  (dbxrefs)
}


def simplify_biosample(biosample_object):
    ''' Keys are explicitly declared. Keys not present are ignored.

    Keys in biosample_simple_values are copied as they are. Keys in
    biosample_function_dispatch are simplified by the matching helper,
    which receives the value of the key, except for the keys listed in
    whole_object_keys, whose helpers receive the entire biosample.

    Returns the simplified biosample dictionary.
    '''
    # The helpers for these keys read their fields from the biosample object
    # (e.g. boildown_biosample_protocols calls .get('biosample_protocols')),
    # so they must not be called with the bare value.
    whole_object_keys = ('cell_culture_details', 'biosource', 'biosample_protocols')
    biosample_dict = {}
    for key, value in biosample_object.items():
        if key in biosample_simple_values:
            add_value_to_output_dict(key, value, biosample_dict)
        elif key in whole_object_keys:  # pass the entire object
            result = biosample_function_dispatch[key](biosample_object)
            add_value_to_output_dict(key, result, biosample_dict)
        elif key in biosample_function_dispatch:  # pass only the value
            result = biosample_function_dispatch[key](value)
            add_value_to_output_dict(key, result, biosample_dict)
    return biosample_dict

### Part 1 - List all sets to export

In [None]:
# List ExpSets (or a search query) to export for GEO submission

# Fill in ONE of the two: the list of accessions is used when it is not empty,
# otherwise the search query is used.
sets_list = ['4DNESRJ8KV4Q']  # ['4DNESACCCCCC', '4DNESACCCCCC']
search_url = ''  # '/search/?type=ExperimentSetReplicate&condition=Enzyme%20DpnII%20-%20in%20situ%20Hi-C%20on%20cells%20cultured%20prior%20to%204DN%20SOP'

if sets_list:
    sets_to_submit = [ff_utils.get_metadata(set_id, key=my_auth) for set_id in sets_list]
elif search_url:
    sets_to_submit = ff_utils.search_metadata(search_url, my_auth)
else:
    sets_to_submit = []

print(len(sets_to_submit), 'Experiment Sets to export for GEO submission')

### Part 2 - Generate simplified dictionaries

In [None]:
# Export simplified dictionaries for each ExpSet, Experiment, Biosample

date_dict = {}  # record date of export: key is the item @id, value is the patch body

def _get_date_exported():
    '''get date of metadata export (to be reported in the external_submissions)

    Returns a dictionary with the current UTC time in ISO format
    ('date_exported') and the target database ('database').
    '''
    date = datetime.now(timezone.utc).isoformat()
    return {'date_exported': date, 'database': DB}

# Experiment Sets
es_dicts = []
experiments_to_submit = []
for es in sets_to_submit:
    es_dictionary = simplify_expset(es)
    for ex in es_dictionary['replicate_exps']:  # get Experiments from the exported ExpSet
        # take the @id and remove it from the json output at the same time
        experiments_to_submit.append(ex.pop('@id'))
    es_dicts.append(es_dictionary)
    # call the function: the patch body must be the dictionary it returns,
    # not the function object itself
    date_dict[es['@id']] = _get_date_exported()

# Remove duplicates once, keeping the order in which the items were found
unique_experiments = list(dict.fromkeys(experiments_to_submit))

# Experiments
ex_dicts = []
biosamples_to_submit = []
for ex in unique_experiments:
    ex_dictionary = simplify_experiment(get_item_in_store(ex))
    ex_dicts.append(ex_dictionary)
    date_dict[ex] = _get_date_exported()
    # get Biosample from the exported Experiment
    biosamples_to_submit.append('/biosamples/' + ex_dictionary['biosample'] + '/')

unique_biosamples = list(dict.fromkeys(biosamples_to_submit))

# Biosamples
bs_dicts = []
for bs in unique_biosamples:
    bs_dictionary = simplify_biosample(get_item_in_store(bs))
    bs_dicts.append(bs_dictionary)
    date_dict[bs] = _get_date_exported()

# TODO: Files (raw, processed, suppl) need to be added to date_dict as well

print('Exp Sets exported:\t', [es['@id'] for es in sets_to_submit])
print('Experiments exported:\t', unique_experiments)
print('Biosamples exported:\t', unique_biosamples)

### Part 3 - Save dictionaries as json files

In [None]:
directory = Path("~/Documents/GEO/test").expanduser()
overwrite = False  # overwrites existing files if set to True

# Create the output folder if needed, instead of failing on the first file
directory.mkdir(parents=True, exist_ok=True)

# One json file per item, named after the accession of the item
for dictionary in (es_dicts + ex_dicts + bs_dicts):
    file_name = dictionary['accession'] + '.json'
    full_path = directory / file_name
    if full_path.exists() and not overwrite:
        print(file_name, 'already exists in the folder', directory)
        continue
    with open(full_path, 'w') as fp:
        json.dump(dictionary, fp, indent=4)
    print(file_name, 'file saved')

### Part 4 - Patch export date for each item in external_submission

In [None]:
action = False  # dry run (only print what would be patched) unless set to True

count = 0  # number of items patched successfully
for item_id, patch_body in date_dict.items():
    if not action:
        print(item_id, patch_body)
        continue
    res = ff_utils.patch_metadata({"external_submission": patch_body}, item_id, key=my_auth)
    if res['status'] == 'success':
        count += 1
        continue
    # stop at the first patch that does not succeed and show the response
    print(res)
    break
print('patched {} items'.format(count))