In [363]:
#!/usr/bin/env python

"""
This script downloads files from an Amazon S3 bucket.
First, it looks for any JSON files and downloads them to a
json_folder that stores all submitted jobs.
Then, it downloads any raw files specified in each document
and creates an executable copy of each job file in a work_folder.

"""
import argparse
import xml.etree.ElementTree as et 
import pandas as pd
import numpy as np
import os
import glob
from xml.dom import minidom
from collections import defaultdict
import yaml

from yaml.representer import Representer
yaml.add_representer(defaultdict, Representer.represent_dict)

import json

def _element_children(node):
    """Return the child nodes of ``node`` that are elements, skipping whitespace text and comments."""
    return [child for child in node.childNodes if child.nodeType == child.ELEMENT_NODE]


def get_metadata_from_xml(input_file, transcriptome, module, module_version, module_script):
    """
    Transforms the metadata XML file into a list of dictionaries (one per EXPERIMENT_PACKAGE
    element) that can be dumped into 1 or more properly formatted JSON files.

    input_file: string (path) or file object
        Handed directly to xml.dom.minidom.parse.
    transcriptome: string
        Stored as the 'path' of a {'class': 'Directory', ...} entry on every sample that has
        a PLATFORM element in its experiment package.
    module: string
    module_version: string
    module_script: string
        Stored verbatim under the keys of the same names.

    returns: list of defaultdict(list)
        Experiment-level keys (organization, email, experiment_nickname, experiment_summary,
        library_*, study_*, and, when present in the XML, database, reference, organism) plus
        'samples', a list with one dictionary per RUN_SET element.

    raises: IndexError or AttributeError when an element that is read unconditionally
        (e.g. DESIGN, STUDY, TITLE, LIBRARY_LAYOUT) is missing or empty.
    """
    my_expt_set_metadata = []

    xmldoc = minidom.parse(input_file)
    for expt_package in xmldoc.getElementsByTagName('EXPERIMENT_PACKAGE'):
        my_expt_metadata = defaultdict(list)

        ### Pipeline-specific. use function input to describe module/pipelines used. ###
        my_expt_metadata['module'] = module
        my_expt_metadata['module_version'] = module_version
        my_expt_metadata['module_script'] = module_script

        ### EXPERIMENTAL METADATA. Describe experiment-level details. Broken up into 1) organization, 2) experiment 3) design 4) study ###
        for metadata in expt_package.getElementsByTagName('Organization'):
            my_expt_metadata['organization'].append(metadata.getElementsByTagName('Name')[0].firstChild.data)
            my_expt_metadata['email'].append(metadata.getElementsByTagName('Contact')[0].attributes['email'].value)
        # Several organizations collapse into a single comma-separated string.
        my_expt_metadata['organization'] = ','.join(my_expt_metadata['organization'])
        my_expt_metadata['email'] = ','.join(my_expt_metadata['email'])

        for metadata in expt_package.getElementsByTagName('EXPERIMENT'):
            title = metadata.getElementsByTagName('TITLE')[0].firstChild.data
            my_expt_metadata['experiment_nickname'].append(metadata.attributes['accession'].value)
            my_expt_metadata['experiment_summary'].append(title)
        my_expt_metadata['experiment_nickname'] = ','.join(my_expt_metadata['experiment_nickname'])
        my_expt_metadata['experiment_summary'] = ','.join(my_expt_metadata['experiment_summary'])

        design_metadata = expt_package.getElementsByTagName('DESIGN')[0]
        library_descriptor = design_metadata.getElementsByTagName('LIBRARY_DESCRIPTOR')[0]
        # The layout is encoded as the *name* of the child element. Use the first element
        # child rather than firstChild so indented (pretty-printed) XML, whose first child
        # is a whitespace text node, is handled too.
        layout_node = library_descriptor.getElementsByTagName('LIBRARY_LAYOUT')[0]
        my_expt_metadata['library_layout'] = _element_children(layout_node)[0].tagName
        my_expt_metadata['library_source'] = library_descriptor.getElementsByTagName('LIBRARY_SOURCE')[0].firstChild.nodeValue
        my_expt_metadata['library_description'] = library_descriptor.getElementsByTagName('LIBRARY_SELECTION')[0].firstChild.nodeValue

        study_metadata = expt_package.getElementsByTagName('STUDY')[0]
        my_expt_metadata['study_title'] = study_metadata.getElementsByTagName('STUDY_TITLE')[0].firstChild.nodeValue
        my_expt_metadata['study_abstract'] = study_metadata.getElementsByTagName('STUDY_ABSTRACT')[0].firstChild.nodeValue

        # One {attribute value: id text} entry is recorded per attribute of each EXTERNAL_ID.
        for external_id in study_metadata.getElementsByTagName('EXTERNAL_ID'):
            for attr in external_id.attributes.keys():
                ddb = external_id.attributes[attr].value
                did = external_id.firstChild.data
                my_expt_metadata['database'].append({ddb: did})

        # Record every XREF_LINK (previously only the first one per STUDY_LINKS was kept and a
        # STUDY_LINKS without any XREF_LINK raised IndexError). The first two element children
        # of an XREF_LINK are read as the database name and the identifier.
        for link in study_metadata.getElementsByTagName('STUDY_LINKS'):
            for xref_link in link.getElementsByTagName('XREF_LINK'):
                xref_fields = _element_children(xref_link)
                sdb = xref_fields[0].firstChild.data
                sid = xref_fields[1].firstChild.data
                my_expt_metadata['reference'].append({sdb: sid})

        ### RUN METADATA. Describe each individual sequencing run. ###
        for run in expt_package.getElementsByTagName('RUN_SET'):
            my_run_metadata = defaultdict(dict)

            # NOTE(review): one entry is produced per RUN_SET, so when a RUN_SET holds several
            # RUN elements only the last one's attributes survive - confirm this is intended.
            for metadata in run.getElementsByTagName('RUN'):
                # getAttribute returns '' for a missing attribute, matching the old
                # try/except KeyError fallbacks.
                my_run_metadata['library_nickname'] = metadata.getAttribute('accession')
                my_run_metadata['library_prep'] = ""
                my_run_metadata['sample_id'] = metadata.getAttribute('accession')
                my_run_metadata['original_assembly'] = metadata.getAttribute('assembly')

            ### Comment out for now. Supposedly NCBI will move to AWS and will eventually have s3 links available. ###
            # file_metadata = run.getElementsByTagName('SRAFile')
            # for metadata in file_metadata:
            #     if 'url' in metadata.attributes:
            #         print(metadata.attributes['url'].value)
                # filename = metadata.attributes['filename'].value
                # url = metadata.attributes['url'].value
                # print(url)

            for metadata in expt_package.getElementsByTagName('PLATFORM'):
                try:
                    instrument = metadata.getElementsByTagName('INSTRUMENT_MODEL')[0].firstChild.nodeValue
                except (AttributeError, IndexError):
                    # Empty or absent INSTRUMENT_MODEL: fall back to a blank model name.
                    instrument = ''
                my_run_metadata['instrument_model'] = instrument
                my_run_metadata['transcriptome'] = {
                    'class': 'Directory',
                    'path': transcriptome
                }

            # This actually belong on the top level per-expt, but we apply characteristics to each file.
            for metadata in expt_package.getElementsByTagName('SAMPLE_NAME'):
                scientific_name = metadata.getElementsByTagName('SCIENTIFIC_NAME')[0].firstChild.data
                my_expt_metadata['organism'] = scientific_name

            # Bound up front so a package without any SAMPLE element yields an empty list
            # instead of raising NameError below.
            characteristics_list = []
            for sample in expt_package.getElementsByTagName('SAMPLE'):
                # Reset per SAMPLE (as before): only the last SAMPLE's attributes are kept.
                characteristics_list = []
                for metadata in sample.getElementsByTagName('SAMPLE_ATTRIBUTES'):
                    for attribute in metadata.getElementsByTagName('SAMPLE_ATTRIBUTE'):
                        characteristics_list.append({
                            'name': attribute.getElementsByTagName('TAG')[0].firstChild.data,
                            'value': attribute.getElementsByTagName('VALUE')[0].firstChild.data
                        })
            my_run_metadata['characteristics'] = characteristics_list
            my_expt_metadata['samples'].append(my_run_metadata)

        my_expt_set_metadata.append(my_expt_metadata)
    return my_expt_set_metadata

In [364]:
# Machine-specific location of the XML export to parse; adjust for your environment.
input_file = '/home/bay001/projects/codebase/bfx/notebooks/pilot_notebooks/SraExperimentPackage.xml'

# Left as the cell's final bare expression so the notebook displays the parsed metadata.
get_metadata_from_xml(
    input_file,
    transcriptome='hg19.transcriptome',
    module='rnaseq',
    module_version='0.0.3',
    module_script='rnaseq/0.0.3',
)

[defaultdict(list,
             {'module': 'rnaseq',
              'module_version': '0.0.3',
              'module_script': 'rnaseq/0.0.3',
              'organization': 'NCBI',
              'email': 'geo-group@ncbi.nlm.nih.gov',
              'experiment_nickname': 'SRX4004898',
              'experiment_summary': 'GSM3119650: PSA-NCAM+ GFP- rep 2; Homo sapiens; RNA-Seq',
              'library_layout': 'PAIRED',
              'library_source': 'TRANSCRIPTOMIC',
              'library_description': 'cDNA',
              'study_title': 'Trans-differentiation of human adult peripheral blood T cells into neurons',
              'study_abstract': 'Examining the transcriptomic changes during transdifferentiation of peripheral blood mononuclear cells to induced neuronal cells. Overall design: There are three different populations: PBMC (2 biological replicates, starting population), PSA-NCAM+GFP+ (2 biological replicates, induced neuronal cells) and PSA-NCAM+GFP- (2 biological replicates,

In [None]:
# get_metadata_from_xml() has no `chemistry` or `cr11` parameters, so passing them
# raised TypeError; those two keyword arguments are dropped.
# NOTE(review): `transcriptome`, `module`, `module_version` and `module_script` are not
# assigned in the cells visible here - confirm they are defined before this cell runs.
metadata = get_metadata_from_xml(
    input_file=input_file,
    transcriptome=transcriptome,
    module=module,
    module_version=module_version,
    module_script=module_script,
)