In [1]:
import xml.etree.ElementTree as et 
import pandas as pd
import numpy as np
import os
import glob
from xml.dom import minidom
from collections import defaultdict
import yaml

# Register defaultdict with PyYAML's dict representer so that defaultdict
# objects are serialised by yaml.dump the same way as an ordinary dict.
from yaml.representer import Representer
yaml.add_representer(defaultdict, Representer.represent_dict)

import json

In [2]:
# Path of the XML file that is parsed for EXPERIMENT_PACKAGE records.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
input_file = '/projects/ps-yeolab4/NCRCRG/home/bay001/notebooks/SraExperimentPackage.xml'

In [4]:
# Values copied verbatim into every sample entry.  They are hard-coded here and
# are never read from the XML.
# NOTE(review): presumably these describe the read layout / target cell count
# of this particular dataset -- confirm before reusing on other input.
READ1_LENGTH = 26
READ2_LENGTH = 98
EXPECT_CELLS = 3000


def _text_or_blank(parent, tag):
    """Return the text of the first ``tag`` element found under ``parent``.

    Returns '' when the element is missing or has no text child, so that an
    optional field never aborts the whole parse.
    """
    try:
        return parent.getElementsByTagName(tag)[0].firstChild.data
    except (IndexError, AttributeError):
        # IndexError: no such element.  AttributeError: the element is empty
        # (firstChild is None) or its first child has no ``data`` attribute.
        return ''


def _sample_characteristics(expt_package):
    """Collect the TAG/VALUE pairs of the package's SAMPLE_ATTRIBUTE elements.

    Always returns a new list of ``{'name': ..., 'value': ...}`` dicts; the
    list is empty when the package contains no SAMPLE element (previously the
    list was left unbound, or carried over from the preceding package).
    When several SAMPLE elements are present only the attributes of the last
    one are kept, as before.
    """
    characteristics = []
    for sample in expt_package.getElementsByTagName('SAMPLE'):
        characteristics = []
        for attribute_group in sample.getElementsByTagName('SAMPLE_ATTRIBUTES'):
            for attribute in attribute_group.getElementsByTagName('SAMPLE_ATTRIBUTE'):
                characteristics.append({
                    'name': attribute.getElementsByTagName('TAG')[0].firstChild.data,
                    'value': attribute.getElementsByTagName('VALUE')[0].firstChild.data,
                })
    return characteristics


def _build_sample_entry(run, expt_package):
    """Build the metadata record for a single RUN element.

    ``run`` is the RUN element; ``expt_package`` is its enclosing
    EXPERIMENT_PACKAGE, from which the instrument model and the sample
    characteristics are taken.  Raises KeyError when the RUN has no
    ``accession`` or ``assembly`` attribute.
    """
    entry = defaultdict(dict)

    accession = run.attributes['accession'].value
    entry['library_nickname'] = accession
    entry['library_prep'] = ""
    entry['sample_id'] = accession
    entry['original_assembly'] = run.attributes['assembly'].value

    # Only files belonging to this RUN are considered.  If several .bam files
    # are listed the last one wins.  The url attribute is read for .bam files
    # only, so other files without a url no longer raise KeyError.
    for sra_file in run.getElementsByTagName('SRAFile'):
        if sra_file.attributes['filename'].value.endswith('.bam'):
            entry['bam_file']['class'] = 'File'
            entry['bam_file']['path'] = sra_file.attributes['url'].value

    # The fixed read lengths / cell count are only written when the package
    # has a PLATFORM element, matching the previous behaviour.
    for platform in expt_package.getElementsByTagName('PLATFORM'):
        entry['instrument_model'] = _text_or_blank(platform, 'INSTRUMENT_MODEL')
        entry['read1_length'] = READ1_LENGTH
        entry['read2_length'] = READ2_LENGTH
        entry['expect_cells'] = EXPECT_CELLS

    # Characteristics are a per-experiment property, but they are attached to
    # every sample entry; each entry gets its own list object.
    entry['characteristics'] = _sample_characteristics(expt_package)
    return entry


my_expt_set_metadata = []

xmldoc = minidom.parse(input_file)
expt_packages = xmldoc.getElementsByTagName('EXPERIMENT_PACKAGE')
for expt_package in expt_packages:
    my_expt_metadata = defaultdict(list)

    # Submitting organization(s) and contact e-mail(s), comma-joined.
    for organization in expt_package.getElementsByTagName('Organization'):
        my_expt_metadata['organization'].append(
            organization.getElementsByTagName('Name')[0].firstChild.data)
        my_expt_metadata['email'].append(
            organization.getElementsByTagName('Contact')[0].attributes['email'].value)
    my_expt_metadata['organization'] = ','.join(my_expt_metadata['organization'])
    my_expt_metadata['email'] = ','.join(my_expt_metadata['email'])

    # Experiment accession(s) and title(s), comma-joined.
    for experiment in expt_package.getElementsByTagName('EXPERIMENT'):
        title = experiment.getElementsByTagName('TITLE')[0].firstChild.data
        my_expt_metadata['experiment_nickname'].append(experiment.attributes['accession'].value)
        my_expt_metadata['experiment_summary'].append(title)
    my_expt_metadata['experiment_nickname'] = ','.join(my_expt_metadata['experiment_nickname'])
    my_expt_metadata['experiment_summary'] = ','.join(my_expt_metadata['experiment_summary'])

    # Free-text design description; '' when the element is missing or empty.
    for design in expt_package.getElementsByTagName('DESIGN'):
        my_expt_metadata['experiment_design'].append(
            _text_or_blank(design, 'DESIGN_DESCRIPTION'))
    my_expt_metadata['experiment_design'] = ','.join(my_expt_metadata['experiment_design'])

    # Organism is an experiment-level field, so it is resolved once here
    # rather than once per run.  With several SAMPLE_NAME elements the last
    # one wins.
    for sample_name in expt_package.getElementsByTagName('SAMPLE_NAME'):
        my_expt_metadata['organism'] = (
            sample_name.getElementsByTagName('SCIENTIFIC_NAME')[0].firstChild.data)

    # One 'samples' entry per RUN.  Previously every RUN in a RUN_SET was
    # written into the same dict, so only the last RUN survived.
    for run_set in expt_package.getElementsByTagName('RUN_SET'):
        for run in run_set.getElementsByTagName('RUN'):
            my_expt_metadata['samples'].append(_build_sample_entry(run, expt_package))

    my_expt_set_metadata.append(my_expt_metadata)

In [5]:
# Write one indented JSON document per parsed experiment package; the file is
# named after the package's (comma-joined) experiment accession string.
for expt in my_expt_set_metadata:
    json_path = '/projects/ps-yeolab4/NCRCRG/home/bay001/notebooks/{}.json'.format(
        expt['experiment_nickname']
    )
    # Convert the top-level defaultdict to a plain dict before serialising.
    plain_record = dict(expt)
    with open(json_path, 'w') as out_handle:
        json.dump(plain_record, out_handle, indent=2)

In [6]:
# Show the parsed records as the cell output for a quick visual check.
my_expt_set_metadata

[defaultdict(list,
             {'organization': 'Karolinska Institute',
              'email': 'peter.lonnerberg@ki.se',
              'experiment_nickname': 'SRX3809389',
              'experiment_summary': 'RNA-Seq of mouse ENS',
              'experiment_design': 'Single-cell suspensions were prepared using Liberase/TrypLE digestion followed by manual trituration using a Pasteur pipette. Cells were resuspended in artificial cerebrospinal fluid and further processed with the 10X Genomics Chromium Single Cell Kit Version 1 aiming for 3500 cells per sample.',
              'organism': 'Mus musculus',
              'samples': [defaultdict(dict,
                           {'library_nickname': 'SRR6854061',
                            'library_prep': '',
                            'sample_id': 'SRR6854061',
                            'original_assembly': 'GRCm38',
                            'bam_file': {'class': 'File',
                             'path': 'https://sra-pub-src-1.s3.am