In [1]:
from lxml import etree
import pandas as pd

In [2]:
# Parse the BioSample XML file into an lxml element tree; the cells below
# iterate over its <BioSample> elements.
tree = etree.parse('biosample_result.xml')

In [3]:
# XPath queries, each relative to a single <BioSample> element, keyed by the
# name of the output column the extracted text should end up in.
BIOSAMPLE_XPATHS = dict(
    [
        # Fields that live in their own dedicated elements.
        ("biosample_id", ".//Id[@db='BioSample']"),
        ("sra_id", ".//Id[@db='SRA']"),
        ("title", ".//Title"),
        ("paragraph", ".//Description/Comment/Paragraph"),
    ]
    + [
        # Fields stored as <Attribute attribute_name="..."> elements:
        # (output column, value of the attribute_name attribute).
        (column, ".//Attribute[@attribute_name='{}']".format(attribute_name))
        for column, attribute_name in [
            ("tissue", "tissue"),
            ("age", "age"),
            ("cultivar", "cultivar"),
            ("maize_cultivar", "maize cultivar"),
            ("genotype", "genotype"),
            ("ecotype", "ecotype"),
            ("isolate", "isolate"),
            ("dev_stage", "dev_stage"),
            ("leaf_number", "leaf number"),
            ("source_name", "source name"),
            ("label", "label"),
        ]
    ]
)

In [4]:
def extract_biosample_values(biosample, xpaths):
    """Extract the text of selected fields from one <BioSample> element.

    Parameters:
        biosample: the <BioSample> element to query. It must provide an
            ``xpath(query)`` method returning a list of matches that each
            have a ``text`` attribute.
        xpaths: dict mapping output field names to XPath queries, which
            are evaluated relative to ``biosample``.

    Returns:
        A dict with exactly the keys of ``xpaths``. Each value is the text
        of the first match of the corresponding query, or an empty string
        when the query matches nothing or the matched element has no text.
    """
    result = {}
    for name, query in xpaths.items():
        # XPath queries always return a list of results; only the first
        # match is used because each field is expected to occur once.
        matches = biosample.xpath(query)
        first_text = matches[0].text if matches else None
        # A missing element and an element without text (``text`` is None)
        # are both stored as "" so the resulting table holds empty strings
        # rather than a mix of "", None and NaN.
        result[name] = "" if first_text is None else first_text
    return result

In [5]:
# One dict of extracted field values per <BioSample> element in the tree.
extracted_samples = [
    extract_biosample_values(biosample, BIOSAMPLE_XPATHS)
    for biosample in tree.iter('BioSample')
]

In [6]:
# Turn the list of per-sample dicts into a table: one row per sample,
# one column per extracted field.
samples_table = pd.DataFrame(extracted_samples)

In [7]:
# Show the extracted table as this cell's output.
samples_table

Unnamed: 0,age,biosample_id,cultivar,dev_stage,ecotype,genotype,isolate,label,leaf_number,maize_cultivar,paragraph,source_name,sra_id,tissue,title
0,9,SAMEA5605513,B73,,,wild type genotype,,,,,,,ERS3409710,,E-MTAB-7200_2:Sample 9
1,9,SAMEA5605512,B73,,,wild type genotype,,,,,,,ERS3409709,,E-MTAB-7200_2:Sample 8
2,9,SAMEA5605511,B73,,,wild type genotype,,,,,,,ERS3409708,,E-MTAB-7200_2:Sample 7
3,9,SAMEA5605510,B73,,,wild type genotype,,,,,,,ERS3409707,,E-MTAB-7200_2:Sample 6
4,9,SAMEA5605509,B73,,,wild type genotype,,,,,,,ERS3409706,,E-MTAB-7200_2:Sample 5
5,9,SAMEA5605508,B73,,,wild type genotype,,,,,,,ERS3409705,,E-MTAB-7200_2:Sample 4
6,9,SAMEA5605507,B73,,,wild type genotype,,,,,,,ERS3409704,,E-MTAB-7200_2:Sample 3
7,9,SAMEA5605506,B73,,,wild type genotype,,,,,,,ERS3409703,,E-MTAB-7200_2:Sample 2
8,9,SAMEA5605505,B73,,,wild type genotype,,,,,,,ERS3409702,,E-MTAB-7200_2:Sample 10
9,9,SAMEA5605504,B73,,,wild type genotype,,,,,,,ERS3409701,,E-MTAB-7200_2:Sample 1


In [8]:
# Save the table as CSV. No `index` argument is passed, so pandas' default
# applies and the row index is written as an unnamed first column.
samples_table.to_csv('biosample_result.csv')