# Pubmed Data Reading

Read the data from this website: https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/

Build an XML tree and extract information from it.

In [1]:
import xml.etree.ElementTree

def make_dict_from_tree(element_tree):
    """Convert an XML element (and everything below it) into nested dicts.

    The result has a single top-level key, the tag of the given element.
    Its value is:

    * the element's ``text`` (possibly ``None``) when it has no children;
    * otherwise a dict mapping each child tag to that child's converted
      value. When a tag occurs more than once among the siblings, the
      values are collected into a list in document order.

    XML attributes, tail text, and the text of elements that have children
    are not included in the result.

    :param element_tree: Root element to convert. An ``ElementTree`` wrapper
        is also accepted, in which case its root element is used. ``None``
        yields an empty dict.
    :type element_tree: xml.etree.ElementTree.Element
    :rtype: dict
    """
    def internal_iter(tree, accum):
        """Recursively convert ``tree`` and store the result in ``accum``.

        :param tree: The element to convert, or ``None``
        :type tree: xml.etree.ElementTree.Element
        :param accum: Dictionary into which data is accumulated
        :type accum: dict
        :rtype: dict
        """
        if tree is None:
            return accum

        # Iterating an Element yields its direct children. The former
        # Element.getchildren() method was removed in Python 3.9.
        children = list(tree)
        if not children:
            accum[tree.tag] = tree.text
            return accum

        collected = {}
        accum[tree.tag] = collected
        for child in children:
            value = internal_iter(child, {})[child.tag]
            if child.tag not in collected:
                collected[child.tag] = value
                continue
            # Repeated sibling tag: promote the stored value to a list on the
            # second occurrence, then keep appending.
            if not isinstance(collected[child.tag], list):
                collected[child.tag] = [collected[child.tag]]
            collected[child.tag].append(value)

        return accum

    # Only ElementTree wrappers expose getroot(); plain Elements do not.
    if hasattr(element_tree, "getroot"):
        element_tree = element_tree.getroot()

    return internal_iter(element_tree, {})

In [2]:
# Build the XML tree from the PubMed baseline file.
tree = xml.etree.ElementTree.parse('pubmed18n0001.xml')
root = tree.getroot()

# Serialized copy of the document. It is no longer needed for the conversion
# below; it is still assigned so that any cell reading `xmlstr` keeps working.
xmlstr = xml.etree.ElementTree.tostring(root, encoding='unicode', method='xml')

# Convert the already-parsed root directly. Re-parsing `xmlstr` would rebuild
# the same tree a second time at extra time and memory cost.
temporary_dict = make_dict_from_tree(root)

In [3]:
# Show the Article section of the record at index 1000.
article_record = temporary_dict["PubmedArticleSet"]["PubmedArticle"][1000]
print(article_record["MedlineCitation"]["Article"])

{'Journal': {'ISSN': '0264-6021', 'JournalIssue': {'Volume': '149', 'Issue': '3', 'PubDate': {'Year': '1975', 'Month': 'Sep'}}, 'Title': 'The Biochemical journal', 'ISOAbbreviation': 'Biochem. J.'}, 'ArticleTitle': 'The amino acid sequence of Neurospora NADP-specific glutamate dehydrogenase. Peptides from digestion with a staphylococcal proteinase.', 'Pagination': {'MedlinePgn': '749-55'}, 'Abstract': {'AbstractText': 'The extracellular proteinase of Staphylococcus aureus strain V8 was used to digest the NADP-specific glutamate dehydrogenase of Neurospora crassa. Of 35 non-overlapping peptides expected from the glutamate content of the polypeptide chain, 29 were isolated and substantially sequenced. The sequences obtained were valuable in providing overlaps for the alignment of about two-thirds of the sequences found in tryptic peptides [Wootton, J. C., Taylor, J, G., Jackson, A. A., Chambers, G. K. & Fincham, J. R. S. (1975) Biochem. J. 149, 739-748]. The blocked N-terminal peptide of

In [4]:
# Show the MeshHeadingList section of the record at index 1000.
mesh_citation = temporary_dict["PubmedArticleSet"]["PubmedArticle"][1000]["MedlineCitation"]
print(mesh_citation["MeshHeadingList"])

{'MeshHeading': [{'DescriptorName': 'Amino Acid Sequence'}, {'DescriptorName': 'Glutamate Dehydrogenase', 'QualifierName': ['analysis', 'isolation & purification']}, {'DescriptorName': 'NADP'}, {'DescriptorName': 'Neurospora', 'QualifierName': 'enzymology'}, {'DescriptorName': 'Neurospora crassa', 'QualifierName': 'enzymology'}, {'DescriptorName': 'Peptide Fragments', 'QualifierName': 'analysis'}, {'DescriptorName': 'Peptide Hydrolases'}, {'DescriptorName': 'Staphylococcus aureus', 'QualifierName': 'enzymology'}]}
