In [166]:
from Bio import Entrez
import os
import json

Entrez.email = 'aleksejs.sazonovs@gmail.com'
    
def fetch_xml(pmid):
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = Entrez.read(handle)[0]
    try:
        return xml_data
    except IndexError:
        return None

def doi_to_pmid(doi):
    handle = Entrez.esearch(db="pubmed", retmax=100, term=doi)
    record = Entrez.read(handle)
    handle.close()
    try:
        return record['IdList'][0]
    except IndexError:
        return None

def process_mendeley_record(record):
    doi = record['doi']
    pmid = doi_to_pmid(doi)
    if pmid != None:
        pubmed_xml = fetch_xml(pmid)

        # Process authors 
        authors = []
        if 'AuthorList' in pubmed_xml['MedlineCitation']['Article']:
            for author in pubmed_xml['MedlineCitation']['Article']['AuthorList']:
                if 'ForeName' in author:
                    fullname = author['ForeName'] + " " + author['LastName']
                else: 
                    fullname = author['LastName']
                authors += [fullname]
        else: 
            return None
        
         # Process keywords
        keywords = []

        # Process keywords from Mendeley
        keywords += [x.lower() for x in record['keywords']]

        # Process keyboards from mesh
        if 'MeshHeadingList' in list(pubmed_xml['MedlineCitation']):
            mesh_headings = list(pubmed_xml['MedlineCitation']['MeshHeadingList'])
            for mesh_heading in mesh_headings:
                if mesh_heading['DescriptorName'].attributes['MajorTopicYN'] == 'Y':
                    keywords += [str(mesh_heading['DescriptorName']).lower()]
                qualifier = mesh_heading['QualifierName']
                if len(qualifier) > 0 and qualifier[0].attributes['MajorTopicYN'] == 'Y':
                    keywords += [str(qualifier[0]).lower()]

        # Process keywords from Pubmed
        if 'KeywordList' in pubmed_xml['MedlineCitation'] and len(pubmed_xml['MedlineCitation']['KeywordList']) > 0:
            for keyword in pubmed_xml['MedlineCitation']['KeywordList'][0]:
                keywords += [keyword.lower()]
        
        return {'authors':authors, 'keywords':keywords}
    else:
        return None

In [167]:
#fetch_xml(doi_to_pmid("10.1007/s11306-015-0838-z"))

In [173]:
xml_example = None
def main():
    for i in os.listdir('./data'):
        if i.endswith(".json"): 
            json_file = open('./data/' + i)
            mendeley_data = json.load(json_file)
            output_file = open('./data_out/' + i, 'w')
            #print(mendeley_data[1])
            #print(process_mendeley_record(mendeley_data[1]))
            for i, record in enumerate(mendeley_data):
                record_output = process_mendeley_record(record)
                if record_output is not None:
                    output_file.write(str() + '\n')
                    if i % 100:
                        output_file.flush()
            output_file.close()
            continue
        else:
            continue
    
    
if __name__ == "__main__":
    main()

KeyboardInterrupt: 

In [139]:
mendeley_data = json.load(open('./data/drugdiscovery.json'))
xml = process_mendeley_record(mendeley_data[0])


['cellular transformation', 'metabolite correlations', 'metabolomics', 'nmr']



In [101]:
from pprint import pprint
mesh_headings = list(xml['MedlineCitation']['MeshHeadingList'])
for mesh_heading in mesh_headings:
    if mesh_heading['DescriptorName'].attributes['MajorTopicYN'] == 'Y':
        print(str(mesh_heading['DescriptorName']))
    qualifier = mesh_heading['QualifierName']
    if len(qualifier) > 0 and qualifier[0].attributes['MajorTopicYN'] == 'Y':
        print(str(qualifier[0]))

KeyError: 'MeshHeadingList'

In [148]:
for author in xml['MedlineCitation']['Article']['AuthorList']:
    print(author['ForeName'] + " " + author['LastName'])

Basetti Madhu
Masako Narita
Alexandra Jauhiainen
Suraj Menon
Marion Stubbs
Simon Tavaré
Masashi Narita
John R Griffiths


In [48]:
type(list(a))

<class 'list'>