# Import Documents from PubMed

In [None]:
# coding: utf-8

import os
import requests
import xml.etree.ElementTree as et


## Set access to PubMed webservices

In [None]:
# Common prefix of the NCBI E-utilities endpoints used below.
_EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

# esearch endpoint: retrieves the document ids matching a query.
# The query text is appended directly after "term=".
# NOTE(review): retmax asks for up to 100000 ids per query; NCBI documents a
# lower cap for PubMed esearch results -- confirm large result sets are complete.
url_search = _EUTILS_BASE + "esearch.fcgi?db=pubmed&retmode=xml&retmax=100000&term="

# efetch endpoint: retrieves the record (title, abstract, ...) of one document.
# The document id is appended directly after "id=".
url_fetch = _EUTILS_BASE + "efetch.fcgi?db=pubmed&retmode=text&rettype=xml&id="

## Set PubMed queries

In [None]:
# PubMed queries, keyed by the label that names each query's output files.
#
# Both queries share the same publication-date window. PubMed expects dates as
# YYYY/MM/DD; the upper bound used to be written "2017/31/12" (day and month
# swapped), which is not a valid date in that format.
_PUBLICATION_DATE_RANGE = '("1945/01/01"[PDat] : "2017/12/31"[PDat])'

queries = {
    # indexed under the MeSH heading "Eczema" but not "Dermatitis, Atopic"
    "eczema": '("Eczema"[MH] NOT "Dermatitis, Atopic"[MH]) AND ' + _PUBLICATION_DATE_RANGE,
    # indexed under the MeSH heading "Dermatitis, Atopic" but not "Eczema"
    "dermatitis_atopic": '("Dermatitis, Atopic"[MH] NOT "Eczema"[MH]) AND ' + _PUBLICATION_DATE_RANGE,
}

## Retrieve and parse response

In [None]:
def _element_text(element):
    """Return all the text contained in *element*, nested markup included.

    ``element.text`` only holds the text located before the first child tag,
    and is ``None`` when the element starts with a child tag (for instance a
    title beginning with ``<i>``). Joining ``itertext()`` keeps the full text.
    """
    return "".join(element.itertext())


# seconds to wait for a PubMed answer before giving up on a request
REQUEST_TIMEOUT = 60

for label in queries:
    query = queries[label]
    pathData = "./data/raw/" + label
    # makedirs also creates the missing parent folders ("./data", "./data/raw")
    # and does not fail when the folder already exists
    os.makedirs(pathData, exist_ok=True)

    # retrieve documents id related to query
    f = requests.get(url_search + query, timeout=REQUEST_TIMEOUT)
    # fail with the HTTP error rather than with an obscure XML parsing error
    f.raise_for_status()
    # --parse response to extract article ids
    # (raw bytes are parsed so that the XML parser applies the encoding
    # declared by the document itself)
    tree = et.fromstring(f.content)
    ids_list = tree.findall("./IdList/Id")
    print(query + "\t" + str(len(ids_list)))

    # --export PMIDs list, one id per line
    with open("./data/" + label + "_PMIDs.tab", "w", encoding="utf-8") as fout:
        for idPaper in ids_list:
            fout.write(str(idPaper.text) + "\n")

    # for each document returned by pubmed
    for idPaper in ids_list:
        # Best effort: any failure on one document (network, malformed XML,
        # file system) is reported and the import goes on with the next one.
        try:
            # get pubmed entry xml content from pubmed identifier
            article = requests.get(url_fetch + idPaper.text, timeout=REQUEST_TIMEOUT)
            article.raise_for_status()
            article_tree = et.fromstring(article.content)

            # get title from pubmed entry
            title = article_tree.find("./PubmedArticle/MedlineCitation/Article/ArticleTitle")
            # skip documents without title
            if title is None:
                continue
            document = _element_text(title)
            # skip documents with an empty or undefined title
            if not document.strip() or "Not Available" in document:
                continue

            # get abstract content from pubmed entry; an abstract may be split
            # into several AbstractText sections, all of them are kept
            abstract_sections = article_tree.findall(
                "./PubmedArticle/MedlineCitation/Article/Abstract/AbstractText"
            )
            # if abstract available, append to title in document
            if abstract_sections:
                abstract = " ".join(_element_text(section) for section in abstract_sections)
                document = document + " " + abstract
            # remove brackets for translated titles
            document = document.replace("[", "").replace("]", "")

            # write extracted document; explicit utf-8 so that non-ASCII
            # characters do not depend on the platform default encoding
            with open(os.path.join(pathData, str(idPaper.text) + ".txt"), "w", encoding="utf-8") as fw:
                fw.write(document)
        except Exception as e:
            print("error : unable to import data from PMID ", idPaper.text, "\n", str(e))