# Text Source PubMed
Sales words

In [6]:
!python -m pip install openai
!python -m pip install pandas
!python -m pip install scipy
!python -m pip install requests
!python -m pip install numexpr



In [7]:
import sys
sys.path.append('/Users/andersohrn/opt/anaconda3/lib/python3.8/site-packages')
print (sys.path)

['/Users/andersohrn/Development/das_wort', '/Users/andersohrn/opt/anaconda3/lib/python38.zip', '/Users/andersohrn/opt/anaconda3/lib/python3.8', '/Users/andersohrn/opt/anaconda3/lib/python3.8/lib-dynload', '', '/Users/andersohrn/river_chatgpt/river_gpt/lib/python3.8/site-packages', '/Users/andersohrn/opt/anaconda3/lib/python3.8/site-packages', '/Users/andersohrn/opt/anaconda3/lib/python3.8/site-packages']


In [22]:
from typing import Optional, List
from datetime import date

import requests
import openai
import pandas as pd
from xml.etree import ElementTree as ET

In [35]:
PUBMED_APIs = {
    'standard' : {
        'base_url': 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/',
        'apis': {
            'search' : 'esearch.fcgi',
            'fetch' : 'efetch.fcgi'
        }
    },
    'open_access': {
        'base_url': 'https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/'
    }
}

KEY_SHORTHAND = {
    'n_search_results' : ('eSearchResult','@children','Count'),
    'ids_from_search' : ('eSearchResult', '@children','IdList', '@children','Id'),
    'abstract_text' : ('PubmedArticleSet','@children','PubmedArticle','@children','MedlineCitation','@children','Article','@children','Abstract','@children','AbstractText','@text')
}

In [24]:
def create_basic_search_term(author: Optional[str]=None, 
                             title: Optional[str]=None, 
                             abstract: Optional[str]=None, 
                             published_after_date: Optional[date]=None):
    term = []
    if not author is None:
        term.append('{}[Author]'.format(author))
    if not title is None:
        term.append('{}[Title]'.format(title))
    if not abstract is None:
        term.append('{}[Title/Abstract]'.format(abstract))
    if not published_after_date is None:
        term.append('{}[Date]'.format(FOOBAR(published_after_date)))
        
    if len(term) == 0:
        raise ValueError('No input values given')
        
    return '+AND+'.join(term)

search_term = create_basic_search_term(title='HER2-positive', author='Akihito Kawazoe')

In [64]:
def do_search_via_api(search_term: str):
    url = '{}{}?db=pubmed'.format(
        PUBMED_APIs['standard']['base_url'], 
        PUBMED_APIs['standard']['apis']['search']
    )
    url_full = '{}&term={}'.format(url, search_term)
    print('URL to call: {}'.format(url_full))
    
    r = requests.get(url=url_full)
    if r.status_code != 200:
        print ('Status code {} received!'.format(status_code))
        print ('Error message: {}'.format(r.content))
        
    return r.content

pubmed_object = do_search_via_api(search_term)

URL to call: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=Akihito Kawazoe[Author]+AND+HER2-positive[Title]


In [67]:
def element_to_dict(element):
    '''Convert an ET tree created from XML to dictionary

    '''
    result = {}
    tag = element.tag
    text = element.text.strip() if element.text else None
    attributes = element.attrib
    children = [element_to_dict(child) for child in element]
    if not attributes and not children:
        result[tag] = text
    else:
        result[tag] = {}
        if attributes:
            result[tag]["@attributes"] = attributes
        if children:
            result[tag]["@children"] = children
        if text:
            result[tag]["@text"] = text
    return result

root = ET.fromstring(pubmed_object)
print ([x.text for x in root.findall('.//Id'))]
root_dict = element_to_dict(root)

[<Element 'Id' at 0x7fc3ae64fc20>]


In [36]:
def get_element(obj, keys):
    # If the list of keys is empty, return the object itself
    if not keys:
        return obj
    # If the list of keys has only one element, return the value associated with that key
    elif len(keys) == 1:
        return obj[keys[0]]
    # If the list of keys has more than one element, get the first key and the rest of the keys
    else:
        key = keys[0]
        rest = keys[1:]
        # If the value associated with the first key is a dictionary, recursively call the function with that value and the rest of the keys
        if isinstance(obj[key], dict):
            return get_element(obj[key], rest)
        # If the value associated with the first key is a list, iterate over the list and filter out the elements that match the rest of the keys
        elif isinstance(obj[key], list):
            return [get_element(item, rest) for item in obj[key] if rest[0] in item]
        # If the value associated with the first key is neither a dictionary nor a list, raise an exception
        else:
            raise TypeError("Invalid object type")
            
print (root_dict)
print (get_element(root_dict, KEY_SHORTHAND['ids_from_search']))

{'eSearchResult': {'@children': [{'Count': '1'}, {'RetMax': '1'}, {'RetStart': '0'}, {'IdList': {'@children': [{'Id': '34912120'}]}}, {'TranslationSet': {'@children': [{'Translation': {'@children': [{'From': 'Akihito Kawazoe[Author]'}, {'To': 'Kawazoe, Akihito[Full Author Name]'}]}}]}}, {'QueryTranslation': 'kawazoe, akihito[Author] AND "HER2-positive"[Title]'}]}}
[['34912120']]


In [53]:
def get_abstracts_via_api(ids: List[str]):
    url = '{}{}?db=pubmed'.format(
        PUBMED_APIs['standard']['base_url'], 
        PUBMED_APIs['standard']['apis']['fetch']
    )
    url_full = '{}&rettype=Abstract&id={}'.format(url, ','.join(ids))
    print('URL to call: {}'.format(url_full))
    
    r = requests.get(url=url_full)
    if r.status_code != 200:
        print ('Status code {} received!'.format(status_code))
        print ('Error message: {}'.format(r.content))
        
    return r.content

pubmed_abstract_object = get_abstracts_via_api(ids=['34912120'])

URL to call: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&rettype=Abstract&id=34912120


In [54]:
root = ET.fromstring(pubmed_abstract_object)
root_dict = element_to_dict(root)

In [63]:
root.findall('.//AbstractText')[0].text

'Human epidermal growth factor receptor 2 (HER2,\xa0also known as ERBB2) amplification or overexpression occurs in approximately 20% of advanced gastric or gastro-oesophageal junction adenocarcinomas'

In [52]:
print(get_element(root_dict, ('PubmedArticleSet','@children','PubmedArticle','@children','MedlineCitation','@children','Article','@children','Abstract','@children','AbstractText','@text')))

[[[[['Human epidermal growth factor receptor 2 (HER2,\xa0also known as ERBB2) amplification or overexpression occurs in approximately 20% of advanced gastric or gastro-oesophageal junction adenocarcinomas']]]]]


[[[[['Human epidermal growth factor receptor 2 (HER2,\xa0also known as ERBB2) amplification or overexpression occurs in approximately 20% of advanced gastric or gastro-oesophageal junction adenocarcinomas']]]]]
