In [1]:
import xml.etree.ElementTree as ET

In [23]:
# Parse the eSearch XML result that was saved to disk (downloaded with the NCBI esearch utility).
ESEARCH_XML_PATH = 'cancer_inhib_affy_human'
tree = ET.parse(ESEARCH_XML_PATH)

In [24]:
# Grab the top-level element of the parsed document.
root = tree.getroot()

In [26]:
# Show the direct children of the root element.
# Element.getchildren() is deprecated and was removed in Python 3.9;
# iterating the element (list(root)) yields the same child elements.
print(list(root))

[<Element 'Count' at 0x000001DAEA55AE08>, <Element 'RetMax' at 0x000001DAEA55AEF8>, <Element 'RetStart' at 0x000001DAEA51AE58>, <Element 'IdList' at 0x000001DAEA51AC78>, <Element 'TranslationSet' at 0x000001DAEA800958>, <Element 'TranslationStack' at 0x000001DAEA800E58>, <Element 'QueryTranslation' at 0x000001DAEA8562C8>]


  """Entry point for launching an IPython kernel.


In [50]:
# Walk the whole tree and print the text of every <Id> element, one per line.
for id_node in root.iter('Id'):
    print(id_node.text)

200141170
200149784
200148858
200148851
200125975
200148242
200136614
200136613
200128400
200133513
200147484
200128737
200128649
200146911
200143007
200052099
200145725
200137178
200141176
200114457


In [2]:
# Download the eSearch result for GEO DataSets (db=gds) records matching
# cancer + inhibitor + Human + Affymetrix. Originally exported from Postman.
import requests

url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=cancer+AND+inhibitor+AND+Human+AND+Affymetrix"
payload = {}
# The Postman export hard-coded a personal `ncbi_sid` session cookie here.
# Session identifiers should not be committed to a shared notebook, so no
# cookie is sent; the query itself is fully described by the URL.
headers = {}
# requests has no default timeout, so an unresponsive server would hang the
# kernel indefinitely; bound the wait explicitly.
response = requests.get(url, headers=headers, data=payload, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to the XML parser.
response.raise_for_status()

In [3]:
# ET.parse() expects a file name or file object, so the response body (a str)
# is wrapped in an in-memory text buffer. A saved XML file could be passed to
# ET.parse() directly instead.
import io
xml_buffer = io.StringIO(response.text)
tree = ET.parse(xml_buffer)

In [5]:
# Top-level element of the parsed eSearch response.
root = tree.getroot()

In [6]:
# Print tag, attributes and text for each direct child of the root.
#   Count  - number of matches found for the query
#   RetMax - number of results returned (defaults to 20)
#   IdList - container for the IDs of the matching GEO records
for child in root:
    print(child.tag, child.attrib, child.text)

Count {} 918
RetMax {} 20
RetStart {} 0
IdList {} 

TranslationSet {} None
TranslationStack {}    
QueryTranslation {} ("neoplasms"[MeSH Terms] OR cancer[All Fields]) AND inhibitor[All Fields] AND ("humans"[MeSH Terms] OR "Homo sapiens"[Organism] OR Human[All Fields]) AND Affymetrix[All Fields]


In [26]:
# Count is the total number of records matching the query.
for count_node in root.findall('Count'):
    print(count_node.text)

918


In [27]:
# RetMax is the maximum number of results returned in this response.
for retmax_node in root.findall('RetMax'):
    print(retmax_node.text)

20


In [7]:
# The child nodes of the root.
# Element.getchildren() is deprecated and was removed in Python 3.9;
# list(root) returns the same direct children.
list(root)

  


[<Element 'Count' at 0x7fcbba772650>,
 <Element 'RetMax' at 0x7fcbba7726b0>,
 <Element 'RetStart' at 0x7fcbba772710>,
 <Element 'IdList' at 0x7fcbba772770>,
 <Element 'TranslationSet' at 0x7fcbba772fb0>,
 <Element 'TranslationStack' at 0x7fcbba6e4350>,
 <Element 'QueryTranslation' at 0x7fcbba6e6410>]

In [8]:
# Look up a direct child of the root by its tag name.
x = root.find('IdList')
print(x)

<Element 'IdList' at 0x7fcbba772770>


In [9]:
# Collect a child by position: elements support sequence-style indexing, so
# root[3] is the fourth direct child. This replaces root.getchildren()[3],
# which relies on a method removed in Python 3.9.
y = root[3]
print(y)

<Element 'IdList' at 0x7fcbba772770>


  


In [10]:
# Print tag, attributes and text of every entry under the IdList element.
for id_node in x:
    print(id_node.tag, id_node.attrib, id_node.text)

Id {} 200141170
Id {} 200149784
Id {} 200148858
Id {} 200148851
Id {} 200125975
Id {} 200148242
Id {} 200136614
Id {} 200136613
Id {} 200128400
Id {} 200133513
Id {} 200147484
Id {} 200128737
Id {} 200128649
Id {} 200146911
Id {} 200143007
Id {} 200052099
Id {} 200145725
Id {} 200137178
Id {} 200141176
Id {} 200114457


In [11]:
# Gather the text of every IdList entry into a plain Python list.
lst_dwnld = [id_node.text for id_node in x]
print(lst_dwnld)

['200141170', '200149784', '200148858', '200148851', '200125975', '200148242', '200136614', '200136613', '200128400', '200133513', '200147484', '200128737', '200128649', '200146911', '200143007', '200052099', '200145725', '200137178', '200141176', '200114457']


In [23]:
# Save the list of IDs to a file (written as the list's string representation).
# The context manager closes the handle even if write() raises, and plain 'w'
# is enough because the file is never read back here.
with open('lst_dwnld', 'w') as fh:
    fh.write(str(lst_dwnld))

In [99]:
# Count reported 918 matching records but only the default 20 were returned,
# so repeat the search with RetMax raised above that count to fetch them all.
import requests

# Same query as before, with RetMax=1000 added to the URL.
new_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?RetMax=1000&db=gds&term=cancer+AND+inhibitor+AND+Human+AND+Affymetrix"
payload = {}
# The Postman export hard-coded a personal `ncbi_sid` session cookie here.
# Session identifiers should not be committed to a shared notebook, so no
# cookie is sent; the query itself is fully described by the URL.
headers = {}
# requests has no default timeout, so an unresponsive server would hang the
# kernel indefinitely; bound the wait explicitly.
new_response = requests.get(new_url, headers=headers, data=payload, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to the XML parser.
new_response.raise_for_status()

In [100]:
# Parse the enlarged response from an in-memory text buffer.
new_xml_buffer = io.StringIO(new_response.text)
new_tree = ET.parse(new_xml_buffer)

In [101]:
# Top-level element of the enlarged eSearch response.
new_root = new_tree.getroot()

In [102]:
# Direct children of the new root. Element.getchildren() is deprecated and was
# removed in Python 3.9; list(new_root) returns the same child elements.
list(new_root)

  """Entry point for launching an IPython kernel.


[<Element 'Count' at 0x000001E3FC465D18>,
 <Element 'RetMax' at 0x000001E3FC465BD8>,
 <Element 'RetStart' at 0x000001E3FC4656D8>,
 <Element 'IdList' at 0x000001E3FC4655E8>,
 <Element 'TranslationSet' at 0x000001E3FB74F6D8>,
 <Element 'TranslationStack' at 0x000001E3FB74FA98>,
 <Element 'QueryTranslation' at 0x000001E3FB7573B8>]

In [89]:
# With the larger RetMax, the number of returned results has increased to 918.
for child in new_root:
    print(child.tag, child.attrib, child.text)

Count {} 918
RetMax {} 918
RetStart {} 0
IdList {} 

TranslationSet {} None
TranslationStack {}    
QueryTranslation {} ("neoplasms"[MeSH Terms] OR cancer[All Fields]) AND inhibitor[All Fields] AND ("humans"[MeSH Terms] OR "Homo sapiens"[Organism] OR Human[All Fields]) AND Affymetrix[All Fields]


In [105]:
# Fetch the IdList container from the enlarged response.
idlist = new_root.find('IdList')
print(idlist)

<Element 'IdList' at 0x000001E3FC4655E8>


In [107]:
# Collect the text of every entry under IdList into a list.
listofID = [id_node.text for id_node in idlist]

In [108]:
# Number of IDs collected from the enlarged response.
len(listofID)

918