# Introduction to Entrez library

#### With the Entrez module (`Bio.Entrez`) we can retrieve data from NCBI (National Center for Biotechnology Information). Everything related to `einfo`, i.e. information about the available databases, is done through Entrez.

In [1]:
from Bio import Entrez # everything related to einfo we do with Entrez

In [2]:
# NCBI asks every E-utilities client to identify itself with a contact email;
# Biopython emits a warning on each request when Entrez.email is not set.
Entrez.email='caveion.bruna@gmail.com'

# einfo() without arguments asks NCBI for the list of available Entrez databases.
handle = Entrez.einfo()
try:
    record = Entrez.read(handle)
finally:
    handle.close()  # release the HTTP connection even if parsing fails

print(type(record))  # record is a dict-like Bio.Entrez.Parser.DictionaryElement
print(record.keys()) # its only key, 'DbList', holds the names of the databases we can query at NCBI

<class 'Bio.Entrez.Parser.DictionaryElement'>
dict_keys(['DbList'])


In [3]:
record['DbList']  # the database names returned by einfo, shown as the cell output

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

In [4]:
# Print every database name on its own line so the list is easier to scan
print(*record['DbList'], sep="\n")

pubmed
protein
nuccore
ipg
nucleotide
structure
genome
annotinfo
assembly
bioproject
biosample
blastdbinfo
books
cdd
clinvar
gap
gapplus
grasp
dbvar
gene
gds
geoprofiles
homologene
medgen
mesh
nlmcatalog
omim
orgtrack
pmc
popset
proteinclusters
pcassay
protfam
pccompound
pcsubstance
seqannot
snp
sra
taxonomy
biocollections
gtr


In [5]:
# Passing db= narrows einfo to a single database; here only 'genome' is described.
# NOTE: this rebinds `record`, replacing the database list fetched earlier.
handle = Entrez.einfo(db='genome')
try:
    record = Entrez.read(handle)
finally:
    handle.close()  # release the HTTP connection even if parsing fails
print(type(record))

<class 'Bio.Entrez.Parser.DictionaryElement'>


In [6]:
record.keys() # `record` was reassigned in the previous cell, so it now holds the einfo result for db='genome'; its top-level key is 'DbInfo' instead of 'DbList'

dict_keys(['DbInfo'])

In [7]:
record['DbInfo']  # metadata of the genome database: name, description, build, record count, last update and the list of searchable fields

{'DbName': 'genome', 'MenuName': 'Genome', 'Description': 'Genomic sequences, contigs, and maps', 'DbBuild': 'Build220817-0555.1', 'Count': '91476', 'LastUpdate': '2022/08/17 07:08', 'FieldList': [{'Name': 'ALL', 'FullName': 'All Fields', 'Description': 'All terms from all searchable fields', 'TermCount': '16238721', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'UID', 'FullName': 'UID', 'Description': 'Unique number assigned to genome', 'TermCount': '0', 'IsDate': 'N', 'IsNumerical': 'Y', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'Y'}, {'Name': 'FILT', 'FullName': 'Filter', 'Description': 'Limits the records', 'TermCount': '17', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'ORGN', 'FullName': 'Organism', 'Description': 'Organism', 'TermCount': '681473', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'Y', 'Hierarchy': 'Y', 'IsHidden': 'N'}, {'Name': 'PID', 'FullName':

In [8]:
# Show each DbInfo entry on its own "key : value" line, which is easier to read than the raw dict
db_info = record['DbInfo']
for field, value in db_info.items():
    print(field, ':', value)

DbName : genome
MenuName : Genome
Description : Genomic sequences, contigs, and maps
DbBuild : Build220817-0555.1
Count : 91476
LastUpdate : 2022/08/17 07:08
FieldList : [{'Name': 'ALL', 'FullName': 'All Fields', 'Description': 'All terms from all searchable fields', 'TermCount': '16238721', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'UID', 'FullName': 'UID', 'Description': 'Unique number assigned to genome', 'TermCount': '0', 'IsDate': 'N', 'IsNumerical': 'Y', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'Y'}, {'Name': 'FILT', 'FullName': 'Filter', 'Description': 'Limits the records', 'TermCount': '17', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'ORGN', 'FullName': 'Organism', 'Description': 'Organism', 'TermCount': '681473', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'Y', 'Hierarchy': 'Y', 'IsHidden': 'N'}, {'Name': 'PID', 'FullName': 'ProjectID', 'Description

#### Examples of the espell function from Entrez

In [9]:
# espell returns spelling suggestions for a query term. 'biobython' is a deliberate
# misspelling of 'biopython', so we expect the corrected spelling to come back.
# db='pmc' selects PubMed Central as the database the suggestion is made for.
# Note that espell does not search for papers; it only proposes a spelling.
handle = Entrez.espell(db='pmc', term='biobython')
try:
    record = Entrez.read(handle)
finally:
    handle.close()  # release the HTTP connection even if parsing fails

print(type(record))
print(record.keys())
for key, value in record.items():
    print(key, ':', value)

<class 'Bio.Entrez.Parser.DictionaryElement'>
dict_keys(['Database', 'Query', 'CorrectedQuery', 'SpelledQuery'])
Database : pmc
Query : biobython
CorrectedQuery : biopython
SpelledQuery : ['', 'biopython']


In [10]:
# espell gives us spelling suggestions: for every scientific name below, print the
# spelling as typed next to the spelling proposed for the taxonomy database.
sciNames = [
    'Bos gaurus', 'Antelope cervicapra', 'Gazella bennettii', 'Boselaphus tragocamelus',
    'Canis lupus', 'Panthera leo', 'Elephas maximus', 'Equus africanus',
    'Panthera pardus', 'Cervus canadensis', 'Pavo cristatus', 'Grus leucogeranus',
    'Vulpes vulpes', 'Rhinoceros unicornis', 'Panthera Tigris', 'Crocodylus palustris',
    'Gavialis gangeticus', 'Equus caballus', 'Equus quagga', 'Babalus bubalis',
    'Sus scrofa', 'Camelus dromedaries', 'Giraffa camelopardalis ', 'Hemidactylus flaviviridis',
    'Hippopotamus amphibius', 'Macaca mulatta', 'Canis lupus', 'Felis domesticus',
    'Acinonyx jubatus', 'Rattus rattus', 'Mus musculus', 'Oryctolagus cuniculus',
    'Bubo virginianus', 'Passer domesticus', 'Corvus splendens', 'Acridotheres tristis',
    'Psittacula eupatria', 'Molpastes cafer', 'Eudynamis scolopaccus', 'Columba livia',
    'Naja naja', 'Ophiophagus hannah', 'Hydrophiinae ', 'Python molurus', 'Ptyas mucosa',
]
for SNIND, SN in enumerate(sciNames):
    # One espell request per name; close each handle so connections are not leaked
    handle = Entrez.espell(db='taxonomy', term=SN)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()
    # capitalize() upper-cases the first letter and lower-cases the rest ("Genus species")
    corrected = record['CorrectedQuery'].capitalize()
    print('Index:', SNIND)
    print('Original  Sci. Name:', record['Query'])
    print('Corrected Sci. Name:', corrected)
    print('-----------------')

Index: 0
Original  Sci. Name: Bos gaurus
Corrected Sci. Name: Bos taurus
-----------------
Index: 1
Original  Sci. Name: Antelope cervicapra
Corrected Sci. Name: Antilope cervicapra
-----------------
Index: 2
Original  Sci. Name: Gazella bennettii
Corrected Sci. Name: Gazella bennettii
-----------------
Index: 3
Original  Sci. Name: Boselaphus tragocamelus
Corrected Sci. Name: Boselaphus tragocamelus
-----------------
Index: 4
Original  Sci. Name: Canis lupus
Corrected Sci. Name: Canis lupus
-----------------
Index: 5
Original  Sci. Name: Panthera leo
Corrected Sci. Name: Panthera leo
-----------------
Index: 6
Original  Sci. Name: Elephas maximus
Corrected Sci. Name: Elephas maximus
-----------------
Index: 7
Original  Sci. Name: Equus africanus
Corrected Sci. Name: Equus africanus
-----------------
Index: 8
Original  Sci. Name: Panthera pardus
Corrected Sci. Name: Panthera pardus
-----------------
Index: 9
Original  Sci. Name: Cervus canadensis
Corrected Sci. Name: Cervus canadensis


In [11]:
# Report only the names in sciNames for which espell proposes a different spelling
sciNames = [
    'Bos gaurus', 'Antelope cervicapra', 'Gazella bennettii', 'Boselaphus tragocamelus',
    'Canis lupus', 'Panthera leo', 'Elephas maximus', 'Equus africanus',
    'Panthera pardus', 'Cervus canadensis', 'Pavo cristatus', 'Grus leucogeranus',
    'Vulpes vulpes', 'Rhinoceros unicornis', 'Panthera Tigris', 'Crocodylus palustris',
    'Gavialis gangeticus', 'Equus caballus', 'Equus quagga', 'Babalus bubalis',
    'Sus scrofa', 'Camelus dromedaries', 'Giraffa camelopardalis ', 'Hemidactylus flaviviridis',
    'Hippopotamus amphibius', 'Macaca mulatta', 'Canis lupus', 'Felis domesticus',
    'Acinonyx jubatus', 'Rattus rattus', 'Mus musculus', 'Oryctolagus cuniculus',
    'Bubo virginianus', 'Passer domesticus', 'Corvus splendens', 'Acridotheres tristis',
    'Psittacula eupatria', 'Molpastes cafer', 'Eudynamis scolopaccus', 'Columba livia',
    'Naja naja', 'Ophiophagus hannah', 'Hydrophiinae ', 'Python molurus', 'Ptyas mucosa',
]
for SNIND, SN in enumerate(sciNames):
    # One espell request per name; close each handle so connections are not leaked
    handle = Entrez.espell(db='taxonomy', term=SN)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()
    # capitalize() upper-cases the first letter and lower-cases the rest, so a
    # casing-only difference such as 'Panthera Tigris' is reported as well.
    corrected = record['CorrectedQuery'].capitalize()
    # espell may return an empty CorrectedQuery when it has nothing to suggest;
    # that is "no correction", not a misspelling, so it must not be reported.
    if corrected and record['Query'] != corrected:
        print('Index:', SNIND)
        print('Original  Sci. Name:', record['Query'])
        print('Corrected Sci. Name:', corrected)
        print('-----------------')

Index: 0
Original  Sci. Name: Bos gaurus
Corrected Sci. Name: Bos taurus
-----------------
Index: 1
Original  Sci. Name: Antelope cervicapra
Corrected Sci. Name: Antilope cervicapra
-----------------
Index: 14
Original  Sci. Name: Panthera Tigris
Corrected Sci. Name: Panthera tigris
-----------------
Index: 19
Original  Sci. Name: Babalus bubalis
Corrected Sci. Name: Bubalus bubalis
-----------------
Index: 21
Original  Sci. Name: Camelus dromedaries
Corrected Sci. Name: Camelus dromedarius
-----------------
Index: 37
Original  Sci. Name: Molpastes cafer
Corrected Sci. Name: Molasses caffer
-----------------
Index: 38
Original  Sci. Name: Eudynamis scolopaccus
Corrected Sci. Name: Eudynamys scolopaceus
-----------------


#### esearch function from Entrez

In [13]:
## Search PMC with several boolean queries and print the titles of the first hits.
# AND / OR combine clauses, parentheses group them, and a [Title] tag restricts
# the preceding words to the title field.
terms = ['homo sapiens AND Lung Cancer AND Marker',
         'Human AND homo sapiens AND Lung Cancer AND Marker',
         '(Human[Title] OR homo sapiens[Title]) AND Lung Cancer AND Marker',
         '(Human[Title] OR homo sapiens[Title]) OR (Lung Cancer[Title] AND Marker[Title])',
         # closing parenthesis added: the original query left the group unbalanced
         '(Human[Title] OR homo sapiens[Title] OR Lung Cancer[Title] OR Marker[Title])',
         '(Human[Title] OR homo sapiens[Title]) AND (Lung Cancer[Title] AND Marker[Title])']
for termIndex, term in enumerate(terms):
    # retmax=5 limits the number of IDs returned; 'Count' still reports all matches
    handle = Entrez.esearch(db='pmc',
                            term=term,
                            retmax=5)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()
    print(termIndex+1, ":", term)
    print('Count:', record['Count'])
    IdList = record['IdList']
    if IdList:
        # A single esummary request for all hits instead of one request per ID
        handle = Entrez.esummary(db='pmc', id=','.join(IdList))
        try:
            summary = Entrez.read(handle)
        finally:
            handle.close()
        for doc in summary:
            print(doc['Title'])
    print('-------------------------')

1 : homo sapiens AND Lung Cancer AND Marker
Count: 298817
Factor XII in Inflammation and Wound Healing
Supramolecular organizing centers at the interface of inflammation and neurodegeneration
Vitamin B6 Metabolic Pathway is Involved in the Pathogenesis of Liver Diseases via Multi-Omics Analysis
Toxicity and toxicokinetics of the ethanol extract of Zuojin formula
A TLR4-independent critical role for CD14 in intracellular LPS sensing
-------------------------
2 : Human AND homo sapiens AND Lung Cancer AND Marker
Count: 298794
Factor XII in Inflammation and Wound Healing
Supramolecular organizing centers at the interface of inflammation and neurodegeneration
Vitamin B6 Metabolic Pathway is Involved in the Pathogenesis of Liver Diseases via Multi-Omics Analysis
Toxicity and toxicokinetics of the ethanol extract of Zuojin formula
A TLR4-independent critical role for CD14 in intracellular LPS sensing
-------------------------
3 : (Human[Title] OR homo sapiens[Title]) AND Lung Cancer AND Mark

#### Getting mRNA data from NCBI in FASTA and GenBank formats

In [15]:
# Search the nucleotide database for HBB RefSeq records and download the mRNA
# entries in FASTA format. HBB is the hemoglobin subunit beta gene.
handle = Entrez.esearch(db='nucleotide',
                        term='HBB[Gene Name] AND RefSeq[key]',
                        retmax=2000,
                        idtype='acc')  # return accession.version strings instead of numeric UIDs
try:
    record = Entrez.read(handle)
finally:
    handle.close()

fetchList=[]
for ID in record['IdList']:
    # Keep only accessions that start with NM_ (mRNA RefSeq records); NC_ is genomic data
    if ID.startswith('NM_'):
        fetch = Entrez.efetch(db='nucleotide',
                              id=ID,
                              rettype='fasta',
                              retmode='text')
        try:
            fetchList.append(fetch.read())
        finally:
            fetch.close()  # one connection per record, so close each of them
counter = len(fetchList)  # number of records downloaded

# Open the file once, in write mode, so that re-running this cell replaces the
# file instead of appending a second copy of every record to it.
with open('HBB.fasta', 'w') as savedFile:
    savedFile.writelines(fetchList)

In [17]:
## The same records, downloaded in GenBank flat-file format (rettype='gb').
## A GenBank record carries more information about a sequence than its FASTA entry.
handle = Entrez.esearch(db='nucleotide',
                        term='HBB[Gene Name] AND RefSeq[key]',  # HBB is the hemoglobin subunit beta gene
                        retmax=2000,
                        idtype='acc')  # return accession.version strings instead of numeric UIDs
try:
    record = Entrez.read(handle)
finally:
    handle.close()

fetchList=[]
for ID in record['IdList']:
    # Keep only accessions that start with NM_ (mRNA RefSeq records); NC_ is genomic data
    if ID.startswith('NM_'):
        fetch = Entrez.efetch(db='nucleotide',
                              id=ID,
                              rettype='gb',    ########## changed here from fasta to gb
                              retmode='text')
        try:
            fetchList.append(fetch.read())
        finally:
            fetch.close()  # one connection per record, so close each of them
counter = len(fetchList)  # number of records downloaded

# Open the file once, in write mode, so that re-running this cell replaces the
# file instead of appending a second copy of every record to it.
with open('HBB.gb', 'w') as savedFile:         ########## changed here from .fasta to .gb
    savedFile.writelines(fetchList)