# EInfo: Obtaining information about the Entrez databases

In [1]:
from Bio import Entrez

Entrez.email = "arezou.pst@gmail.com"   # Always tell NCBI who you are

# Calling EInfo without arguments; the response lists the Entrez database names.
handle = Entrez.einfo()
try:
    # Keep the unparsed response so it can be written to disk as-is.
    result = handle.read()
finally:
    # Release the handle even if reading the response fails.
    handle.close()

In [2]:
# `result` is the unparsed value returned by handle.read(); it is a bytes
# object here, so print() shows its bytes repr (b'...') with escaped newlines.
print(result)

b'<?xml version="1.0" encoding="UTF-8" ?>\n<!DOCTYPE eInfoResult PUBLIC "-//NLM//DTD einfo 20190110//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20190110/einfo.dtd">\n<eInfoResult>\n<DbList>\n\n\t<DbName>pubmed</DbName>\n\t<DbName>protein</DbName>\n\t<DbName>nuccore</DbName>\n\t<DbName>ipg</DbName>\n\t<DbName>nucleotide</DbName>\n\t<DbName>structure</DbName>\n\t<DbName>genome</DbName>\n\t<DbName>annotinfo</DbName>\n\t<DbName>assembly</DbName>\n\t<DbName>bioproject</DbName>\n\t<DbName>biosample</DbName>\n\t<DbName>blastdbinfo</DbName>\n\t<DbName>books</DbName>\n\t<DbName>cdd</DbName>\n\t<DbName>clinvar</DbName>\n\t<DbName>gap</DbName>\n\t<DbName>gapplus</DbName>\n\t<DbName>grasp</DbName>\n\t<DbName>dbvar</DbName>\n\t<DbName>gene</DbName>\n\t<DbName>gds</DbName>\n\t<DbName>geoprofiles</DbName>\n\t<DbName>homologene</DbName>\n\t<DbName>medgen</DbName>\n\t<DbName>mesh</DbName>\n\t<DbName>ncbisearch</DbName>\n\t<DbName>nlmcatalog</DbName>\n\t<DbName>omim</DbName>\n\t<DbName>orgtrack</Db

In [3]:
# `result` holds bytes, hence binary mode. The `with` block guarantees the
# file is closed even if the write raises.
with open("info_db.xml", "wb") as handle_out:
    handle_out.write(result)

In [5]:
from Bio import Entrez

Entrez.email = "arezou.pst@gmail.com"   # Always tell NCBI who you are

# Same EInfo request as before, but parsed into Python objects instead of
# being kept as raw XML.
handle = Entrez.einfo()
try:
    record_info = Entrez.read(handle)
finally:
    # Release the handle whether or not parsing succeeded.
    handle.close()

In [6]:
# Show the parsed EInfo record; its 'DbList' entry holds the database names.
print (record_info)

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'biosystems', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']}


In [7]:
from Bio import Entrez

Entrez.email = "arezou.pst@gmail.com"   # Always tell NCBI who you are

# Passing `db` restricts EInfo to a single database (PubMed here).
handle = Entrez.einfo(db = "pubmed")
try:
    record_pubmed = Entrez.read(handle)
finally:
    # Release the handle whether or not parsing succeeded.
    handle.close()

In [8]:
# Dump the parsed PubMed record: a 'DbInfo' mapping that includes a 'FieldList'.
print (record_pubmed)

{'DbInfo': {'DbName': 'pubmed', 'MenuName': 'PubMed', 'Description': 'PubMed bibliographic record', 'DbBuild': 'Build210313-2212m.3', 'Count': '32276434', 'LastUpdate': '2021/03/15 15:17', 'FieldList': [{'Name': 'ALL', 'FullName': 'All Fields', 'Description': 'All terms from all searchable fields', 'TermCount': '256394196', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'UID', 'FullName': 'UID', 'Description': 'Unique number assigned to publication', 'TermCount': '0', 'IsDate': 'N', 'IsNumerical': 'Y', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'Y'}, {'Name': 'FILT', 'FullName': 'Filter', 'Description': 'Limits the records', 'TermCount': '5568', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'TITL', 'FullName': 'Title', 'Description': 'Words in title of publication', 'TermCount': '19149983', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden':

In [9]:
import json

# Serialise the parsed record to indented JSON text so its nesting is
# easier to read than the plain print() of the record.
pubmed_as_json = json.dumps(record_pubmed, indent=4)
print (pubmed_as_json)

{
    "DbInfo": {
        "DbName": "pubmed",
        "MenuName": "PubMed",
        "Description": "PubMed bibliographic record",
        "DbBuild": "Build210313-2212m.3",
        "Count": "32276434",
        "LastUpdate": "2021/03/15 15:17",
        "FieldList": [
            {
                "Name": "ALL",
                "FullName": "All Fields",
                "Description": "All terms from all searchable fields",
                "TermCount": "256394196",
                "IsDate": "N",
                "IsNumerical": "N",
                "SingleToken": "N",
                "Hierarchy": "N",
                "IsHidden": "N"
            },
            {
                "Name": "UID",
                "FullName": "UID",
                "Description": "Unique number assigned to publication",
                "TermCount": "0",
                "IsDate": "N",
                "IsNumerical": "Y",
                "SingleToken": "Y",
                "Hierarchy": "N",
                "IsHidden":

In [10]:
# Print a few summary values from the 'DbInfo' section, one per line.
db_info = record_pubmed["DbInfo"]
for key in ("Description", "Count", "LastUpdate"):
    print (db_info[key])

PubMed bibliographic record
32276434
2021/03/15 15:17


In [11]:
# One output line per entry of the field list: short name, full name, description.
for field in record_pubmed["DbInfo"]["FieldList"]:
    print (", ".join([field["Name"], field["FullName"], field["Description"]]))

ALL, All Fields, All terms from all searchable fields
UID, UID, Unique number assigned to publication
FILT, Filter, Limits the records
TITL, Title, Words in title of publication
WORD, Text Word, Free text associated with publication
MESH, MeSH Terms, Medical Subject Headings assigned to publication
MAJR, MeSH Major Topic, MeSH terms of major importance to publication
AUTH, Author, Author(s) of publication
JOUR, Journal, Journal abbreviation of publication
AFFL, Affiliation, Author's institutional affiliation and address
ECNO, EC/RN Number, EC number for enzyme or CAS registry number
SUBS, Supplementary Concept, CAS chemical name or MEDLINE Substance Name
PDAT, Date - Publication, Date of publication
EDAT, Date - Entrez, Date publication first accessible through Entrez
VOL, Volume, Volume number of publication
PAGE, Pagination, Page number(s) of publication
PTYP, Publication Type, Type of publication (e.g., review)
LANG, Language, Language of publication
ISS, Issue, Issue number of publ

# Exercise 1

Find all searchable fields of the GenBank database.

# ESearch: Searching the Entrez databases

In [12]:
from Bio import Entrez

# NOTE(review): the address is an empty string here; set a real one before
# running so NCBI can identify the caller.
Entrez.email = ""   # Always tell NCBI who you are

# Search the nucleotide database; idtype="acc" asks for accession.version
# identifiers in the returned IdList.
handle = Entrez.esearch(db = "nucleotide", term = "drosophila melanogaster[orgn] AND ubx[gene]", idtype = "acc")
try:
    record_ubx = Entrez.read(handle)
finally:
    # Release the handle whether or not parsing succeeded.
    handle.close()

In [13]:
# Pretty-print the ESearch result as indented JSON.
# Relies on `json` having been imported by an earlier cell.
print (json.dumps(record_ubx, indent = 4))

{
    "Count": "106",
    "RetMax": "20",
    "RetStart": "0",
    "IdList": [
        "NT_033777.3",
        "NM_206497.3",
        "NM_169729.3",
        "NM_169728.3",
        "NM_169730.3",
        "NM_080500.4",
        "NM_080504.4",
        "AE014297.3",
        "FJ631257.1",
        "U31961.1",
        "BT010241.1",
        "FJ636139.1",
        "FJ897682.1",
        "FJ897681.1",
        "FJ897680.1",
        "FJ897679.1",
        "FJ897678.1",
        "FJ897677.1",
        "FJ897676.1",
        "FJ897675.1"
    ],
    "TranslationSet": [
        {
            "From": "drosophila melanogaster[orgn]",
            "To": "\"Drosophila melanogaster\"[Organism]"
        }
    ],
    "TranslationStack": [
        {
            "Term": "\"Drosophila melanogaster\"[Organism]",
            "Field": "Organism",
            "Count": "1337868",
            "Explode": "Y"
        },
        {
            "Term": "ubx[gene]",
            "Field": "gene",
            "Count": "1195",
       

In [14]:
# 'Count' as reported by ESearch; note it is larger than the number of IDs
# actually returned in 'IdList' (see 'RetMax' in the result).
print (record_ubx["Count"])

106


In [15]:
# Identifiers returned by the search; they are accessions because the
# request was made with idtype="acc".
print (record_ubx["IdList"])

['NT_033777.3', 'NM_206497.3', 'NM_169729.3', 'NM_169728.3', 'NM_169730.3', 'NM_080500.4', 'NM_080504.4', 'AE014297.3', 'FJ631257.1', 'U31961.1', 'BT010241.1', 'FJ636139.1', 'FJ897682.1', 'FJ897681.1', 'FJ897680.1', 'FJ897679.1', 'FJ897678.1', 'FJ897677.1', 'FJ897676.1', 'FJ897675.1']


# Exercise 2

Find the accession numbers for the list of UIDs in the 'at_id.txt' file.

# EFetch: Downloading full records from Entrez

Requesting a specific file format from Entrez using `Bio.Entrez.efetch()` requires specifying the optional `rettype` and/or `retmode` arguments. The valid combinations for each database type are described on the pages linked from the [NCBI EFetch web page](https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch).

In [16]:
from Bio import Entrez

# NOTE(review): the address is an empty string here; set a real one before
# running so NCBI can identify the caller.
Entrez.email = ""   # Always tell NCBI who you are

# Download the record in GenBank flat-file format as plain text.
handle = Entrez.efetch(db = "nucleotide", id = "NM_206497.3", rettype = "gb", retmode = "text")
try:
    data = handle.read()
finally:
    # Release the handle even if the download fails part-way.
    handle.close()

# The `with` block closes the output file even if the write raises.
with open("ubx.gbk", "w") as out_handle:
    out_handle.write(data)

In [17]:
from Bio import Entrez

# NOTE(review): the address is an empty string here; set a real one before
# running so NCBI can identify the caller.
Entrez.email = ""   # Always tell NCBI who you are

# Download the same record again, this time in FASTA format as plain text.
handle = Entrez.efetch(db = "nucleotide", id = "NM_206497.3", rettype = "fasta", retmode = "text")
try:
    data = handle.read()
finally:
    # Release the handle even if the download fails part-way.
    handle.close()

# The `with` block closes the output file even if the write raises.
with open("ubx.fasta", "w") as out_handle:
    out_handle.write(data)

# ELink: Searching for related items in NCBI Entrez

For help on ELink, see the [ELink help page](https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink). There is an entire sub-page just for the link names, describing how the different databases can be cross-referenced.

In [18]:
from Bio import Entrez

# NOTE(review): the address is an empty string here; set a real one before
# running so NCBI can identify the caller.
Entrez.email = ""   # Always tell NCBI who you are

# Look up protein records linked to the given nucleotide accessions.
# The ids are deliberately kept as one comma-separated string, exactly as in
# the original request, so all three are submitted together.
handle = Entrez.elink (dbfrom = "nucleotide", db = "protein", id = "NM_206497.3, NM_169729.3, NM_169728.3", idtype="acc")
try:
    record_pId = Entrez.read(handle)
finally:
    # Release the handle whether or not parsing succeeded.
    handle.close()

In [19]:
# Pretty-print the ELink result as indented JSON.
# Relies on `json` having been imported by an earlier cell.
print (json.dumps(record_pId, indent = 4))

[
    {
        "LinkSetDbHistory": [],
        "ERROR": [],
        "LinkSetDb": [
            {
                "Link": [
                    {
                        "Id": "NP_996219.1"
                    },
                    {
                        "Id": "NP_732172.1"
                    },
                    {
                        "Id": "NP_732171.1"
                    }
                ],
                "DbTo": "protein",
                "LinkName": "nuccore_protein"
            }
        ],
        "DbFrom": "nuccore",
        "IdList": [
            "NM_206497.3",
            "NM_169729.3",
            "NM_169728.3"
        ]
    }
]


In [20]:
# Gather the 'Id' of every link in the first link set of the first result.
links = record_pId[0]["LinkSetDb"][0]["Link"]
pId = [link["Id"] for link in links]

print (pId)

['NP_996219.1', 'NP_732172.1', 'NP_732171.1']


# Exercise 3

First, find the accession numbers in the protein database that are linked to the gene UIDs in the 'atAcc' list; then fetch a FASTA file containing the corresponding protein sequences.