In [11]:
import google.generativeai as genai
import os
import pandas as pd
from IPython.display import display, Markdown, Latex
from pathlib import Path
from tqdm import tqdm

In [12]:
# --- Filesystem layout -------------------------------------------------------
DATA_DIR = "data/eutils_raw/"                   # raw API doc text, one file per utility
OUT_PATH = 'data/eutils_cleaned'                # output directory (created below)
BASE_PROMPT_PATH = "data/prompts/apidoc2md.md"  # prompt template
CONTEXT_PATH = "data/prompts/chap_4_tab_1.md"   # extra context text

# --- Processing order --------------------------------------------------------
# Two-column lookup frame: `api_name` and its 0-based position `format_order`.
DOC_ORDER = pd.DataFrame(
    [
        (api_name, position)
        for position, api_name in enumerate(
            ("einfo", "esearch", "epost",
             "esummary", "efetch", "elink",
             "egquery", "espell", "ecitmatch")
        )
    ],
    columns=["api_name", "format_order"],
)

# Make sure the output directory exists; a no-op when it is already there.
Path(OUT_PATH).mkdir(parents=True, exist_ok=True)


In [13]:
# The API key is read from the environment rather than hard-coded.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Generation settings handed to GenerativeModel; reference:
# https://ai.google.dev/api/python/google/generativeai/GenerativeModel
#
# response_mime_type stays "text/plain": with "application/json" the API was
# observed to return "RECITATION" -- possibly a repetition safeguard (unconfirmed).
# Background reading:
# https://dropbox.tech/machine-learning/bye-bye-bye-evolution-of-repeated-token-attacks-on-chatgpt-models
generation_config = dict(
    temperature=0,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

In [14]:
# System prompt for the model built in this cell.
system_instruction = "You are a helpful, thorough assistant. You analyze raw, unformatted text and output clean, [structured markdown](https://www.markdownguide.org/)."

# Gemini client configured with the shared generation settings and the
# system prompt above.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-latest",
    system_instruction=system_instruction,
    generation_config=generation_config,
)

In [15]:
# Load every raw doc in DATA_DIR as an [api_name, raw_text] record, where
# api_name is the file name without its extension.
#  - Directories and hidden entries are skipped so that stray items in the
#    folder cannot break the read.
#  - The encoding is pinned to UTF-8 so the text is decoded the same way on
#    every platform instead of depending on the locale default.
raw_docs = [
    [doc_path.stem, doc_path.read_text(encoding="utf-8")]
    for doc_path in Path(DATA_DIR).iterdir()
    if doc_path.is_file() and not doc_path.name.startswith(".")
]

# Inner-join against DOC_ORDER: files whose name is not listed there are
# dropped, and the remaining rows are sorted into the configured order
# (iterdir() itself yields entries in arbitrary order).
api_docs = (
    pd.DataFrame(raw_docs, columns=["api_name", "raw_text"])
    .merge(DOC_ORDER, on="api_name", how="inner")
    .sort_values(["format_order"])
)

In [16]:
# Prompt template; the generation loop substitutes its {CONTEXT_RELEVANT} and
# {CONTEXT} placeholders. Read as UTF-8 explicitly so decoding does not
# depend on the platform's default encoding.
base_prompt = Path(BASE_PROMPT_PATH).read_text(encoding="utf-8")

# Extra context text that is inserted into the prompt for the efetch doc only.
context = Path(CONTEXT_PATH).read_text(encoding="utf-8")

In [17]:
# NOTE(review): this rebinds `model` and, unlike the GenerativeModel built in
# the earlier cell, passes no system_instruction -- confirm that dropping the
# system prompt is intentional.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-latest",
    generation_config=generation_config,
)

# One entry per row of api_docs, in row order. None marks a document for
# which no text could be read from the response, so the list always has the
# same length as the frame and can be attached to it as a column.
responses = []
for _, row in tqdm(api_docs.iterrows(), total=api_docs.shape[0]):
    # Tack on the extra context only for the efetch document, which tells
    # readers to see tables. Skip it otherwise to avoid unnecessary
    # inference costs.
    needs_context = row["api_name"] == "efetch"
    prompt = base_prompt.replace("{CONTEXT_RELEVANT}", str(needs_context))
    prompt = prompt.replace("{CONTEXT}", context if needs_context else "")

    response = model.generate_content([prompt,
                                       f"filename: {row['api_name']}.txt",
                                       row["raw_text"]])
    try:
        responses.append(response.text)
    except ValueError:
        # Accessing response.text failed: record a placeholder to keep
        # `responses` aligned with api_docs, then dump whatever diagnostics
        # the response object exposes.
        responses.append(None)
        print(f"No text returned for {row['api_name']}.")
        try:
            print(response.candidates)
        except Exception as e:
            print("Gemini is throwing a tantrum...")
            print(f"Exception {e} returned from API.")


100%|██████████| 9/9 [01:54<00:00, 12.78s/it]


In [18]:
# Attach the generated markdown to the frame as a new `text_md` column.
# `responses` is a plain list, so it is matched to rows by position and must
# hold exactly one entry per row of api_docs; pandas raises ValueError when
# the lengths differ.
api_docs["text_md"] = responses

In [19]:
# Render each generated document as formatted markdown, in frame order.
rendered_docs = (Markdown(md_text) for md_text in api_docs["text_md"])
for rendered in rendered_docs:
    display(rendered)

## EInfo API Documentation

### Base URL

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi
```

### Functions

The EInfo API provides the following functionalities:

* **List of all valid Entrez databases:** Returns a list of all valid Entrez database names when no `db` parameter is provided.
* **Database statistics:** Provides statistics for a single database, including lists of indexing fields and available link names.

### Required Parameters

None. If no `db` parameter is provided, EInfo will return a list of the names of all valid Entrez databases.

### Optional Parameters

| Parameter | Description | Values |
|---|---|---|
| db | Target database about which to gather statistics. Value must be a valid Entrez database name. | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| version | Used to specify version 2.0 EInfo XML. The only supported value is '2.0'. When present, EInfo will return XML that includes two new fields: `<IsTruncatable>` and `<IsRangeable>`. Fields that are truncatable allow the wildcard character '*' in terms. The wildcard character will expand to match any set of characters up to a limit of 600 unique expansions. Fields that are rangeable allow the range operator ':' to be placed between a lower and upper limit for the desired range (e.g. 2008:2010[pdat]). | '2.0' |
| retmode | Retrieval type. Determines the format of the returned output. The default value is 'xml' for EInfo XML, but 'json' is also supported to return output in JSON format. | 'xml', 'json' |

### Examples

**Return a list of all Entrez database names:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi
```

**Return version 2.0 statistics for Entrez Protein:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=protein&version=2.0
``` 


## ESearch API Documentation

The ESearch API provides a way to search Entrez databases and retrieve a list of UIDs (Unique Identifiers) matching a text query. It also allows you to post search results to the History server, download UIDs from a dataset stored on the History server, combine or limit UID datasets, and sort sets of UIDs.

**Base URL:** https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi

**Note:** Some NCBI products contain search tools that generate content from searches on the web interface that are not available to ESearch. For example, the PubMed web interface (pubmed.ncbi.nlm.nih.gov) contains citation matching and spelling correction tools that are only available through that interface. Please see ECitMatch and ESpell for API equivalents.

### Required Parameters

| Parameter | Description | Values |
|---|---|---|
| db | Database to search. Value must be a valid Entrez database name. | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| term | Entrez text query. All special characters must be URL encoded. Spaces may be replaced by '+' signs. For very long queries (more than several hundred characters long), consider using an HTTP POST call. See the PubMed or Entrez help for information about search field descriptions and tags. Search fields and tags are database specific. | String |

**Example:**

```
esearch.fcgi?db=pubmed&term=asthma
```

**Proximity Searching in PubMed:**

PubMed also offers “proximity searching” for multiple terms appearing in any order within a specified number of words from one another in the [Title] or [Title/Abstract] fields.

**Example:**

```
esearch.fcgi?db=pubmed&term="asthma treatment"[Title:~3]
```

### Optional Parameters - History Server

| Parameter | Description | Values |
|---|---|---|
| usehistory | When set to 'y', ESearch will post the UIDs resulting from the search operation onto the History server so that they can be used directly in a subsequent E-utility call. Also, usehistory must be set to 'y' for ESearch to interpret query key values included in term or to accept a WebEnv as input. | 'y' |
| WebEnv | Web environment string returned from a previous ESearch, EPost or ELink call. When provided, ESearch will post the results of the search operation to this pre-existing WebEnv, thereby appending the results to the existing environment. In addition, providing WebEnv allows query keys to be used in term so that previous search sets can be combined or limited. As described above, if WebEnv is used, usehistory must be set to 'y'. | String |
| query_key | Integer query key returned by a previous ESearch, EPost or ELink call. When provided, ESearch will find the intersection of the set specified by query_key and the set retrieved by the query in term (i.e. joins the two with AND). For query_key to function, WebEnv must be assigned an existing WebEnv string and usehistory must be set to 'y'. | Integer |

**Example:**

```
esearch.fcgi?db=pubmed&term=asthma&WebEnv=<webenv string>&usehistory=y
```

**Query Keys in term:**

Values for query keys may also be provided in term if they are preceded by a '#' (%23 in the URL). While only one query_key parameter can be provided to ESearch, any number of query keys can be combined in term. Also, if query keys are provided in term, they can be combined with OR or NOT in addition to AND.

**Example:**

The following two URLs are functionally equivalent:

```
esearch.fcgi?db=pubmed&term=asthma&query_key=1&WebEnv=<webenv string>&usehistory=y

esearch.fcgi?db=pubmed&term=%231+AND+asthma&WebEnv=<webenv string>&usehistory=y
```

### Optional Parameters - Retrieval

| Parameter | Description | Values |
|---|---|---|
| retstart | Sequential index of the first UID in the retrieved set to be shown in the XML output (default=0, corresponding to the first record of the entire set). This parameter can be used in conjunction with retmax to download an arbitrary subset of UIDs retrieved from a search. | Integer |
| retmax | Total number of UIDs from the retrieved set to be shown in the XML output (default=20). By default, ESearch only includes the first 20 UIDs retrieved in the XML output. If usehistory is set to 'y', the remainder of the retrieved set will be stored on the History server; otherwise these UIDs are lost. Increasing retmax allows more of the retrieved UIDs to be included in the XML output, up to a maximum of 10,000 records. | Integer |
| rettype | Retrieval type. There are two allowed values for ESearch: 'uilist' (default), which displays the standard XML output, and 'count', which displays only the `<Count>` tag. | 'uilist', 'count' |
| retmode | Retrieval type. Determines the format of the returned output. The default value is ‘xml’ for ESearch XML, but ‘json’ is also supported to return output in JSON format. | 'xml', 'json' |
| sort | Specifies the method used to sort UIDs in the ESearch output. The available values vary by database (db) and may be found in the Display Settings menu on an Entrez search results page. If usehistory is set to ‘y’, the UIDs are loaded onto the History Server in the specified sort order and will be retrieved in that order by ESummary or EFetch. Example values are ‘relevance’ and ‘name’ for Gene. Users should be aware that the default value of sort varies from one database to another, and that the default value used by ESearch for a given database may differ from that used on NCBI web search pages. | Database specific |
| field | Search field. If used, the entire search term will be limited to the specified Entrez field. | String |
| idtype | Specifies the type of identifier to return for sequence databases (nuccore, popset, protein). By default, ESearch returns GI numbers in its output. If idtype is set to ‘acc’, ESearch will return accession.version identifiers rather than GI numbers. | 'acc', 'gi' |

**Example:**

```
esearch.fcgi?db=pubmed&term=asthma&field=title
```

**Sort Values for PubMed:**

* pub_date – descending sort by publication date
* Author – ascending sort by first author
* JournalName – ascending sort by journal name
* relevance – default sort order, (“Best Match”) on web PubMed

### Optional Parameters - Dates

| Parameter | Description | Values |
|---|---|---|
| datetype | Type of date used to limit a search. The allowed values vary between Entrez databases, but common values are 'mdat' (modification date), 'pdat' (publication date) and 'edat' (Entrez date). Generally an Entrez database will have only two allowed values for datetype. | Database specific |
| reldate | When reldate is set to an integer n, the search returns only those items that have a date specified by datetype within the last n days. | Integer |
| mindate | Date range used to limit a search result by the date specified by datetype. These two parameters (mindate, maxdate) must be used together to specify an arbitrary date range. The general date format is YYYY/MM/DD, and these variants are also allowed: YYYY, YYYY/MM. | YYYY/MM/DD, YYYY, YYYY/MM |
| maxdate | Date range used to limit a search result by the date specified by datetype. These two parameters (mindate, maxdate) must be used together to specify an arbitrary date range. The general date format is YYYY/MM/DD, and these variants are also allowed: YYYY, YYYY/MM. | YYYY/MM/DD, YYYY, YYYY/MM |

**Example:**

Search in PubMed with the term cancer for abstracts that have an Entrez date within the last 60 days; retrieve the first 100 PMIDs and translations; post the results on the History server and return a WebEnv and query_key:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer&reldate=60&datetype=edat&retmax=100&usehistory=y
```

### Examples

* Search in PubMed for the journal PNAS, Volume 97, and retrieve six PMIDs starting with the seventh PMID in the list:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=PNAS[ta]+AND+97[vi]&retstart=6&retmax=6&tool=biomed3
```

* Search in the NLM Catalog for journals matching the term obstetrics:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nlmcatalog&term=obstetrics+AND+ncbijournals[filter]
```

* Search PubMed Central for free full text articles containing the query stem cells:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=stem+cells+AND+free+fulltext[filter]
```

* Search in Nucleotide for all tRNAs:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=biomol+trna[prop]
```

* Search in Protein for a molecular weight range:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=protein&term=70000:90000[molecular+weight]
```


## EPost API Documentation

The EPost API allows you to upload a list of UIDs to the Entrez History server or append a list of UIDs to an existing set of UID lists attached to a Web Environment.

### Base URL

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi
```

### Functions

* **Uploads a list of UIDs to the Entrez History server:** This function allows you to store a list of UIDs for later retrieval.
* **Appends a list of UIDs to an existing set of UID lists attached to a Web Environment:** This function allows you to add a list of UIDs to an existing Web Environment, which can be used for further processing or analysis.

### Required Parameters

| Parameter | Description | Values |
|---|---|---|
| db | Database containing the UIDs in the input list. The value must be a valid Entrez database name. | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| id | UID list. Either a single UID or a comma-delimited list of UIDs may be provided. All of the UIDs must be from the database specified by db. For PubMed, no more than 10,000 UIDs can be included in a single URL request. For other databases there is no set maximum for the number of UIDs that can be passed to epost, but if more than about 200 UIDs are to be posted, the request should be made using the HTTP POST method. For sequence databases (nuccore, popset, protein), the UID list may be a mixed list of GI numbers and accession.version identifiers. Note: When using accession.version identifiers, there is a conversion step that takes place that causes large lists of identifiers to time out, even when using POST. Therefore, we recommend batching these types of requests in sizes of about 500 UIDs or less, to avoid retrieving only a partial amount of records from your original POST input list. | String (comma-separated list of UIDs) |

### Optional Parameter

| Parameter | Description | Values |
|---|---|---|
| WebEnv | Web Environment. If provided, this parameter specifies the Web Environment that will receive the UID list sent by post. EPost will create a new query key associated with that Web Environment. Usually this WebEnv value is obtained from the output of a previous ESearch, EPost or ELink call. If no WebEnv parameter is provided, EPost will create a new Web Environment and post the UID list to query_key 1. | String |

### Example

**Post records to PubMed:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi?db=pubmed&id=11237011,12466850
```

**Post records to Protein database with a specific Web Environment:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi?db=protein&id=15718680,157427902,119703751&WebEnv=<webenv string>
```

**Note:** Replace `<webenv string>` with the actual Web Environment value. 


## ESummary API Documentation

### Base URL

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi
```

### Functions

The ESummary API retrieves document summaries (DocSums) for a list of input UIDs or a set of UIDs stored on the Entrez History server.

### Required Parameters

#### Input from UID List

| Parameter | Description | Values |
|---|---|---|
| db | Database from which to retrieve DocSums. | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| id | UID list. Either a single UID or a comma-delimited list of UIDs may be provided. All of the UIDs must be from the database specified by db. There is no set maximum for the number of UIDs that can be passed to ESummary, but if more than about 200 UIDs are to be provided, the request should be made using the HTTP POST method. For sequence databases (nuccore, popset, protein), the UID list may be a mixed list of GI numbers and accession.version identifiers. | String (comma-separated list of UIDs) |

#### Input from Entrez History Server

| Parameter | Description | Values |
|---|---|---|
| db | Database from which to retrieve DocSums. | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| query_key | Query key. This integer specifies which of the UID lists attached to the given Web Environment will be used as input to ESummary. Query keys are obtained from the output of previous ESearch, EPost or ELink calls. | Integer |
| WebEnv | Web Environment. This parameter specifies the Web Environment that contains the UID list to be provided as input to ESummary. Usually this WebEnv value is obtained from the output of a previous ESearch, EPost or ELink call. | String |

### Optional Parameters

#### Retrieval

| Parameter | Description | Values |
|---|---|---|
| retstart | Sequential index of the first DocSum to be retrieved (default=1, corresponding to the first record of the entire set). This parameter can be used in conjunction with retmax to download an arbitrary subset of DocSums from the input set. | Integer |
| retmax | Total number of DocSums from the input set to be retrieved, up to a maximum of 10,000. If the total set is larger than this maximum, the value of retstart can be iterated while holding retmax constant, thereby downloading the entire set in batches of size retmax. | Integer |
| retmode | Retrieval type. Determines the format of the returned output. The default value is ‘xml’ for ESummary XML, but ‘json’ is also supported to return output in JSON format. | xml, json |
| version | Used to specify version 2.0 ESummary XML. The only supported value is ‘2.0’. When present, ESummary will return version 2.0 DocSum XML that is unique to each Entrez database and that often contains more data than the default DocSum XML. | 2.0 |

### Examples

#### PubMed

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=11850928,11482001
```

#### PubMed, version 2.0 XML

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=11850928,11482001&version=2.0
```

#### Protein

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&id=28800982,28628843
```

#### Nucleotide

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=nucleotide&id=28864546,28800981
```

#### Structure

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=structure&id=19923,12120
```

#### Taxonomy

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=taxonomy&id=9913,30521
```

#### UniSTS

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=unists&id=254085,254086
```


## EFetch API Documentation

### Base URL

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
```

### Functions

The EFetch API retrieves formatted data records for a list of input UIDs or a set of UIDs stored on the Entrez History server.

### Required Parameters

#### Input from UID List

| Parameter | Description | Values |
|---|---|---|
| db | Database from which to retrieve records. The value must be a valid Entrez database name (default = pubmed). | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| id | UID list. Either a single UID or a comma-delimited list of UIDs may be provided. All of the UIDs must be from the database specified by db. There is no set maximum for the number of UIDs that can be passed to EFetch, but if more than about 200 UIDs are to be provided, the request should be made using the HTTP POST method. For sequence databases (nuccore, popset, protein), the UID list may be a mixed list of GI numbers and accession.version identifiers. | String (comma-separated list of UIDs) |

**Example:**

```
efetch.fcgi?db=pubmed&id=19393038,30242208,29453458
efetch.fcgi?db=protein&id=15718680,NP_001098858.1,119703751
```

**Special Note for Sequence Databases:**

NCBI is no longer assigning GI numbers to a growing number of new sequence records. These records are not indexed in Entrez and cannot be retrieved using ESearch or ESummary. They also have no Entrez links accessible by ELink. EFetch can retrieve these records by including their accession.version identifier in the `id` parameter.

#### Input from Entrez History Server

| Parameter | Description | Values |
|---|---|---|
| db | Database from which to retrieve records. The value must be a valid Entrez database name (default = pubmed). | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| query_key | Query key. This integer specifies which of the UID lists attached to the given Web Environment will be used as input to EFetch. Query keys are obtained from the output of previous ESearch, EPost or ELink calls. The `query_key` parameter must be used in conjunction with `WebEnv`. | Integer |
| WebEnv | Web Environment. This parameter specifies the Web Environment that contains the UID list to be provided as input to EFetch. Usually this `WebEnv` value is obtained from the output of a previous ESearch, EPost or ELink call. The `WebEnv` parameter must be used in conjunction with `query_key`. | String |

**Example:**

```
efetch.fcgi?db=protein&query_key=<key>&WebEnv=<webenv string>
```

### Optional Parameters

#### Retrieval

| Parameter | Description | Values |
|---|---|---|
| retmode | Retrieval mode. This parameter specifies the data format of the records returned, such as plain text, HTML or XML. See Table 1 for a full list of allowed values for each database. | xml, text, asn.1, default |
| rettype | Retrieval type. This parameter specifies the record view returned, such as Abstract or MEDLINE from PubMed, or GenPept or FASTA from protein. Please see Table 1 for a full list of allowed values for each database. | docsum, uilist, full, summary, gene_table, alignmentscores, fasta, homologene, null, native, acc, seqid, gb, gbc, ft, gbwithparts, fasta_cds_na, fasta_cds_aa, gp, gpc, ipg, medline, abstract, flt, rsr, ssexemplar, chr, docset, clinvarset, gtracc |
| retstart | Sequential index of the first record to be retrieved (default=0, corresponding to the first record of the entire set). This parameter can be used in conjunction with `retmax` to download an arbitrary subset of records from the input set. | Integer |
| retmax | Total number of records from the input set to be retrieved, up to a maximum of 10,000. Optionally, for a large set the value of `retstart` can be iterated while holding `retmax` constant, thereby downloading the entire set in batches of size `retmax`. | Integer |

#### Sequence Databases

| Parameter | Description | Values |
|---|---|---|
| strand | Strand of DNA to retrieve. Available values are "1" for the plus strand and "2" for the minus strand. | 1, 2 |
| seq_start | First sequence base to retrieve. The value should be the integer coordinate of the first desired base, with "1" representing the first base of the sequence. | Integer |
| seq_stop | Last sequence base to retrieve. The value should be the integer coordinate of the last desired base, with "1" representing the first base of the sequence. | Integer |
| complexity | Integer value 0 through 4. Data content to return. Many sequence records are part of a larger data structure or "blob", and the `complexity` parameter determines how much of that blob to return. For example, an mRNA may be stored together with its protein product. | 0, 1, 2, 3, 4 |

**Value of complexity data returned for each requested GI:**

* 0 - entire blob
* 1 - bioseq
* 2 - minimal bioseq-set
* 3 - minimal nuc-prot
* 4 - minimal pub-set

### Examples

#### PubMed

* Fetch PMIDs 17284678 and 9997 as text abstracts:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=17284678,9997&retmode=text&rettype=abstract
```

* Fetch PMIDs in XML:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=11748933,11700088&retmode=xml
```

#### PubMed Central

* Fetch XML for PubMed Central ID 212403:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=212403
```

#### Nucleotide/Nuccore

* Fetch the first 100 bases of the plus strand of GI 21614549 in FASTA format:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=21614549&strand=1&seq_start=1&seq_stop=100&rettype=fasta&retmode=text
```

* Fetch the first 100 bases of the minus strand of GI 21614549 in FASTA format:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=21614549&strand=2&seq_start=1&seq_stop=100&rettype=fasta&retmode=text
```

* Fetch the nuc-prot object for GI 21614549:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=21614549&complexity=3
```

* Fetch the full ASN.1 record for GI 5:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=5
```

* Fetch FASTA for GI 5:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=5&rettype=fasta
```

* Fetch the GenBank flat file for GI 5:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=5&rettype=gb
```

* Fetch GBSeqXML for GI 5:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=5&rettype=gb&retmode=xml
```

* Fetch TinySeqXML for GI 5:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=5&rettype=fasta&retmode=xml
```

#### Popset

* Fetch the GenPept flat file for Popset ID 12829836:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=popset&id=12829836&rettype=gp
```

#### Protein

* Fetch the GenPept flat file for GI 8:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=8&rettype=gp
```

* Fetch GBSeqXML for GI 8:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=8&rettype=gp&retmode=xml
```

#### Sequences

* Fetch FASTA for a transcript and its protein product (GIs 312836839 and 34577063):

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sequences&id=312836839,34577063&rettype=fasta&retmode=text
```

#### Gene

* Fetch full XML record for Gene ID 2:

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=2&retmode=xml
```

### Table 1: Valid values of `&retmode` and `&rettype` for EFetch (null = empty string)

**All Databases**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| Document summary | docsum | xml, default |
| List of UIDs in XML | uilist | xml |
| List of UIDs in plain text | uilist | text |

**db = bioproject**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| Full record XML | xml, default | xml, default |

**db = biosample**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| Full record XML | full, default | xml, default |
| Full record text | full, default | text |

**db = gds**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| Summary | summary, default | text, default |

**db = gene**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| text ASN.1 | null | asn.1, default |
| XML | null | xml |
| Gene table | gene_table | text |

**db = homologene**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| text ASN.1 | null | asn.1, default |
| XML | null | xml |
| Alignment scores | alignmentscores | text |
| FASTA | fasta | text |
| HomoloGene | homologene | text |

**db = mesh**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| Full record | full, default | text, default |

**db = nlmcatalog**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| Full record | null | text, default |
| XML | null | xml |

**db = nuccore, protein or popset**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| text ASN.1 | null | text, default |
| binary ASN.1 | null | asn.1 |
| Full record in XML | native | xml |
| Accession number(s) | acc | text |
| FASTA | fasta | text |
| TinySeq XML | fasta | xml |
| SeqID string | seqid | text |

**Additional options for db = nuccore or popset**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| GenBank flat file | gb | text |
| GBSeq XML | gb | xml |
| INSDSeq XML | gbc | xml |

**Additional option for db = nuccore and protein**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| Feature table | ft | text |

**Additional option for db = nuccore**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| GenBank flat file with full sequence (contigs) | gbwithparts | text |
| CDS nucleotide FASTA | fasta_cds_na | text |
| CDS protein FASTA | fasta_cds_aa | text |

**Additional options for db = protein**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| GenPept flat file | gp | text |
| GBSeq XML | gp | xml |
| INSDSeq XML | gpc | xml |
| Identical Protein XML | ipg | xml |

**db = pmc**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| XML | null | xml, default |
| MEDLINE | medline | text |

**db = pubmed**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| XML | null | xml, default |
| MEDLINE | medline | text |
| PMID list | uilist | text |
| Abstract | abstract | text |

**db = sequences**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| text ASN.1 | null | text, default |
| Accession number(s) | acc | text |
| FASTA | fasta | text |
| SeqID string | seqid | text |

**db = snp**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| text ASN.1 | null | asn.1, default |
| XML | null | xml |
| Flat file | flt | text |
| FASTA | fasta | text |
| RS Cluster report | rsr | text |
| SS Exemplar list | ssexemplar | text |
| Chromosome report | chr | text |
| Summary | docset | text |
| UID list | uilist | text or xml |

**db = sra**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| XML | full, default | xml, default |

**db = taxonomy**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| XML | null | xml, default |
| TaxID list | uilist | text or xml |

**db = clinvar**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| ClinVar Set | clinvarset | xml, default |
| UID list | uilist | text or xml |

**db = gtr**

| Record Type | `&rettype` | `&retmode` |
|---|---|---|
| GTR Test Report | gtracc | xml, default |


## ELink API Documentation

### Base URL

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi
```

### Functions

The ELink API provides functions for retrieving linked UIDs from different Entrez databases, finding related UIDs within the same database, checking for the existence of links, and listing LinkOut URLs.

### Required Parameters

| Parameter | Description | Values |
|---|---|---|
| db | Target database for the link operation. | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| dbfrom | Origin database containing the input UIDs. | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| cmd | ELink command mode. Specifies the function to be performed. | neighbor, neighbor_score, neighbor_history, acheck, ncheck, lcheck, llinks, llinkslib, prlinks |
| id | UID list (single UID or comma-delimited list). | Integer |
| query_key | Query key from a previous ESearch, EPost, or ELink call. | Integer |
| WebEnv | Web Environment from a previous ESearch, EPost, or ELink call. | String |

### Optional Parameters

#### Retrieval

| Parameter | Description | Values |
|---|---|---|
| retmode | Retrieval type. | xml, json, ref (only with `cmd=prlinks`) |
| idtype | Identifier type for sequence databases (nuccore, popset, protein). | gi, acc |

#### Limiting the Output Set of Links

| Parameter | Description | Values |
|---|---|---|
| linkname | Name of the Entrez link to retrieve. | String (e.g., gene_snp_genegenotype) |
| term | Entrez query to limit the output set of linked UIDs. | String |
| holding | Name of LinkOut provider. | String (e.g., CTgov) |

#### Dates (Only for `cmd=neighbor` or `cmd=neighbor_history` and `dbfrom=pubmed`)

| Parameter | Description | Values |
|---|---|---|
| datetype | Type of date used to limit the link operation. | mdat, pdat, edat |
| reldate | Number of days to limit the link operation by the date specified by `datetype`. | Integer |
| mindate | Minimum date for the date range. | YYYY/MM/DD, YYYY, YYYY/MM |
| maxdate | Maximum date for the date range. | YYYY/MM/DD, YYYY, YYYY/MM |

### Command Modes

#### `cmd=neighbor` (Default)

Returns a set of UIDs in `db` linked to the input UIDs in `dbfrom`.

**Example:** Link from protein to gene

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=gene&id=15718680,157427902
```

#### `cmd=neighbor_score`

Returns a set of UIDs within the same database as the input UIDs along with computed similarity scores.

**Example:** Find related articles to PMID 20210808

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&db=pubmed&id=20210808&cmd=neighbor_score
```

#### `cmd=neighbor_history`

Posts the output UIDs to the Entrez History server and returns a `query_key` and `WebEnv` corresponding to the location of the output set.

**Example:** Link from protein to gene and post the results on the Entrez History

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&db=gene&id=15718680,157427902&cmd=neighbor_history
```

#### `cmd=acheck`

Lists all links available for a set of UIDs.

**Example:** List all possible links from two protein GIs

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&id=15718680,157427902&cmd=acheck
```

#### `cmd=ncheck`

Checks for the existence of links within the same database for a set of UIDs.

**Example:** Check whether two nuccore sequences have "related sequences" links.

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=nuccore&id=21614549,219152114&cmd=ncheck
```

#### `cmd=lcheck`

Checks for the existence of external links (LinkOuts) for a set of UIDs.

**Example:** Check whether two protein sequences have any LinkOut providers.

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=protein&id=15718680,157427902&cmd=lcheck
```

#### `cmd=llinks`

Lists the URLs and attributes for the LinkOut providers that are not libraries for each input UID.

**Example:** List the LinkOut URLs for non-library providers for two pubmed abstracts.

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=19880848,19822630&cmd=llinks
```

#### `cmd=llinkslib`

Lists the URLs and attributes for all LinkOut providers, including libraries, for each input UID.

**Example:** List all LinkOut URLs for two PubMed abstracts.

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=19880848,19822630&cmd=llinkslib
```

#### `cmd=prlinks`

Lists the primary LinkOut provider for each input UID, or links directly to the LinkOut provider's web site for a single UID if `retmode` is set to `ref`.

**Example:** Find links to full text providers for two PubMed abstracts.

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=19880848,19822630&cmd=prlinks
```

**Example:** Link directly to the full text for a PubMed abstract at the provider's web site.

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=19880848&cmd=prlinks&retmode=ref
```

### Notes

* If more than one `id` parameter is provided, ELink will perform a separate link operation for each set of UIDs specified by each `id` parameter.
* For sequence databases (nuccore, popset, protein), the `id` list may be a mixed list of GI numbers and accession.version identifiers.
* The `linkname` parameter only functions when `cmd` is set to `neighbor` or `neighbor_history`.
* The `term` parameter only functions when `db` and `dbfrom` are set to the same database value.
* The `holding` parameter only functions when `cmd` is set to `llinks` or `llinkslib`.
* The date parameters (`datetype`, `reldate`, `mindate`, `maxdate`) only function when `cmd` is set to `neighbor` or `neighbor_history` and `dbfrom` is `pubmed`.

### Example Usage

**Find all links from gene to snp:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=gene&db=snp&id=93986
```

**Find snps with genotype data linked to genes:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=gene&db=snp&id=93986&linkname=gene_snp_genegenotype
```

**Find all related articles for a PMID:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&db=pubmed&id=19879512
```

**Find all related review articles published in 2008 for a PMID:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&db=pubmed&id=19879512&term=review%5Bfilter%5D+AND+2008%5Bpdat%5D
```

**Find information for all LinkOut providers for a PMID:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&cmd=llinkslib&id=16210666
```

**Find information from clinicaltrials.gov for a PMID:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&cmd=llinkslib&id=16210666&holding=CTgov
```


## EGQuery API Documentation

### Base URL

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi

### Function

Provides the number of records retrieved in all Entrez databases by a single text query.

### Required Parameters

| Parameter | Description | Values |
|---|---|---|
| term | Entrez text query. All special characters must be URL encoded. Spaces may be replaced by '+' signs. For very long queries (more than several hundred characters long), consider using an HTTP POST call. See the PubMed or Entrez help for information about search field descriptions and tags. Search fields and tags are database specific. | String |

### Example

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi?term=asthma
``` 


## ESpell API Documentation

**Base URL:** https://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi

**Function:** Provides spelling suggestions for terms within a single text query in a given database.

### Required Parameters

| Parameter | Description | Values |
|---|---|---|
| db | Database to search. Value must be a valid Entrez database name. | annotinfo, assembly, biocollections, bioproject, biosample, blastdbinfo, books, cdd, clinvar, dbvar, gap, gapplus, gds, gene, genome, geoprofiles, grasp, gtr, ipg, medgen, mesh, nlmcatalog, nuccore, nucleotide, omim, orgtrack, pcassay, pccompound, pcsubstance, pmc, popset, protein, proteinclusters, protfam, pubmed, seqannot, snp, sra, structure, taxonomy |
| term | Entrez text query. All special characters must be URL encoded. Spaces may be replaced by '+' signs. For very long queries (more than several hundred characters long), consider using an HTTP POST call. See the PubMed or Entrez help for information about search field descriptions and tags. Search fields and tags are database specific. | String |

**Example:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi?db=pubmed&term=asthmaa+OR+alergies
``` 


## ECitMatch API Documentation

**Base URL:** https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi

**Function:** Retrieves PubMed IDs (PMIDs) that correspond to a set of input citation strings.

### Required Parameters

| Parameter | Description | Values |
|---|---|---|
| db | Database to search. | pubmed |
| rettype | Retrieval type. | xml |
| bdata | Citation strings. Each input citation must be represented by a citation string in the following format: `journal_title\|year\|volume\|first_page\|author_name\|your_key\|` Multiple citation strings may be provided by separating the strings with a carriage return character (`%0D`). The `your_key` value is an arbitrary label provided by the user that may serve as a local identifier for the citation, and it will be included in the output. Be aware that all spaces must be replaced by '+' symbols and that citation strings should end with a final vertical bar '\|'. | String |

**Example:**

```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi?db=pubmed&retmode=xml&bdata=proc+natl+acad+sci+u+s+a|1991|88|3248|mann+bj|Art1|%0Dscience|1987|235|182|palmenberg+ac|Art2|
``` 


In [20]:
# Write the api_docs frame to Parquet.
# DataFrame.to_parquet does not create missing parent directories, so make
# sure the target folder exists first; exist_ok=True keeps the cell safe to
# re-run.
chapter_4_path = Path("data/entrez/markdown/chapter_4.parquet")
chapter_4_path.parent.mkdir(parents=True, exist_ok=True)
api_docs.to_parquet(chapter_4_path)