Collect a list of species and their taxonomies

```bash
# Take every 5th raw record (split's round-robin chunk 1 of 5), pull out its
# indexTerms, fill absent genus/specificepithet fields with the marker MISSING,
# drop every row containing that marker, and de-duplicate what is left.
cat ../d2-idb-top-3-families/raw/*.jsonl \
  | split -n r/1/5 \
  | jq .indexTerms \
  | mlr --ijson --otsv template -f genus,specificepithet --fill-with MISSING \
  | grep -v MISSING \
  | mlr --tsv uniq -a \
  > processed/indexTerms-species.tsv
```

Turn them into questions and submit to ChatGPT

```bash
# Feed the taxon table through pv (line-based progress) into qa.py with one
# question per higher rank. The {5}/{6} placeholders are passed to qa.py
# verbatim; presumably it substitutes input columns for them (confirm in qa.py).
cat processed/indexTerms-all-taxon.tsv \
  | pv --line-mode -ptl \
  | python3 ~/biodiversity-llms/scripts/nlp/qa.py --num-responses 10 --max-tokens 10 \
      'What taxonomic kingdom does species "{5} {6}" belong to? Only say its name' \
      'What taxonomic phylum does species "{5} {6}" belong to? Only say its name' \
      'What taxonomic class does species "{5} {6}" belong to? Only say its name' \
      'What taxonomic order does species "{5} {6}" belong to? Only say its name' \
      'What taxonomic family does species "{5} {6}" belong to? Only say its name' \
  > results/indexTerms-all-taxon.tsv
```

And repeat for all upper taxonomic ranks

In [11]:
# Taxonomic ranks, listed from broadest to narrowest.
ranks = ["kingdom", "phylum", "class", "order", "family", "genus"]

# Print one single-line shell pipeline per rank. Each pipeline extracts that
# rank's values from the raw records into processed/indexTerms-<rank>.tsv.
for rank in ranks:
    stages = [
        "cat ../d2-idb-top-3-families/raw/*.jsonl",
        "jq .indexTerms",
        f'jq "{{taxon: .{rank}}}"',
        "mlr --ijson --otsv uniq -a",
        f'grep -vE "^null$" > processed/indexTerms-{rank}.tsv',
    ]
    print(" | ".join(stages))

cat ../d2-idb-top-3-families/raw/*.jsonl | jq .indexTerms | jq "{taxon: .kingdom}" | mlr --ijson --otsv uniq -a | grep -vE "^null$" > processed/indexTerms-kingdom.tsv
cat ../d2-idb-top-3-families/raw/*.jsonl | jq .indexTerms | jq "{taxon: .phylum}" | mlr --ijson --otsv uniq -a | grep -vE "^null$" > processed/indexTerms-phylum.tsv
cat ../d2-idb-top-3-families/raw/*.jsonl | jq .indexTerms | jq "{taxon: .class}" | mlr --ijson --otsv uniq -a | grep -vE "^null$" > processed/indexTerms-class.tsv
cat ../d2-idb-top-3-families/raw/*.jsonl | jq .indexTerms | jq "{taxon: .order}" | mlr --ijson --otsv uniq -a | grep -vE "^null$" > processed/indexTerms-order.tsv
cat ../d2-idb-top-3-families/raw/*.jsonl | jq .indexTerms | jq "{taxon: .family}" | mlr --ijson --otsv uniq -a | grep -vE "^null$" > processed/indexTerms-family.tsv
cat ../d2-idb-top-3-families/raw/*.jsonl | jq .indexTerms | jq "{taxon: .genus}" | mlr --ijson --otsv uniq -a | grep -vE "^null$" > processed/indexTerms-genus.tsv


In [13]:
# For every rank below the first, print a single-line shell command that pipes
# that rank's processed TSV into qa.py, asking one question for each rank
# listed before it in `ranks`.
for i, rank in enumerate(ranks[1:], start=1):
    # "{{0}}" renders as the literal placeholder {0} in the printed command.
    questions = [
        '"What taxonomic {upper} does {rank} \\"{{0}}\\" belong to? Only say its name"'.format(
            upper=upper, rank=rank
        )
        for upper in ranks[:i]
    ]
    print(" ".join([
        f"cat processed/indexTerms-{rank}.tsv",
        "| python3 ~/biodiversity-llms/scripts/nlp/qa.py --num-responses 10 --max-tokens 10",
        *questions,
        f"> results/indexTerms-{rank}.tsv",
    ]))

cat processed/indexTerms-phylum.tsv | python3 ~/biodiversity-llms/scripts/nlp/qa.py --num-responses 10 --max-tokens 10 "What taxonomic kingdom does phylum \"{0}\" belong to? Only say its name" > results/indexTerms-phylum.tsv
cat processed/indexTerms-class.tsv | python3 ~/biodiversity-llms/scripts/nlp/qa.py --num-responses 10 --max-tokens 10 "What taxonomic kingdom does class \"{0}\" belong to? Only say its name" "What taxonomic phylum does class \"{0}\" belong to? Only say its name" > results/indexTerms-class.tsv
cat processed/indexTerms-order.tsv | python3 ~/biodiversity-llms/scripts/nlp/qa.py --num-responses 10 --max-tokens 10 "What taxonomic kingdom does order \"{0}\" belong to? Only say its name" "What taxonomic phylum does order \"{0}\" belong to? Only say its name" "What taxonomic class does order \"{0}\" belong to? Only say its name" > results/indexTerms-order.tsv
cat processed/indexTerms-family.tsv | python3 ~/biodiversity-llms/scripts/nlp/qa.py --num-responses 10 --max-tokens 

Get number of records for each taxon. Note that iDigBio only indexes the following ranks:

```python
[
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus"
]
```

In [1]:
import json
import requests as rq
import os
import time

In [2]:
# Working directories used by this notebook, all rooted at base_dir.
base_dir = "./"
raw_dir = os.path.join(base_dir, "raw")
processed_dir = os.path.join(base_dir, "processed")
results_dir = os.path.join(base_dir, "results")

# makedirs(exist_ok=True) replaces the exists()-then-mkdir() pair: it has no
# check-then-create race, and it still works if base_dir is later changed to a
# nested path whose parents do not exist yet.
for d in (base_dir, raw_dir, processed_dir, results_dir):
    os.makedirs(d, exist_ok=True)

In [9]:
# Ranks passed to the summary endpoint as its top_fields parameter.
ranks = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus"
]

# Fetch the nested record-count summary for these ranks.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of storing an error body
# in genus_and_up.
response = rq.get(
    f"""https://search.idigbio.org/v2/summary/top/records/?top_fields={json.dumps(ranks)}""",
    timeout=120,
)
response.raise_for_status()
genus_and_up = response.json()

In [14]:
def make_taxon_query(genus: str, specificEpithet: str):
    """Build the JSON payload for a one-record search on a genus/epithet pair.

    Args:
        genus: value placed under the "genus" key of the record query.
        specificEpithet: value placed under the "specificepithet" key.

    Returns:
        A dict with two keys: "rq", the record query, and "limit", fixed at 1.
        In "rq", kingdom, phylum, class, order and family each map to
        {"type": "exists"} (presumably "field must be present" on the server
        side; confirm against the search API documentation).
    """
    higher_ranks = ("kingdom", "phylum", "class", "order", "family")
    # Every higher rank shares the same filter object.
    record_query = dict.fromkeys(higher_ranks, {"type": "exists"})
    record_query["genus"] = genus
    record_query["specificepithet"] = specificEpithet
    return {"rq": record_query, "limit": 1}

# Sanity check: item count reported for the query built for genus "acer",
# specific epithet "saccharum". Uses https (the summary request to this host
# already does) and a timeout so a stalled connection cannot hang the cell.
rq.post("https://search.idigbio.org/v2/search/records/", json=make_taxon_query("acer", "saccharum"), timeout=60).json()["itemCount"]

11215

In [10]:
# Display the summary stored in genus_and_up (nested per-rank itemCount values).
genus_and_up

{'kingdom': {'animalia': {'itemCount': 61623111,
   'phylum': {'arthropoda': {'itemCount': 25967349,
     'class': {'insecta': {'itemCount': 22081111,
       'order': {'hymenoptera': {'itemCount': 5905617,
         'family': {'formicidae': {'itemCount': 1476087,
           'genus': {'pheidole': {'itemCount': 155163},
            'camponotus': {'itemCount': 115513},
            'formica': {'itemCount': 77155},
            'solenopsis': {'itemCount': 54001},
            'crematogaster': {'itemCount': 51002},
            'monomorium': {'itemCount': 49265},
            'strumigenys': {'itemCount': 43895},
            'tetramorium': {'itemCount': 43139},
            'lasius': {'itemCount': 42108},
            'formicidae': {'itemCount': 36179}}},
          'apidae': {'itemCount': 1334212,
           'genus': {'bombus': {'itemCount': 649003},
            'melissodes': {'itemCount': 61475},
            'apis': {'itemCount': 56302},
            'ceratina': {'itemCount': 54482},
            'xy