In [None]:
import pprint
from pathlib import Path
import os 

# Resolve the OpenAI API key without relying on hidden kernel state.
# Precedence: a pre-existing `OPENAI_API_KEY` notebook variable (kept for backward
# compatibility), then the process environment, then an interactive prompt.
# The key itself is never written into the notebook.
_openai_api_key = globals().get("OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
if not _openai_api_key:
    from getpass import getpass

    _openai_api_key = getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = _openai_api_key


# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell execution, so edits to helper modules take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

# Metadata extraction

The general context of this problem is automatic metadata extraction for neuroscience datasets. 
As a proof of concept, this notebook showcases how to extract relevant experimental metadata from the abstract using LLMs. 
Following the categories currently used in DANDI for metadata handling we focus on the following fields of metadata:

- Species
- Species identifier (NCBI Taxonomy ID)
- Anatomy
- Anatomy identifier (UBERON ID)
- Approach
- Measurement technique

The approach here is inspired by [recent research](https://ar5iv.labs.arxiv.org/html/2304.10428) in named entity recognition (NER) using LLMs.
More specifically, the technique that we use is in-context learning, where a set of relevant examples are presented
as context or instructions for inference to tailor a pre-trained language model to a specific task. The core of this 
technique is the selection of relevant examples to be used as context. Currently, these examples are drawn from a subset of
well-annotated DANDI datasets (see `../data/training_data.json`).

Schematically, the pipeline during inference works as follows:
- Extract a set of examples to build a context-prompt.
- Create a task prompt from an abstract, either passed directly as text or fetched from a DOI via the [Crossref API](https://www.crossref.org/documentation/retrieve-metadata/rest-api/).
- Query the OpenAI API endpoint for inference using the context-prompt and task-prompt.
- Parse the results and extract the relevant metadata.


Structure of a context-prompt:

```
You are a neuroscience researcher and you are interested in figuring the metadata from abstracts. Here are some
examples of how you work:

Abstract from example 1
Metadata from example 1

Abstract from example 2
Metadata from example 2
```

Structure of a task-prompt:

```
The abstract of the paper is:
{abstract} 

Fill as in the examples:
Information: {{}}
In the format of the previous reponse. If some information is missing, leave it blank.
```

# In-context learning prompt

In [None]:
from utils.metadata_extraction import generate_zero_shot_prompt, generate_task_prompt_from_abstract, infer_metadata

In [None]:
# Instruction header that precedes the generated examples in the context prompt.
preamble_prompt = (
    "You are a neuroscience researcher and you are interested in figuring the metadata from abstracts. Here are some\n"
    "examples of how you work:"
)

# Dandiset ids handed to `generate_zero_shot_prompt` to build the examples section.
training_dadiset_ids = [
    "000568",
    "000250",
    "000147",
    "000127",
    "000055",
    "000044",
]
zero_shot_prompt = generate_zero_shot_prompt(training_dadiset_ids)

# Full context = preamble immediately followed by the examples prompt; shown as the cell output.
context_prompt = preamble_prompt + zero_shot_prompt
context_prompt

# Inference

## Using an abstract


In [None]:
# Dandiset and paper used for the worked inference example below.
# NOTE(review): "000568" is also listed in `training_dadiset_ids`, so this abstract may
# already be among the in-context examples — confirm that overlap is intended.
dandiset_id_test = "000568"
doi = "https://doi.org/10.1038/s41593-022-01138-x"

# Abstract text, written as adjacent literals (one per sentence) that concatenate
# into a single string.
abstract_to_test = (
    "The incorporation of new information into the hippocampal network is likely to be constrained by its innate architecture and internally generated activity patterns. "
    "However, the origin, organization and consequences of such patterns remain poorly understood. "
    "In the present study we show that hippocampal network dynamics are affected by sequential neurogenesis. "
    "We birthdated CA1 pyramidal neurons with in utero electroporation over 4 embryonic days, encompassing the peak of hippocampal neurogenesis, and compared their functional features in freely moving adult mice. "
    "Neurons of the same birthdate displayed distinct connectivity, coactivity across brain states and assembly dynamics. "
    "Same-birthdate neurons exhibited overlapping spatial representations, which were maintained across different environments. "
    "Overall, the wiring and functional features of CA1 pyramidal neurons reflected a combination of birthdate and the rate of neurogenesis. "
    "These observations demonstrate that sequential neurogenesis during embryonic development shapes the preconfigured forms of adult network dynamics."
)

task_prompt = generate_task_prompt_from_abstract(abstract_to_test)

In [None]:
# Display the context prompt currently in scope so it can be inspected before inference.
context_prompt

In [None]:
# In-context run: rebuild both prompt parts in this cell so it does not depend on
# whatever `context_prompt` / `task_prompt` currently hold in the kernel.
task_prompt = generate_task_prompt_from_abstract(abstract_to_test)
context_prompt = preamble_prompt + zero_shot_prompt
prompt = "{} {}".format(context_prompt, task_prompt)

# Echo the input abstract, then the result of `infer_metadata` for the combined prompt.
print("abstract: \n")
pprint.pprint(abstract_to_test)
print("\n Information extracted: \n")
metadata = infer_metadata(prompt)
pprint.pprint(metadata)

#### Baseline without context learning for sanity check
1) Same as our pipeline, but without context learning

In [None]:
# Baseline run: the context is the preamble alone (no examples appended); the task
# prompt is reused from the previous cells.
# NOTE(review): the preamble text still announces "examples of how you work" although
# none follow here — confirm this is the intended baseline.
context_prompt = preamble_prompt
prompt = "{} {}".format(context_prompt, task_prompt)

# Echo the input abstract, then the result of `infer_metadata` for the combined prompt.
print("abstract: \n")
pprint.pprint(abstract_to_test)
print("\n Information extracted: \n")
metadata = infer_metadata(prompt)
pprint.pprint(metadata)

#### Better baseline
For a fairer comparison, here we attempt to improve the baseline by using a more sophisticated prompt with clearer expectations.

In [None]:
def generate_task_prompt_from_abstract_without_context(abstract: str) -> str:
    """Build a self-contained extraction prompt for one abstract.

    The prompt carries no worked examples: it lists the requested fields
    explicitly and spells out the JSON layout the reply should follow.

    Parameters
    ----------
    abstract : str
        Abstract text, interpolated verbatim into the prompt.

    Returns
    -------
    str
        The prompt text. Every line after the first keeps a four-space indent
        (or is empty), and the text ends with a whitespace-only line.
    """
    # Fields listed in the "Extract the following information" bullet section.
    requested_fields = (
        "species",
        "species identifier in the NCBI taxonomy",
        "approach",
        "measurement",
        "anatomy",
        "anatomy identifier in the Uberon ontology",
    )
    bullet_lines = [f"    - {label}:" for label in requested_fields]

    # (key, placeholder) pairs rendered as the entries of the JSON template.
    json_entries = (
        ("species", "species_name_1, species_name_2, ..."),
        (
            "species_identifier",
            "species identifiers in the NCBI taxonomy. e.g 'http://purl.obolibrary.org/obo/NCBITaxon_10090'",
        ),
        ("approach", "e.g. 'electrophysiology', 'calcium imaging', 'optogenetics'"),
        ("measurement", "e.g. surgery, spike sorting, etc."),
        ("anatomy", "e.g. 'hippocampus', 'cortex', 'thalamus'"),
        ("anatomy_identifier", "anatomy identifier in the Uberon ontology"),
    )
    # Entries are comma-separated; the last one carries no trailing comma.
    json_body = ",\n".join(
        f'        "{key}": [{placeholder}]' for key, placeholder in json_entries
    )

    prompt_lines = [
        "The abstract of the paper is:",
        f"    {abstract} ",
        "",
        "    Extract the following information from the abstract:",
        *bullet_lines,
        "",
        "    Return the response as a JSON object with the following format:",
        "    ",
        "    {",
        json_body,
        "    }",
        "    ",
        "    If some information is missing, leave it blank.",
        "",
        "    ",
    ]
    return "\n".join(prompt_lines)

In [None]:
# Improved baseline: preamble-only context paired with the explicit, example-free task prompt.
context_prompt = preamble_prompt
task_prompt = generate_task_prompt_from_abstract_without_context(abstract_to_test)
prompt = "{} {}".format(context_prompt, task_prompt)

# Echo the input abstract, then the result of `infer_metadata` for the combined prompt.
print("abstract: \n")
pprint.pprint(abstract_to_test)
print("\n Information extracted: \n")
metadata = infer_metadata(prompt)
pprint.pprint(metadata)

## From DOI to inference

In [None]:
from utils.metadata_extraction import get_crossref_abstract

In [None]:
# Arbitrary eLife article; its abstract is obtained from the DOI via `get_crossref_abstract`.
doi = "https://doi.org/10.7554/eLife.89093.1"
abstract_to_test = get_crossref_abstract(doi)

# In-context run: preamble plus the examples prompt, followed by the task for this abstract.
task_prompt = generate_task_prompt_from_abstract(abstract_to_test)
context_prompt = preamble_prompt + zero_shot_prompt
prompt = "{} {}".format(context_prompt, task_prompt)

# Echo the fetched abstract, then the result of `infer_metadata` for the combined prompt.
print("abstract: \n")
pprint.pprint(abstract_to_test)
print("\n Information extracted: \n")
metadata = infer_metadata(prompt)
pprint.pprint(metadata)

In [None]:
# Baseline on the DOI-fetched abstract: preamble-only context with the explicit, example-free task prompt.
context_prompt = preamble_prompt
task_prompt = generate_task_prompt_from_abstract_without_context(abstract_to_test)
prompt = "{} {}".format(context_prompt, task_prompt)

# Echo the abstract, then the result of `infer_metadata` for the combined prompt.
print("abstract: \n")
pprint.pprint(abstract_to_test)
print("\n Information extracted: \n")
metadata = infer_metadata(prompt)
pprint.pprint(metadata)