# Concept Extraction via LLM calls leveraging OpenRouter API

In [8]:
import json
import os
import sys
import time
from datetime import datetime
from typing import Any, Dict, List

import pandas as pd
from openai import OpenAI
from tqdm import tqdm

class OpenRouterBatchInference:
    """
    Send the same prompts to several models through the OpenRouter API.

    Every dataset row is turned into one user prompt, which is sent to each
    configured model together with a shared system prompt and a message
    describing the expected output structure.
    """

    def __init__(self, api_key: str, models: List[str], system_prompt: str, structured_output_instructions: Dict[str, Any]):
        """
        Initialize the batch inference class using OpenAI client for OpenRouter.

        Args:
            api_key (str): Your OpenRouter API key.
            models (List[str]): List of model names to query.
            system_prompt (str): Common system prompt for all queries.
            structured_output_instructions (Dict[str, Any]): Instructions to guide the output structure.
        """
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key
        )
        self.models = models
        self.system_prompt = system_prompt
        self.structured_output_instructions = structured_output_instructions

    def _create_messages(self, user_prompt: str) -> List[Dict[str, str]]:
        """
        Create the message payload.

        Args:
            user_prompt (str): The user-specific prompt.

        Returns:
            List[Dict[str, str]]: Messages to send to the model: the system
            prompt, the user prompt, and a second user message carrying the
            structured output instructions.
        """
        return [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "user", "content": f"Format your output according to: {self.structured_output_instructions}"}
        ]

    def _query_model(self, model: str, user_prompt: str) -> str:
        """
        Query a single model using OpenAI client.

        Args:
            model (str): Model name.
            user_prompt (str): User-specific prompt.

        Returns:
            str: Content of the first choice returned by the model.
        """
        completion = self.client.chat.completions.create(
            model=model,
            messages=self._create_messages(user_prompt)
        )
        return completion.choices[0].message.content

    def generate_outputs(self, dataset: pd.DataFrame) -> Dict[str, List[Dict[str, Any]]]:
        """
        Query every configured model once for each row of the dataset.

        Args:
            dataset (pd.DataFrame): Rows to process. Each row must provide a
                'text' and a 'doi' value, which are combined into the prompt.

        Returns:
            Dict[str, List[Dict[str, Any]]]: For each model name, one record
            per dataset row (in row order). A record holds the row under
            "input" (as a dict) and, under "output", either the model's reply
            or {"error": <message>} when the request raised an exception.
        """
        all_outputs = {model: [] for model in self.models}

        # One progress step per request, so the bar ends exactly at 100%.
        total_queries = len(dataset) * len(self.models)
        with tqdm(total=total_queries, file=sys.stdout) as pbar:
            # iterrows() always yields a single row, even when index labels
            # repeat (dataset.loc[label] would return a DataFrame then).
            for _, data_point in dataset.iterrows():
                user_prompt = f"Text: {data_point['text']}\nLabel: {data_point['doi']}"

                for model in self.models:
                    pbar.write(f"Querying model {model}...")
                    try:
                        output = self._query_model(model, user_prompt)
                    except Exception as e:
                        # Best effort: keep the failure in the results and
                        # carry on with the remaining requests.
                        pbar.write(f"Error querying model {model}: {e}")
                        output = {"error": str(e)}

                    all_outputs[model].append({
                        "input": data_point.to_dict(),  # Convert Series to dictionary for storage
                        "output": output
                    })
                    pbar.update(1)
                    time.sleep(1)  # Sleep between requests to be polite, also after a failure

        return all_outputs


In [11]:
# Prefer the OPENROUTER_API_KEY environment variable so the real secret never
# has to be written into the notebook; the placeholder is only a fallback.
api_key = os.getenv("OPENROUTER_API_KEY") or 'your_secret_api_key_here'

# Models you want to query
models = [
    "openai/gpt-4.1",
    "google/gemini-2.5-pro-preview-03-25"
]

In [None]:
# TODO: fill in here - replace this empty placeholder with the real dataset.
# The inference class reads the 'text' and 'doi' columns of every row.
dataset = pd.DataFrame(columns=["text", "doi"])

In [None]:
# System prompt
# NOTE(review): `possible_names` is interpolated below but is not defined in
# the visible cells - presumably the list of valid compound names built in an
# earlier cell; confirm it exists before running this cell.
# The first literal is an f-string (for `possible_names`); the second one is a
# plain string so that the braces of the JSON example need no escaping.
system_prompt = (f"""You are a scientist trained in chemistry. Your task is to extract structured information from scientific papers, identifying relevant properties associated with each natural product discussed in the publication.

For each paper (identified by its DOI), analyze the text to extract the following information:

1. **Compound name**: Name(s) of the compounds. Can include more than one, e.g., Pellitorine, Sargachromenol, 3,4-dehydrostrictosidine acid. Compounds must be one of: {possible_names}.

2. **Biological Activity**: One or more of the following:
['Anesthetic', 'Inhibition of Cathepsin V', 'Mutagenic', 'Antiangiogenic', 'Inhibition of Acetylcholinesterase', 'Inhibition of Cathepsin L', 'Inhibition of Myeloperoxodase', 'Antinociceptive', 'Antioxidant', 'Cell growth inhbition', 'Antichagasic', 'Antileishmanial', 'Genotoxic', 'Inhibition of Protease', 'Cytotoxic', 'Inhibition of phosphorylating electron transport', 'Antiallergenic', 'Inhibition of basal electron transport', 'Antitrypanosomal', 'Antibacterial', 'Insect antennae response', 'Antimalarial', 'Molluscicidal', 'Antifungal', 'Anxiolytic', 'Anti-inflamatory', 'Inhibition of Glycosidase', 'Anticancer', 'Inhibition of Cathepsin B', 'Insecticidal', 'Inhibition of ATP synthesis', 'Antiviral', 'Inhibition of uncoupled electron transport']

3. **Collection Specie**: Scientific name of the species (binomial format). Family name can be included. Must match one of the known options.

4. **Collection Type**: One of:
['Biotransformation Product', 'Plant Isolated', 'Semisynthesis Product', 'Microorganism isolated', 'Plant Isolated, Microorganism isolated']

5. **Collection Site**: One of the following:
['Sao Carlos/SP', 'Pocos De Caldas/MG', 'Araraquara/SP', 'Teodoro Sampaio/SP', 'Murici/AL', 'Maues/AM', 'Sao Sebastiao Do Passe/BA', 'Igarape-acu/PA', 'Itacoatiara/AM', 'Apore/GO', 'Peruibe/SP', 'Sao Paulo/SP', 'Pocos De Caldas/MG, Lonchocarpus atropurpureus', 'Ibirama/SC', 'Rio Claro/SP', 'Iguape/SP', 'Goiania/GO', 'Sao Miguel Arcanjo/SP', 'Rifaina/SP', 'Piracicaba/SP', 'N/A/CE', 'Urucuca/BA', 'Campinas/SP', 'Corumba/MS', 'Lavras/MG', 'Ribeirao Preto/SP, Nigrospora sphaerica', 'N/A/SP, Cedrela fissilis', 'Recife/PE', 'Cordeiropolis/SP', 'Santarem/PA', 'Itaituba/PA', 'Londrina/PR', 'Cuiaba/MT', 'Ribeirao Preto/SP', 'Itirapina/SP', 'Manaus/AM', 'Mogi Guacu/SP', 'N/A/MS', 'N/A/MG', 'Belem/PA', 'N/A/ES', 'Ibate/SP', 'N/A/PE', 'Cunha/SP', 'Rio De Janeiro/RJ', 'Chapada Dos Guimaraes/MT', 'N/A/SP', 'N/A/AM', 'Vicosa/MG', 'Rio Verde/GO', 'Pirenopolis/GO']

Return the result in the following JSON format:"""

"""
```json
{
    "doi": "paper identifier",
    "name": ["Piperol B", "Nilocetin"],
    "biological_activity": ["Antiangiogenic", "Antioxidant"],
    "collection_specie": ["Picramnia teapensis"],
    "collection_type": ["Plant Isolated"],
    "collection_site": ["Sao Carlos/SP"]
}
```"""
)

# Structured output instructions
# Sent to the model as an extra user message; every field description that
# lists allowed values embeds them as the text of a Python list.
structured_output_instructions = {
    "format": "JSON",
    "fields": {
        "doi": "It is an identifier provided in the input",
        "name": "It can be more than one compund.",
        "biological_activity": "One of ['Anesthetic', 'Inhibition of Cathepsin V', 'Mutagenic', 'Antiangiogenic', 'Inhibition of Acetylcholinesterase', 'Inhibition of Cathepsin L', 'Inhibition of Myeloperoxodase', 'Antinociceptive', 'Antioxidant', 'Cell growth inhbition', 'Antichagasic', 'Antileishmanial', 'Genotoxic', 'Inhibition of Protease', 'Cytotoxic', 'Inhibition of phosphorylating electron transport', 'Antiallergenic', 'Inhibition of basal electron transport', 'Antitrypanosomal', 'Antibacterial', 'Insect antennae response', 'Antimalarial', 'Molluscicidal', 'Antifungal', 'Anxiolytic', 'Anti-inflamatory', 'Inhibition of Glycosidase', 'Anticancer', 'Inhibition of Cathepsin B', 'Insecticidal', 'Inhibition of ATP synthesis', 'Antiviral', 'Inhibition of uncoupled electron transport']",
        # Kept on a single physical line: a plain double-quoted string cannot contain a raw line break.
        "collection_specie": "Species from which natural products were extracted. Provide the scientific name, binomial form. Family name can be provided. One of ['Tithonia diversifolia', 'Picramnia teapensis', 'Chrysophyllum marginatum (Sapotaceae)', 'Himatanthus sucuuba (Apocynaceae)', 'Stemodia foliosa (Scrophulariaceae)', 'Murraya paniculata', 'Curvularia sp. (Pleosporaceae)', 'Tephrosia candida', 'Galipea bracteata (Rutaceae)', 'Vitex polygama (Verbenaceae)', 'Ouratea multiflora (Ochnaceae)', 'Maytenus ilicifolia (Celastraceae)', 'Khaya ivorensis', 'Croton zehntneri (Euphorbiaceae)', 'Cedrela fruticosa (Meliaceae), Neoraputia magnifica (Rutaceae), Neoraputia alba (Rutaceae)', 'Casearia obliqua (Flacourtiaceae)', 'Pilocarpus riedelianus', 'Chaetomium globosum', 'Phomopsis sp. (Valsaceae)', 'Iryanthera juruensis (Myristicaceae)', 'Pilocarpus spicatus (Rutaceae)', 'Trichilia elegans (Meliaceae)', 'Toona ciliata (Meliacee)', 'Peperomia obtusifolia (Piperaceae)', 'Cedrela odorata (Meliaceae)', 'Tocoyena brasiliensis (Rubiaceae)', 'Citrus sinensis (Rutaceae)', 'Mycoleptodiscus indicus', 'Ricinus communis (Euphorbiaceae)', 'Peperomia blanda (Piperaceae)', 'Lantana lilacina (Verbenaceae)', 'Raulinoa echinata (Rutaceae)', 'Maytenus aquifolium (Celastraceae)', 'Calycophyllum spruceanum (Rubiaceae)', 'Adiscanthus fusciflorus', 'Spiranthera odoratissima St. odoratissima (Rutaceae)', 'Esenbeckia leiocarpa', 'Coussarea hydrangeifolia (Rubiaceae)', 'Piper tuberculatum (Piperaceae)', 'Swinglea glutinosa', 'Conchocarpus heterophyllus (Rutaceae)', 'Angostura paniculata (Rutaceae)', 'Michelia champaca (Magnoliaceae)', 'Pothomorphe umbellata (Piperaceae)', 'Cabralea canjerana', 'Raulinoa echinata (Rutaceae), Swietenia macrophylla (Meliaceae)', 'Erythrina mulungu (Fabaceae)', 'Alibertia macrophylla (Rubiaceae)', 'Colletotrichum gloeosporioides (Phyllachoraceae)', 'Phomopsis cassiae (Valsaceae)', 'Zingiber officinale', 'Cnidoscolus vitifolius (Euphorbiaceae)', 'Styrax camporum (Styracaceae)', 'Senna multijuga (Fabaceae)', 'Piper arboreum (Piperaceae)', 'Streptomyces sp.', 'Amorpha fruticosa', 'Balfourodendron riedelianum (Rutaceae)', 'Pterogyne nitens (Fabaceae)', 'Piper hostmannianum (Piperaceae)', 'Neoraputia magnifica (Rutaceae)', 'Siphoneugena densiflora (Myrtaceae)', 'Swartzia langsdorfii (Fabaceae)', 'Eucalyptus maculata', 'Fusarium oxysporum', 'Alibertia sessilis (Rubiaceae)', 'Cedrela fissilis', 'Dalbergia riparia', 'Ottonia corcovadensis (Piperaceae)', 'Aspergillus terreus', 'Petiveria alliacea (Phytolaccaceae)', 'Senna spectabilis (Fabaceae)', 'Hortia superba (Rutaceae)', 'Rapanea lancifolia (Myrsinaceae)', 'Trichilia pallida', 'Casearia sylvestris (Flacourtiaceae)', 'Cinnamomum australe (Lauraceae)', 'Xylaria sp. (Xylariaceae)', 'Cassia leptophylla (Fabaceae)', 'Salacia campestris (Celastraceae)', 'Cheiloclinium cognatum (Hippocrateaceae)', 'Trichilia claussenii (Meliaceae)', 'Piper aduncum (Piperaceae)', 'Periconia atropurpurea (Annonaceae)', 'Styrax ferrugineus (Styracaceae)', 'Cassia spectabilis (Fabaceae)', 'Hortia oreadica', 'Phoma betae', 'Alchornea glandulosa (Euphorbiaceae)', 'Chimarrhis turbinata (Rubiaceae)', 'Aspergillus fumigatus', 'Almeidea rubra (Rutaceae)', 'Dimorphandra mollis', 'Penicillium corylophilum', 'Nectandra grandiflora (Lauraceae)', 'Anacardium occidentale (Anacardiaceae)', 'Rapanea umbellata,Myrsine cuneifolia (Myrsinaceae)', 'Penicillium sp. (Trichocomaceae)', 'Carapa guianensis', 'Alibertia edulis (Rubiaceae)', 'Tocoyena formosa (Rubiaceae)', 'Neoraputia alba (Rutaceae)', 'Humicola grisea', 'Pilocarpus grandiflorus', 'Nigrospora sphaerica', 'Arrabidaea samydoides (Bignoniaceae)', 'Rudgea viburnioides (Rubiaceae)', 'Piper crassinervium (Piperaceae)', 'Metrodorea stipularis', 'Vochysia thyrsoidea', 'Piper hispidum (Piperaceae)', 'Casearia rupestris (Salicaeae)', 'Piper solmsianum', 'Piper gaudichaudianum (Piperaceae)', 'Citrus sp. (Rutaceae)']",
        "collection_type": "One of ['Biotransformation Product', 'Plant Isolated', 'Semisynthesis Product', 'Microorganism isolated']",
        "collection_site": "One of ['Sao Carlos/SP', 'Pocos De Caldas/MG', 'Araraquara/SP', 'Teodoro Sampaio/SP', 'Murici/AL', 'Maues/AM', 'Sao Sebastiao Do Passe/BA', 'Igarape-acu/PA', 'Itacoatiara/AM', 'Apore/GO', 'Peruibe/SP', 'Sao Paulo/SP', 'Pocos De Caldas/MG, Lonchocarpus atropurpureus', 'Ibirama/SC', 'Rio Claro/SP', 'Iguape/SP', 'Goiania/GO', 'Sao Miguel Arcanjo/SP', 'Rifaina/SP', 'Piracicaba/SP', 'N/A/CE', 'Urucuca/BA', 'Campinas/SP', 'Corumba/MS', 'Lavras/MG', 'Ribeirao Preto/SP, Nigrospora sphaerica', 'N/A/SP, Cedrela fissilis', 'Recife/PE', 'Cordeiropolis/SP', 'Santarem/PA', 'Itaituba/PA', 'Londrina/PR', 'Cuiaba/MT', 'Ribeirao Preto/SP', 'Itirapina/SP', 'Manaus/AM', 'Mogi Guacu/SP', 'N/A/MS', 'N/A/MG', 'Belem/PA', 'N/A/ES', 'Ibate/SP', 'N/A/PE', 'Cunha/SP', 'Rio De Janeiro/RJ', 'Chapada Dos Guimaraes/MT', 'N/A/SP', 'N/A/AM', 'Vicosa/MG', 'Rio Verde/GO', 'Pirenopolis/GO']"
    }
}
# Build the batch runner from the key, model list and prompts defined above
inference = OpenRouterBatchInference(
    api_key,
    models,
    system_prompt,
    structured_output_instructions,
)

# Run every model over every row of the dataset and collect the replies
outputs = inference.generate_outputs(dataset=dataset)

In [13]:
# `generate_outputs` returns a plain dict (model name -> list of records),
# which has no `to_parquet`; flatten it into one row per (model, record).
# Inputs and error payloads are stored as JSON text so that each column holds
# a single type; plain replies (or a missing reply) are kept as they are.
output_rows = []
for model_name, model_records in outputs.items():
    for record in model_records:
        raw_output = record["output"]
        if raw_output is not None and not isinstance(raw_output, str):
            raw_output = json.dumps(raw_output, default=str, ensure_ascii=False)
        output_rows.append({
            "model": model_name,
            "input": json.dumps(record["input"], default=str, ensure_ascii=False),
            "output": raw_output,
        })

outputs_df = pd.DataFrame(output_rows, columns=["model", "input", "output"])
outputs_df.to_parquet(r'data/raw/outputs.parquet')