In [None]:
!pip install pandas lxml tqdm dicttoxml

# Mapeo de Metadatos

In [None]:
"""
ETL: InBio ➜ Plinian Core 3.2 mapper
-----------------------------------
✓ Python >= 3.9
✓ pip install pandas lxml tqdm dicttoxml

Usage:
    python inbio_to_plinian.py reg_especies_INBiov5.csv
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
from dicttoxml import dicttoxml
from lxml import etree
from tqdm import tqdm

# ---------------------------------------------------------------------------
# 1 │ CONFIGURATION
# ---------------------------------------------------------------------------

#: Lookup table used by build_plinian_record(): maps each description_type
#: (lower-case, trimmed) to a pair
#: (Plinian element, Plinian sub-element OR None if simple element).
#: A None sub-element means the texts are stored as a flat list under the
#: element; otherwise they are stored as a list under element -> sub-element.
DESCRIPTION_MAP: Dict[str, Tuple[str, str | None]] = {
    "common_names": ("CommonNames", "commonName"),
    "synonyms": ("Synonyms", "synonym"),
    "cr_distribution": ("Distribution", "distributionArea"),
    "regional_distribution": ("Distribution", "distributionArea"),
    "annual_cycles": ("AnnualCycles", "annualCycle"),
    "phenology": ("AnnualCycles", "phenology"),
    "behavior": ("Behavior", None),
    "feeding": ("Feeding", None),
    "habitat": ("Habitats", "habitatDescription"),
    "interaction": ("Interaction", None),
    "life_cycle": ("LifeCycle", None),
    "life_form": ("LifeForm", None),
    "population_biology": ("PopulationBiology", None),
    "demography": ("DemographyAndThreat", "demography"),
    "threat": ("ThreatStatus", None),
    "reproduction": ("Reproduction", None),
    "territory": ("Territory", None),
    "uses": ("Uses", None),
    "conservation_area_distribution": (
        "ManagementAndConservation",
        "conservationAreas",
    ),
    "wild_protected_area": (
        "ManagementAndConservation",
        "wildProtectedAreas",
    ),
    "collecting_method": (
        "IdentificationKeys",
        "collectingMethod",
    ),  # extension field
    "full_description": ("TaxonomicDescription", None),
    "myths": (
        "Notes",
        None,
    ),
}

# Element that receives every description whose type is not a key of
# DESCRIPTION_MAP (used as the .get() fallback in build_plinian_record).
UNMAPPED_BUCKET = "Notes"


# ---------------------------------------------------------------------------
# 2 │ HELPERS
# ---------------------------------------------------------------------------


def add_value(rec: dict, element: str, subelement: str | None, value: str) -> None:
    """Store *value* in *rec* without creating duplicates.

    With ``subelement`` set to ``None`` the value goes into the list kept at
    ``rec[element]``; otherwise it goes into the list kept at
    ``rec[element][subelement]``. Missing containers are created on demand
    and *rec* is modified in place.
    """
    if subelement is not None:
        # nested layout: element -> sub-element -> list of strings
        bucket = rec.setdefault(element, {}).setdefault(subelement, [])
    else:
        # flat layout: element -> list of strings
        bucket = rec.setdefault(element, [])

    if value not in bucket:
        bucket.append(value)


def build_plinian_record(group: pd.DataFrame) -> dict:
    """Convert *one* taxon's rows into a nested Plinian Core record.

    Args:
        group: All CSV rows that share one ``taxon_record_id``. The columns
            read here are ``taxon_record_id``, ``kingdom``, ``default_name``,
            ``description`` and ``description_type``.

    Returns:
        A dict with ``identifier`` and ``NomenclatureAndClassification``
        taken from the first row, plus one entry per Plinian element for
        which at least one non-empty description exists.
    """

    def _text(value) -> str:
        # Missing cells arrive as NaN; str(NaN) would yield the text "nan".
        return "" if pd.isna(value) else str(value).strip()

    # Pull master (scalar) fields out of the first row
    first = group.iloc[0]
    record: dict = {
        "identifier": str(int(first["taxon_record_id"])),
        "NomenclatureAndClassification": {
            # strip *before* capitalising, otherwise leading whitespace
            # becomes the "first character" and the name stays lower-case
            "kingdom": _text(first["kingdom"]).capitalize(),
            "scientificName": _text(first["default_name"]),
        },
    }

    # Walk through every row that contains a description blob
    for raw_text, raw_type in zip(group["description"], group["description_type"]):
        text = _text(raw_text)
        if not text:
            continue
        desc_type = _text(raw_type).lower()
        # unknown (or missing) types are collected under UNMAPPED_BUCKET
        element, subelement = DESCRIPTION_MAP.get(desc_type, (UNMAPPED_BUCKET, None))
        add_value(record, element, subelement, text)

    return record


def dict_to_pretty_xml(d: dict) -> bytes:
    """Serialise *d* as an indented UTF-8 XML document and return the bytes.

    The dict is converted with dicttoxml (root tag ``PlinianRecord``, no type
    attributes, list entries tagged ``item``) and then re-serialised through
    lxml to obtain pretty-printing and an XML declaration.
    """

    def _item_tag(_parent):
        # every list entry gets the same tag, whatever its parent is
        return "item"

    xml_raw = dicttoxml(
        d,
        custom_root="PlinianRecord",
        attr_type=False,
        item_func=_item_tag,
    )
    root = etree.fromstring(xml_raw)
    return etree.tostring(
        root,
        encoding="UTF-8",
        xml_declaration=True,
        pretty_print=True,
    )


# ---------------------------------------------------------------------------
# 3 │ MAIN PIPELINE
# ---------------------------------------------------------------------------


def main(csv_path: str | Path) -> None:
    """Run the ETL: read the pipe-separated CSV and write Plinian records.

    For every ``taxon_record_id`` group one line is appended to
    ``plinian_records.jsonl`` and one ``<taxon_id>.xml`` file is written into
    ``out_xml/``; both outputs are created next to the input file.

    Args:
        csv_path: Location of the InBio CSV export.

    Raises:
        SystemExit: If *csv_path* does not point to an existing file.
    """
    csv_path = Path(csv_path)
    if not csv_path.is_file():
        sys.exit(f"ERROR: Cannot find {csv_path}")

    out_jsonl = csv_path.with_name("plinian_records.jsonl")
    out_xml_dir = csv_path.with_name("out_xml")
    out_xml_dir.mkdir(parents=True, exist_ok=True)

    # --- load & clean
    df = pd.read_csv(csv_path, sep="|")
    df["description_type"] = df["description_type"].str.strip().str.lower()

    # --- build records
    # Records are streamed straight to disk; only a counter is kept so memory
    # use does not grow with the number of taxa.
    n_records = 0
    with open(out_jsonl, "w", encoding="utf-8") as jf:
        for taxon_id, group in tqdm(
            df.groupby("taxon_record_id", sort=False),
            desc="Building Plinian records",
        ):
            rec = build_plinian_record(group)

            # write JSON Lines
            jf.write(json.dumps(rec, ensure_ascii=False) + "\n")

            # write XML
            xml_path = out_xml_dir / f"{int(taxon_id)}.xml"
            xml_path.write_bytes(dict_to_pretty_xml(rec))

            n_records += 1

    print(f"\n✅  Wrote {n_records:,} JSON records to:  {out_jsonl}")
    print(f"✅  Wrote XML files to:               {out_xml_dir.absolute()}")

In [None]:
csv_path = "/content/reg_especies_INBiov5.csv"
main(csv_path)

#Consulta sencilla


In [None]:
from pymilvus import connections, Collection

# Change the host/port if you use different ones.
# NOTE(review): MILVUS_HOST and MILVUS_PORT are only assigned in later cells
# of this notebook, so one of those cells must have been run first.
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)

colec = Collection("inbio_plinian_qa")   # collection name used by the ingestion cell
colec.load()                             # load the collection before querying it

print("Nº total de registros:", colec.num_entities)        # number of stored Q&A pairs
print("Esquema:\n", colec.schema)                          # fields and types


In [None]:
# Milvus supports scalar queries driven by boolean expressions.
# NOTE(review): "taxon_id" and "threat_status" are not among the fields of
# the schema declared in the ingestion cell of this notebook — confirm they
# exist in the deployed collection, otherwise this query will be rejected.
docs = colec.query(
    expr="kingdom == 'Plantae'",                       # filter: only rows whose kingdom is Plantae
    limit=5,
    output_fields=[
        "qa_id", "question", "answer",
        "taxon_id", "kingdom", "threat_status"
    ]
)

# Print each returned row as a question/answer pair.
for d in docs:
    print(f"[{d['qa_id']}] P: {d['question']}\n    R: {d['answer']}\n---")

In [None]:
from sentence_transformers import SentenceTransformer

def embed(txt):
    """Encode a single text and return its normalised embedding as a flat list."""
    # the encoder works on batches, so wrap the text and unwrap the result
    batch = encoder.encode([txt], normalize_embeddings=True)
    return batch.tolist()[0]

# ---- example query ----
consulta = "hábitat de Ateles geoffroyi"
vec = embed(consulta)

# NOTE(review): "taxon_id" is not among the fields of the schema declared in
# the ingestion cell of this notebook — confirm it exists in the deployed
# collection.
res = colec.search(
    data=[vec],
    anns_field="embedding",
    # Index-level knobs such as nprobe must be nested under "params";
    # a flat {"nprobe": 32} is not the documented shape of this argument.
    param={"params": {"nprobe": 32}},
    limit=3,
    expr="kingdom == 'Animalia'",
    output_fields=["question", "answer", "taxon_id"]
)

# res holds one result list per query vector; only one vector was sent.
for hit in res[0]:
    print(f"score={hit.distance:.3f}")
    print("P:", hit.entity.question)
    print("R:", hit.entity.answer, "\n---")


#Ingesta a Milvus con los datos del InBio
## Se generan automáticamente preguntas predefinidas ("quemadas") para cada uno de los metadatos, con el fin de llenar la colección de prueba en Milvus alojada en Azure



In [None]:
!pip install pymilvus

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Ingerir registros Plinian en Milvus con etiqueta plinian_field
-------------------------------------------------------------
• pip install pymilvus==2.4.3 pandas sentence-transformers tqdm torch>=2.0
"""

import json, os, re, random
from pathlib import Path
from typing import List, Dict, Tuple

import pandas as pd
from tqdm import tqdm
from pymilvus import (
    connections, Collection, FieldSchema, CollectionSchema, DataType, utility
)
from sentence_transformers import SentenceTransformer


# ───────────────────────── SETTINGS: AZURE VM ACCESS AND OTHERS ─────────────────────────
CSV_PATH         = Path("reg_especies_INBiov5.csv")
MILVUS_HOST      = "135.237.82.231"
MILVUS_PORT      = "19530"
COLLECTION_NAME  = "inbio_plinian_qa"
HF_MODEL         = "embaas/sentence-transformers-multilingual-e5-base"   # 768 d
DIM              = 768   # vector size declared for the "embedding" field of the collection
BATCH            = 128   # number of Q&A rows sent per insert call
SEED             = 42    # passed to random.seed() in build_dataframes
# ──────────────────────────────────────────────────────────────────


# -------------------- Question templates per Plinian field ----------------
# For each Plinian element define AT LEAST one question template;
# "{sci}" is replaced with the scientific name of the species.
TEMPLATES_PLI = {
    "NomenclatureAndClassification": [
        "¿Cuál es el nombre científico de la especie {sci}?",
        "¿A qué reino pertenece {sci}?"
    ],
    "CommonNames": [
        "¿Cuáles son los nombres comunes de {sci}?"
    ],
    "Habitats": [
        "¿En qué hábitats se encuentra {sci}?"
    ],
    "ThreatStatus": [
        "¿Cuál es el estado de amenaza (UICN) de {sci}?"
    ],
    "AnnualCycles": [
        "¿Cómo es la fenología anual de {sci}?"
    ],
    "Behavior": [
        "¿Cuál es el comportamiento típico de {sci}?"
    ],
    "Feeding": [
        "¿De qué se alimenta {sci}?"
    ],
    "Interaction": [
        "¿Con qué otras especies interactúa {sci}?"
    ],
    "LifeCycle": [
        "Describe el ciclo de vida de {sci}."
    ],
    "LifeForm": [
        "¿Qué forma de vida presenta {sci}?"
    ],
    "PopulationBiology": [
        "¿Qué se sabe de la biología poblacional de {sci}?"
    ],
    "Reproduction": [
        "¿Cómo se reproduce {sci}?"
    ],
    "Uses": [
        "¿Qué usos se le atribuyen a {sci}?"
    ],
    "Distribution": [
        "¿Dónde se distribuye {sci}?"
    ],
    "DemographyAndThreat": [
        "¿Cuáles son los aspectos demográficos y de amenaza de {sci}?"
    ],
    "TaxonomicDescription": [
        "Proporcione una descripción taxonómica de {sci}."
    ],
    "Synonyms": [
        "¿Qué sinónimos taxonómicos tiene {sci}?"
    ],
    "Territory": [
        "¿Cuál es el territorio característico de {sci}?"
    ],
    "Dispersal": [
        "¿Cómo se dispersa {sci}?"
    ],
    "EcologicalSignificance": [
        "¿Cuál es la importancia ecológica de {sci}?"
    ],
    "ManagementAndConservation": [
        "¿Qué acciones de manejo y conservación existen para {sci}?"
    ],
    "Migratory": [
        "¿{sci} presenta comportamientos migratorios?",
        "Describa el patrón migratorio de {sci}."
    ],
    "MolecularData": [
        "¿Qué información molecular se conoce de {sci} (por ejemplo, secuencias de ADN o proteínas)?"
    ],
    "IdentificationKeys": [
        "¿Existe una clave de identificación para {sci}?",
        "¿Cómo se puede identificar {sci} mediante claves taxonómicas?"
    ],
    "Invasiveness": [
        "¿{sci} es considerada una especie invasora?"
    ],
    "Legislation": [
        "¿Qué legislación protege o regula a {sci}?",
        "Mencione leyes o normativas relacionadas con {sci}."
    ],
    "Endemic": [
        "¿{sci} es endémica de alguna región?",
        "Indique las áreas donde {sci} es endémica."
    ],
    "EnvironmentalEnvelope": [
        "¿Cuál es el rango ambiental (precipitación, temperatura, etc.) donde vive {sci}?"
    ]
}


# CSV description_type values (lower-case) that map directly to the Plinian
# element names used as keys of TEMPLATES_PLI.
# NOTE(review): DESCRIPTION_MAP in the ETL cell also maps
# conservation_area_distribution, wild_protected_area, collecting_method,
# full_description and myths; they are absent here, so no Q&A pairs are
# generated for them — confirm this is intentional.
CSV_TO_PLI = {
    "common_names":            "CommonNames",
    "habitat":                 "Habitats",
    "threat":                  "ThreatStatus",
    "annual_cycles":           "AnnualCycles",
    "phenology":               "AnnualCycles",
    "behavior":                "Behavior",
    "feeding":                 "Feeding",
    "interaction":             "Interaction",
    "life_cycle":              "LifeCycle",
    "life_form":               "LifeForm",
    "population_biology":      "PopulationBiology",
    "reproduction":            "Reproduction",
    "uses":                    "Uses",
    "cr_distribution":         "Distribution",
    "regional_distribution":   "Distribution",
    "demography":              "DemographyAndThreat",
    "synonyms":                "Synonyms",
    "territory":               "Territory",
}


# ------------------------ Limpieza básica -------------------------
def clean_text(s: str) -> str:
    """Return ``str(s)`` with every whitespace run collapsed to one space.

    Leading and trailing whitespace is removed as well.
    """
    # split() without arguments drops all whitespace runs, including the
    # leading and trailing ones; joining with " " restores single separators
    words = str(s).split()
    return " ".join(words)


# ----------------- Construcción de DataFrames --------------------
def build_dataframes(csv_path: Path) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the InBio CSV and derive the species table and the Q&A table.

    Args:
        csv_path: Pipe-separated CSV. The columns read here are
            ``taxon_record_id``, ``default_name``, ``kingdom``,
            ``description_type`` and ``description``.

    Returns:
        ``(docs_df, qa_df)``. ``docs_df`` has one row per
        (``default_name``, ``kingdom``) pair and one column per description
        type. ``qa_df`` has one row per generated question with the columns
        ``question``, ``answer``, ``plinian_field``, ``scientific_name`` and
        ``kingdom``.
    """
    df = pd.read_csv(csv_path, sep="|")
    df = df.assign(
        kingdom=lambda d: d["kingdom"].str.title(),
        # lower-case as well as trim: the CSV_TO_PLI keys are lower-case
        description_type=lambda d: d["description_type"].str.strip().str.lower(),
    )

    # Species list taken from every row, so species without any usable
    # description still receive the basic nomenclature questions.
    basics = df[["default_name", "kingdom"]].drop_duplicates()

    # Drop missing descriptions BEFORE clean_text() stringifies them;
    # otherwise NaN would turn into the literal answer "nan".
    described = df.dropna(subset=["description"])
    described = described.assign(
        description=described["description"].map(clean_text)
    )
    described = described[described["description"] != ""].drop_duplicates(
        subset=["taxon_record_id", "description_type", "description"]
    )

    if described.empty:
        # nothing to pivot: keep only the basic columns
        wide = basics.copy()
    else:
        pivot = (
            described.pivot_table(index="default_name",
                                  columns="description_type",
                                  values="description",
                                  aggfunc=" ".join)
            .reset_index()
        )
        wide = basics.merge(pivot, on="default_name", how="left")

    qa_rows = []
    # Kept from the original code although nothing below draws random numbers.
    random.seed(SEED)

    for _, row in wide.iterrows():
        sci = row["default_name"]
        kingdom = row["kingdom"]

        # 1. dictionary with every possible template replacement
        subs = {"sci": sci, "kingdom": kingdom}
        for csv_field in CSV_TO_PLI:
            subs[csv_field] = row.get(csv_field, "")

        # 2. one question per template for every Plinian field that has data
        for csv_field, plinian_field in CSV_TO_PLI.items():
            texto_resp = row.get(csv_field)
            if pd.isna(texto_resp):        # no data for this field
                continue
            for plantilla in TEMPLATES_PLI.get(plinian_field, []):
                qa_rows.append({
                    "question": plantilla.format(**subs).strip(),
                    "answer": clean_text(texto_resp),
                    "plinian_field": plinian_field,
                    "scientific_name": sci,
                    "kingdom": kingdom
                })

        # 3. basic questions that can always be answered
        for plantilla in TEMPLATES_PLI["NomenclatureAndClassification"]:
            # the template wording decides which of the two basic facts it asks for
            ans = sci if "nombre científico" in plantilla else kingdom
            qa_rows.append({
                "question": plantilla.format(**subs).strip(),
                "answer": ans,
                "plinian_field": "NomenclatureAndClassification",
                "scientific_name": sci,
                "kingdom": kingdom
            })

    # NOTE(review): `wide` is built from default_name, kingdom and the pivoted
    # description columns, so this rename looks like a no-op and docs_df
    # carries no taxon id — confirm whether one is needed downstream.
    docs_df = wide.rename(columns={"taxon_record_id": "taxon_id"})
    qa_df   = pd.DataFrame(qa_rows)
    return docs_df, qa_df


# ----------------------- Encoder de embeddings -------------------
encoder = SentenceTransformer(HF_MODEL)
def embed(textos: List[str]) -> List[List[float]]:
    """Encode a batch of texts and return one normalised vector per text."""
    vectors = encoder.encode(textos, normalize_embeddings=True)
    return vectors.tolist()


# -------------------- Crear colección en Milvus ------------------
connections.connect(alias="default", host=MILVUS_HOST, port=MILVUS_PORT)

if not utility.has_collection(COLLECTION_NAME):
    # First run: declare the schema, create the collection and index the
    # vector field.
    schema = CollectionSchema(
        [
            FieldSchema("qa_id", DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=DIM),
            FieldSchema("question", DataType.VARCHAR, max_length=512),
            FieldSchema("answer", DataType.VARCHAR, max_length=8192),
            FieldSchema("plinian_field", DataType.VARCHAR, max_length=64),
            FieldSchema("scientific_name", DataType.VARCHAR, max_length=128),
            FieldSchema("kingdom", DataType.VARCHAR, max_length=32),
        ],
        description="P-R etiquetados con elemento Plinian",
    )
    collection = Collection(COLLECTION_NAME, schema, consistency_level="Strong")
    collection.create_index(
        "embedding",
        {
            "index_type": "IVF_FLAT",
            "metric_type": "COSINE",
            "params": {"nlist": 2048},
        },
    )
else:
    # The collection already exists: reuse it as it is (schema and index are
    # not touched).
    collection = Collection(COLLECTION_NAME)

# --------------------------- Ingesta ------------------------------
docs_df, qa_df = build_dataframes(CSV_PATH)

print(f"Especies leídas:  {docs_df.shape[0]:,}")
print(f"P-R generadas:   {qa_df.shape[0]:,}")

# Batched insertion. The scalar columns are listed in the order in which the
# collection schema declares them; the embedding goes first and the
# auto-generated primary key is omitted.
# NOTE(review): the schema caps question/answer lengths (512 / 8192); longer
# texts would make the insert fail — confirm the data fits.
scalar_columns = ["question", "answer", "plinian_field", "scientific_name", "kingdom"]

with tqdm(total=len(qa_df), desc="Insertando") as progress:
    for start in range(0, len(qa_df), BATCH):
        chunk = qa_df.iloc[start:start + BATCH]
        columns = [chunk[name].tolist() for name in scalar_columns]
        # embeddings are computed from the questions of this chunk only
        collection.insert([embed(columns[0]), *columns])
        progress.update(len(chunk))

collection.flush()
print("✅  Ingesta completada en Milvus")


# Exploración de metadatos del InBio

In [None]:
"""
Explora los metadatos presentes en reg_especies_INBiov5.csv
-----------------------------------------------------------
• pip install pandas
"""

import pandas as pd
from pathlib import Path

CSV_PATH = Path("reg_especies_INBiov5.csv")
SEP      = "|"                                # pipe (vertical bar) delimiter

# 1) Read only the first rows to list the column names
preview = pd.read_csv(CSV_PATH, sep=SEP, nrows=5)
print("Columnas del CSV:")
for c in preview.columns:
    print("  •", c)

# 2) Load only the description_type column, as trimmed strings.
#    NOTE: astype(str) also stringifies missing values, so empty cells are
#    reported below as the type "nan".
types_series = pd.read_csv(
    CSV_PATH,
    sep=SEP,
    usecols=["description_type"]
)["description_type"].astype(str).str.strip()

# 3) Unique values, sorted
unique_types = sorted(types_series.unique())
print("\nMetadatos temáticos (description_type) encontrados:")
for t in unique_types:
    print("  •", t)

# 4) Number of rows per type, most frequent first
counts = types_series.value_counts().sort_values(ascending=False)
print("\nConteo de filas por metadato:")
for t, n in counts.items():
    print(f"  • {t:<35} {n:>6}")


#RAG

In [None]:
!pip install --upgrade sentence-transformers pymilvus transformers ipywidgets huggingface-hub --quiet

In [None]:
!pip install -q ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

In [None]:
import os

from huggingface_hub import login

# SECURITY: never commit an access token to a notebook. The token that used
# to be hard-coded here must be treated as leaked and revoked in the Hugging
# Face account settings.
# The token is read from the HF_TOKEN environment variable; when it is not
# set, login(token=None) falls back to asking for it interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# ────────── 0 · Instalación de dependencias ──────────
!pip install gradio==4.29.0 pymilvus sentence-transformers transformers accelerate bitsandbytes --quiet

# ────────── 1 · Imports y Configuración ──────────
import os, re, torch
from typing import List, Dict

import gradio as gr
from sentence_transformers import SentenceTransformer
from pymilvus import MilvusClient, Collection, connections
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Milvus connection and collection layout
MILVUS_HOST = "135.237.82.231"
MILVUS_PORT = "19530"
COLLECTION  = "inbio_plinian_qa"
# NOTE(review): VEC_FIELD is not referenced by the search call in this cell.
VEC_FIELD   = "embedding"
OUT_FIELDS  = ["answer", "scientific_name", "plinian_field"]

# Models
# The ingestion cell declares DIM = 768 for this same embedding model.
EMBED_MODEL = "embaas/sentence-transformers-multilingual-e5-base"   # 768 d
LLM_ID      = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"          # open model
DEVICE      = "cuda" if torch.cuda.is_available() else "cpu"

# Number of contexts retrieved by default (top-k); adjustable from the UI
DEFAULT_K   = 5

# ────────── 2 · Load embedder and LLM ──────────
embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)

tokenizer = AutoTokenizer.from_pretrained(LLM_ID)
# Weights are loaded in half precision; device placement is delegated to
# device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    LLM_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    #load_in_4bit=True
)

# ────────── 3 · Milvus client ──────────
print(f"🔹 Conectando a Milvus {MILVUS_HOST}:{MILVUS_PORT}…")
client = MilvusClient(uri=f"tcp://{MILVUS_HOST}:{MILVUS_PORT}")
# the collection is loaded once here so later searches can use it
client.load_collection(COLLECTION)
print("   ok\n")

# ────────── 4 · Funciones auxiliares ──────────
def detect_field(question:str)->str|None:
    """Guess which Plinian field a question is about (placeholder classifier).

    The match is a keyword lookup that ignores case and accents, so both
    "hábitat" and "habitat" are recognised.

    Returns:
        "Habitats", "ThreatStatus", or None when no keyword matches
        (None = no filter).
    """
    import unicodedata

    # Fold case and strip combining marks so accented and unaccented
    # spellings compare equal.
    decomposed = unicodedata.normalize("NFKD", question.casefold())
    q = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    if "habitat" in q:
        return "Habitats"
    if "amenaz" in q or "uicn" in q:
        return "ThreatStatus"
    return None

def build_prompt(question: str, hits: List[Dict]) -> str:
    """Build the LLM prompt from the question and the retrieved evidence.

    Args:
        question: The user's question, inserted verbatim.
        hits: Search results; each hit is a dict providing
            ``hit['distance']`` (float score),
            ``hit['entity']['answer']``,
            ``hit['entity']['scientific_name']`` and
            ``hit['entity']['plinian_field']``.

    Returns:
        A prompt with the system instruction, a numbered ``### CONTEXTO``
        section, the ``### PREGUNTA`` section and the answer cue.
    """
    # One numbered evidence line per hit. The search in this notebook uses
    # the COSINE metric, for which Milvus reports the similarity itself
    # (higher = closer), so the score is shown as-is instead of 1 - score.
    bloques = [
        f"[{i}] {hit['entity']['scientific_name']} · {hit['entity']['plinian_field']} "
        f"(sim={hit['distance']:.3f}): {hit['entity']['answer']}"
        for i, hit in enumerate(hits, 1)
    ]
    contexto = "\n\n".join(bloques)

    # The instruction and the answer cue both ask for two paragraphs, and
    # the context header starts on its own line.
    return (
        "Sistema: Eres un redactor experto en biodiversidad de Costa Rica. "
        "Usa el contexto para responder en **dos párrafos** bien redactados, "
        "en español, explicando el dato y añadiendo detalles relevantes del contexto.\n\n"
        f"### CONTEXTO\n{contexto}\n\n"
        f"### PREGUNTA\n{question}\n\n"
        "### RESPUESTA (dos párrafos):"
    )

# ────────── 5 · Función principal para Gradio ──────────
def rag_answer(msg: str, k: int = DEFAULT_K, temperatura: float = 0.0):
    """Answer *msg* using retrieval from Milvus plus LLM generation.

    Args:
        msg: The user's question.
        k: Number of contexts to retrieve.
        temperatura: Sampling temperature; 0 selects deterministic (greedy)
            decoding.

    Returns:
        The generated answer, or a fixed fallback sentence when the search
        returns no hits.
    """
    # 1) embedding (converted to a plain list of floats for the client)
    q_vec = embedder.encode([msg], normalize_embeddings=True)[0].tolist()

    # 2) vector search (no scalar filter for now)
    hits = client.search(
        collection_name=COLLECTION,
        data=[q_vec],
        limit=k,
        output_fields=OUT_FIELDS,
        search_params={"metric_type": "COSINE", "params": {"nprobe": 10}}
    )[0]
    if not hits:
        return "No encontré información suficiente para responder."

    # 3) prompt
    prompt = build_prompt(msg, hits)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # 4) blocking generation. Sampling knobs are only passed when sampling
    #    is enabled: temperature must be strictly positive for sampling, and
    #    handing 0.0 to a greedy run is at best ignored with a warning.
    temperatura = float(temperatura)
    gen_kwargs = {
        "max_new_tokens": 256,
        "pad_token_id": tokenizer.eos_token_id,
        "do_sample": temperatura > 0,
    }
    if temperatura > 0:
        gen_kwargs["temperature"] = temperatura
        gen_kwargs["top_p"] = 0.95

    out = model.generate(**inputs, **gen_kwargs)

    # keep only the newly generated tokens (drop the echoed prompt)
    prompt_len = inputs["input_ids"].shape[-1]
    respuesta = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    return respuesta.strip()

# ────────── 6 · Construir interfaz Gradio ──────────
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🌿 **Biodiversidad Costa Rica – RAG**")

    chat = gr.Chatbot(height=400)

    # input row: question box plus send button
    with gr.Row():
        txt = gr.Textbox(
            placeholder="Pregunta sobre una especie, su hábitat, amenazas, etc.",
            scale=4,
        )
        enviar = gr.Button("Enviar", scale=1)

    # settings row: retrieval depth and sampling temperature
    with gr.Row():
        k_slider = gr.Slider(
            minimum=1, maximum=10, value=DEFAULT_K, step=1,
            label="Top-k documentos",
        )
        temp = gr.Slider(
            minimum=0, maximum=1, value=0.0, step=0.05,
            label="Creatividad (temperature)",
        )

    def user(user_message, history):
        """Clear the textbox and append the new question with a pending answer."""
        pending_turn = [user_message, None]
        return "", history + [pending_turn]

    def bot(history, k, temperatura):
        """Fill in the answer of the last turn using rag_answer."""
        last_turn = history[-1]
        last_turn[1] = rag_answer(
            last_turn[0], k=int(k), temperatura=float(temperatura)
        )
        return history

    # Pressing Enter in the textbox and clicking the button run the same
    # two-step chain: register the question, then generate the answer.
    for trigger in (txt.submit, enviar.click):
        trigger(user, [txt, chat], [txt, chat]).then(
            bot, [chat, k_slider, temp], chat
        )

# ────────── 7 · Launch server ──────────
demo.launch(share=True, debug=True)  # share=True also provides a public URL
