In [None]:
import json
import sys
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm import tqdm

# -----------------------------
# MODELOS LIGEROS
# -----------------------------
# Sentence-embedding model; compute_confidence() uses it to compare a snippet
# with a concept via cosine similarity.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Extractive question-answering tokenizer/model pair used by extract_snippet().
qa_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

# Seq2seq tokenizer/model pair used by generate_justification() to write the
# free-text justification.
gen_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
gen_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")


# -----------------------------
# FUNCIONES BASE
# -----------------------------
def extract_snippet(question, context):
    """Extract the answer span for ``question`` from ``context`` with the QA model.

    The span is restricted to context tokens (never the question or special
    tokens), its end is never placed before its start, and the result is the
    exact substring of ``context`` (original casing and spacing) rather than
    text rebuilt from lower-cased word-pieces.

    Args:
        question: Natural-language question given to the QA model.
        context: Passage to search. It is truncated to the tokenizer's
            maximum length, so only the leading part may be considered.

    Returns:
        The stripped answer text, or ``None`` when no context token survives
        truncation or when tokenization/inference raises an exception.
    """
    try:
        inputs = qa_tokenizer(
            question,
            context,
            return_tensors="pt",
            truncation=True,
            return_offsets_mapping=True,
        )
        # The model's forward pass does not accept offsets: remove them first.
        offsets = inputs.pop("offset_mapping")[0].tolist()
        # sequence_ids() is None for special tokens, 0 for the question and
        # 1 for the context; only context tokens may be part of the answer.
        in_context = torch.tensor([sid == 1 for sid in inputs.sequence_ids(0)])
        if not bool(in_context.any()):
            return None

        # Pure inference: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = qa_model(**inputs)

        neg_inf = float("-inf")
        start_logits = outputs.start_logits[0].masked_fill(~in_context, neg_inf)
        end_logits = outputs.end_logits[0].masked_fill(~in_context, neg_inf)

        start = int(torch.argmax(start_logits))
        # Search the end position only from ``start`` onwards so the span
        # cannot be inverted.
        end = start + int(torch.argmax(end_logits[start:]))

        char_start = offsets[start][0]
        char_end = offsets[end][1]
        return context[char_start:char_end].strip()
    except Exception as exc:
        # Best-effort by design (callers skip references without a snippet),
        # but surface the failure instead of hiding it completely.
        import warnings

        warnings.warn(f"extract_snippet failed: {exc!r}", RuntimeWarning, stacklevel=2)
        return None


def generate_justification(snippet, concept):
    """Ask the seq2seq model why ``snippet`` supports ``concept``.

    Args:
        snippet: Evidence text inserted into the prompt.
        concept: Concept (or relation statement) the snippet should support.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed. Generation is capped at 60 new tokens.
    """
    instruction = "Explain briefly why the following snippet supports the concept.\n"
    details = f"Snippet: {snippet}\nConcept: {concept}\n\nJustification:"

    encoded = gen_tokenizer(instruction + details, return_tensors="pt")
    generated = gen_model.generate(**encoded, max_new_tokens=60)

    first_sequence = generated[0]
    return gen_tokenizer.decode(first_sequence, skip_special_tokens=True)


def compute_confidence(snippet, concept):
    """Score how semantically close ``snippet`` is to ``concept``.

    Both texts are embedded with the module-level ``embedder``; their cosine
    similarity is rescaled from [-1, 1] to [0, 1] and rounded to three
    decimals.

    Args:
        snippet: Candidate evidence text.
        concept: Concept (or relation statement) to compare against.

    Returns:
        The rescaled similarity as a float rounded to 3 decimal places.
    """
    snippet_vec = embedder.encode(snippet, convert_to_tensor=True)
    concept_vec = embedder.encode(concept, convert_to_tensor=True)

    cosine = util.cos_sim(snippet_vec, concept_vec).item()
    rescaled = (cosine + 1) / 2
    return round(rescaled, 3)


# -----------------------------
# PROCESAR NODO
# -----------------------------
def process_node(node):
    """Find the best supporting snippet for a node among its references.

    Every non-empty reference text is queried with the extractive QA model;
    each resulting snippet is scored against the node label. The
    highest-scoring snippet wins and, on equal score, the shorter one.

    Args:
        node: Graph node dict. Must contain ``"label"``; reference texts are
            read from ``node["properties"]["references"][i]["text"]``.

    Returns:
        A dict with ``"snippet"``, ``"justification"`` and ``"confidence"``,
        or ``None`` when no reference yields a snippet.
    """
    concept = node["label"]
    question = f"What text supports the concept '{concept}'?"

    top_snippet, top_score = None, -1
    for reference in node.get("properties", {}).get("references", []):
        passage = reference.get("text", "")
        candidate = extract_snippet(question, passage) if passage else None
        if not candidate:
            continue

        candidate_score = compute_confidence(candidate, concept)
        wins_on_score = candidate_score > top_score
        # Tie-break: with the same score, prefer the shorter snippet.
        wins_on_length = (
            candidate_score == top_score
            and len(candidate) < len(top_snippet or "")
        )
        if wins_on_score or wins_on_length:
            top_snippet, top_score = candidate, candidate_score

    if top_snippet is None:
        return None

    return {
        "snippet": top_snippet,
        "justification": generate_justification(top_snippet, concept),
        "confidence": top_score,
    }


# -----------------------------
# PROCESAR RELACIÓN
# -----------------------------
def process_relation(edge, nodes_by_id):
    """Find the best supporting snippet for an edge in its source node's references.

    Each non-empty reference text of the source node is queried with the
    extractive QA model and the resulting snippet is scored against the
    statement ``"<source> <relation> <target>"``. The highest-scoring snippet
    wins and, on equal score, the shorter one.

    Args:
        edge: Edge dict with ``"source"``, ``"target"`` and ``"relation"``.
        nodes_by_id: Mapping from node id to node dict; ``edge["source"]`` is
            looked up in it.

    Returns:
        A dict with ``"snippet"``, ``"justification"`` and ``"confidence"``,
        or ``None`` when the source node is missing/empty or no reference
        yields a snippet.
    """
    source, target, relation = edge["source"], edge["target"], edge["relation"]

    origin = nodes_by_id.get(source)
    if not origin:
        return None

    origin_refs = origin.get("properties", {}).get("references", [])
    question = f"What text supports that {source} {relation.lower()} {target}?"
    # The statement being supported; used for scoring and for the justification.
    statement = f"{source} {relation} {target}"

    top_snippet, top_score = None, -1
    for reference in origin_refs:
        passage = reference.get("text", "")
        candidate = extract_snippet(question, passage) if passage else None
        if not candidate:
            continue

        candidate_score = compute_confidence(candidate, statement)
        wins_on_score = candidate_score > top_score
        # Tie-break: with the same score, prefer the shorter snippet.
        wins_on_length = (
            candidate_score == top_score
            and len(candidate) < len(top_snippet or "")
        )
        if wins_on_score or wins_on_length:
            top_snippet, top_score = candidate, candidate_score

    if top_snippet is None:
        return None

    return {
        "snippet": top_snippet,
        "justification": generate_justification(top_snippet, statement),
        "confidence": top_score,
    }


# -----------------------------
# PIPELINE PRINCIPAL
# -----------------------------
def main(input_file, output_file):
    """Enrich a graph JSON file with evidence for every node and edge.

    Loads the graph from ``input_file``, stores the result of
    ``process_node`` / ``process_relation`` under the ``"evidence"`` key of
    each node / edge, and writes the whole graph to ``output_file`` once all
    items have been processed.

    Args:
        input_file: Path of the JSON graph to read (UTF-8).
        output_file: Path of the enriched JSON graph to write (UTF-8).
    """
    print("Cargando grafo...")
    with open(input_file, "r", encoding="utf-8") as source_handle:
        graph = json.load(source_handle)

    nodes = graph.get("nodes", [])
    edges = graph.get("edges", [])

    # Index nodes by id so edges can find their source node.
    nodes_by_id = {}
    for entry in nodes:
        nodes_by_id[entry["id"]] = entry

    print("\nProcesando nodos...")
    for node in tqdm(nodes):
        node["evidence"] = process_node(node)

    print("\nProcesando relaciones...")
    for edge in tqdm(edges):
        edge["evidence"] = process_relation(edge, nodes_by_id)

    print("\nGuardando grafo enriquecido...")
    with open(output_file, "w", encoding="utf-8") as target_handle:
        json.dump(graph, target_handle, indent=2, ensure_ascii=False)

    print("\n¡Proceso completado!")



In [None]:
# Graph JSON that main() reads, and the path where it writes the enriched copy.
# NOTE(review): the /content paths look like a Colab working directory — confirm.
input_file = "/content/final_graph_trysnippet_270126.json"
output_file = "/content/final_graph_with_evidence_270126.json"

# Run the full pipeline; the output file is only written after every node and
# edge has been processed.
main(input_file, output_file)

Cargando grafo...

Procesando nodos...


100%|██████████| 133/133 [1:06:39<00:00, 30.07s/it]



Procesando relaciones...


 73%|███████▎  | 290/399 [3:54:11<5:19:59, 176.14s/it]

Versión 2 (ultrarápida)

Este script:
- usa QA extractivo solo en el texto más relevante
- genera la justificación con una plantilla (instantáneo)
- calcula confidence con una fórmula simple (instantáneo)
- reduce el tiempo total entre 10× y 20×
- es ideal para Colab sin GPU
- mantiene el snippet literal, que es lo esencial para tu TFM

Si para ti:
- el snippet literal es lo más importante
- la justificación no necesita ser literaria
- el confidence no necesita ser perfecto

Entonces, sin duda:

⭐ Elige el MODO 1 (ultra rápido)
Es el que te permitirá terminar hoy sin sacrificar lo esencial.

🧩 4. Qué cambia exactamente en el MODO 1
Muy simple:
❌ Quitamos T5-small
→ Justificación generada con una plantilla:
This snippet supports the concept '{concept}' because it explicitly appears in the source text and provides direct evidence for it.


❌ Quitamos embeddings del cálculo de confidence (se siguen usando en pick_best_text para elegir el texto más relevante)
→ Confidence calculado así:
confidence = 1 / (1 + len(snippet))


(Es reproducible, rápido y razonable.)
✔️ QA extractivo solo en el texto más relevante
→ En vez de hacer QA en 3 textos, hacemos QA en 1.


In [None]:
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm import tqdm

# -----------------------------
# MODELOS LIGEROS
# -----------------------------
# Sentence-embedding model; pick_best_text() uses it to rank reference texts
# against the concept.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Extractive question-answering tokenizer/model pair used by extract_snippet().
qa_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")


# -----------------------------
# FUNCIONES BASE
# -----------------------------
def extract_snippet(question, context):
    """Extract the answer span for ``question`` from ``context`` with the QA model.

    The span is restricted to context tokens (never the question or special
    tokens), its end is never placed before its start, and the result is the
    exact substring of ``context`` (original casing and spacing) rather than
    text rebuilt from lower-cased word-pieces.

    Args:
        question: Natural-language question given to the QA model.
        context: Passage to search. It is truncated to the tokenizer's
            maximum length, so only the leading part may be considered.

    Returns:
        The stripped answer text, or ``None`` when no context token survives
        truncation or when tokenization/inference raises an exception.
    """
    try:
        inputs = qa_tokenizer(
            question,
            context,
            return_tensors="pt",
            truncation=True,
            return_offsets_mapping=True,
        )
        # The model's forward pass does not accept offsets: remove them first.
        offsets = inputs.pop("offset_mapping")[0].tolist()
        # sequence_ids() is None for special tokens, 0 for the question and
        # 1 for the context; only context tokens may be part of the answer.
        in_context = torch.tensor([sid == 1 for sid in inputs.sequence_ids(0)])
        if not bool(in_context.any()):
            return None

        # Pure inference: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = qa_model(**inputs)

        neg_inf = float("-inf")
        start_logits = outputs.start_logits[0].masked_fill(~in_context, neg_inf)
        end_logits = outputs.end_logits[0].masked_fill(~in_context, neg_inf)

        start = int(torch.argmax(start_logits))
        # Search the end position only from ``start`` onwards so the span
        # cannot be inverted.
        end = start + int(torch.argmax(end_logits[start:]))

        char_start = offsets[start][0]
        char_end = offsets[end][1]
        return context[char_start:char_end].strip()
    except Exception as exc:
        # Best-effort by design (callers return None when there is no
        # snippet), but surface the failure instead of hiding it completely.
        import warnings

        warnings.warn(f"extract_snippet failed: {exc!r}", RuntimeWarning, stacklevel=2)
        return None


def simple_justification(snippet, concept):
    """Build the fixed-template justification sentence for ``concept``.

    Args:
        snippet: Accepted for signature symmetry; it is not used in the text.
        concept: Concept (or relation statement) quoted in the sentence.

    Returns:
        The justification sentence with ``concept`` inserted.
    """
    template = (
        "This snippet supports the concept '{}' because it explicitly "
        "appears in the source text and provides direct evidence for it."
    )
    return template.format(concept)


def simple_confidence(snippet):
    """Return a length-based confidence: ``1 / (1 + len(snippet))``.

    Shorter snippets score higher; the value is rounded to three decimals.

    Args:
        snippet: Extracted snippet (anything supporting ``len``).

    Returns:
        The rounded confidence as a float.
    """
    length = len(snippet)
    return round(1 / (length + 1), 3)


def pick_best_text(concept, references):
    """Select the reference text most similar to ``concept``.

    Similarity is the cosine similarity between embeddings produced by the
    module-level ``embedder``. References with a missing or empty ``"text"``
    are ignored; on equal similarity the earliest reference wins.

    Args:
        concept: Concept (or relation statement) to match.
        references: Iterable of dicts that may carry a ``"text"`` entry.

    Returns:
        The best-matching text, or ``None`` when no reference has text.
    """
    texts = [ref.get("text", "") for ref in references]
    texts = [text for text in texts if text]
    if not texts:
        return None

    # The concept embedding is the same for every reference: encode it once
    # instead of once per loop iteration.
    concept_vec = embedder.encode(concept, convert_to_tensor=True)

    best_text = None
    # -inf (not -1) so that a text with the minimum cosine similarity of -1
    # can still be selected.
    best_score = float("-inf")
    for text in texts:
        text_vec = embedder.encode(text, convert_to_tensor=True)
        score = util.cos_sim(text_vec, concept_vec).item()
        if score > best_score:
            best_score = score
            best_text = text

    return best_text


# -----------------------------
# PROCESAR NODO
# -----------------------------
def process_node(node):
    """Build the evidence record for a node from its most relevant reference.

    The reference text closest to the node label is selected, the extractive
    QA model is run on that single text, and the justification and
    confidence are derived from the snippet with the template/length
    helpers.

    Args:
        node: Graph node dict. Must contain ``"label"``; reference texts are
            read from ``node["properties"]["references"]``.

    Returns:
        A dict with ``"snippet"``, ``"justification"`` and ``"confidence"``,
        or ``None`` when there are no references, no usable text, or no
        snippet.
    """
    concept = node["label"]
    refs = node.get("properties", {}).get("references", [])

    # Rank the references only when there is something to rank.
    passage = pick_best_text(concept, refs) if refs else None
    if not passage:
        return None

    # Extractive QA on the selected text only.
    snippet = extract_snippet(f"What text supports the concept '{concept}'?", passage)
    if not snippet:
        return None

    return {
        "snippet": snippet,
        "justification": simple_justification(snippet, concept),
        "confidence": simple_confidence(snippet),
    }


# -----------------------------
# PROCESAR RELACIÓN
# -----------------------------
def process_relation(edge, nodes_by_id):
    """Build the evidence record for an edge from its source node's references.

    The statement ``"<source> <relation> <target>"`` is used to select the
    most relevant reference text of the source node; the extractive QA model
    is run on that single text, and the justification and confidence are
    derived from the snippet with the template/length helpers.

    Args:
        edge: Edge dict with ``"source"``, ``"target"`` and ``"relation"``.
        nodes_by_id: Mapping from node id to node dict; ``edge["source"]`` is
            looked up in it.

    Returns:
        A dict with ``"snippet"``, ``"justification"`` and ``"confidence"``,
        or ``None`` when the source node is missing/empty, has no references
        or usable text, or no snippet is found.
    """
    source, target, relation = edge["source"], edge["target"], edge["relation"]

    origin = nodes_by_id.get(source)
    if not origin:
        return None

    refs = origin.get("properties", {}).get("references", [])
    if not refs:
        return None

    # Statement used both to rank the references and in the justification.
    statement = f"{source} {relation} {target}"
    passage = pick_best_text(statement, refs)
    if not passage:
        return None

    # Extractive QA on the selected text only.
    snippet = extract_snippet(
        f"What text supports that {source} {relation.lower()} {target}?",
        passage,
    )
    if not snippet:
        return None

    return {
        "snippet": snippet,
        "justification": simple_justification(snippet, statement),
        "confidence": simple_confidence(snippet),
    }


# -----------------------------
# PIPELINE PRINCIPAL
# -----------------------------
def main(input_file, output_file):
    """Enrich a graph JSON file with evidence for every node and edge.

    Loads the graph from ``input_file``, stores the result of
    ``process_node`` / ``process_relation`` under the ``"evidence"`` key of
    each node / edge, and writes the whole graph to ``output_file`` once all
    items have been processed.

    Args:
        input_file: Path of the JSON graph to read (UTF-8).
        output_file: Path of the enriched JSON graph to write (UTF-8).
    """
    print("Cargando grafo...")
    with open(input_file, "r", encoding="utf-8") as source_handle:
        graph = json.load(source_handle)

    nodes = graph.get("nodes", [])
    edges = graph.get("edges", [])

    # Index nodes by id so edges can find their source node.
    nodes_by_id = {}
    for entry in nodes:
        nodes_by_id[entry["id"]] = entry

    print("\nProcesando nodos...")
    for node in tqdm(nodes):
        evidence = process_node(node)
        node["evidence"] = evidence

    print("\nProcesando relaciones...")
    for edge in tqdm(edges):
        evidence = process_relation(edge, nodes_by_id)
        edge["evidence"] = evidence

    print("\nGuardando grafo enriquecido...")
    with open(output_file, "w", encoding="utf-8") as target_handle:
        json.dump(graph, target_handle, indent=2, ensure_ascii=False)

    print("\n¡Proceso completado!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [None]:
# Graph JSON that main() reads, and the path where it writes the enriched copy.
# NOTE(review): the /content paths look like a Colab working directory — confirm.
input_file = "/content/final_graph_trysnippet_270126.json"
output_file = "/content/final_graph_with_evidence_280126.json"

# Run the full pipeline; the output file is only written after every node and
# edge has been processed.
main(input_file, output_file)

Cargando grafo...

Procesando nodos...


100%|██████████| 133/133 [20:16<00:00,  9.15s/it]



Procesando relaciones...


100%|██████████| 399/399 [1:35:57<00:00, 14.43s/it]



Guardando grafo enriquecido...

¡Proceso completado!
