ENRIQUECIMIENTO DEL GRAFO CON DATOS DEL CORPUS ORIGINAL

In [1]:
!pip install faiss-cpu
import json
from datetime import datetime, timezone

from sentence_transformers import SentenceTransformer, util
import numpy as np
import faiss


GRAPH_PATH = "/content/Grafo_incompleted_abstracts_250126.json"
CORPUS_PATH = "/content/climate_corpus_291225.json"
OUTPUT_PATH = "/content/graph_enriched.json"

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


def load_json(path):
    """Read the UTF-8 encoded JSON file at *path* and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)


def save_json(obj, path):
    """Write *obj* to *path* as pretty-printed, UTF-8 encoded JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
    output is indented by two spaces.

    Raises:
        TypeError / ValueError: if *obj* cannot be serialised by ``json``.
            In that case *path* is left untouched.
    """
    # Serialise before opening the file: opening with "w" truncates it, so
    # a failure halfway through would otherwise destroy or half-write an
    # existing output file.
    payload = json.dumps(obj, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)


def normalize_text(t):
    """Return *t* as a single-spaced string, or "" when it carries no text.

    ``None``, any ``float`` value and the string "nan" (any casing, with or
    without surrounding whitespace) all map to the empty string. Every other
    value is converted with ``str()`` and its whitespace runs are collapsed
    to single spaces.
    """
    # Missing values: None, or a float (NaN presumably arrives this way).
    if t is None or isinstance(t, float):
        return ""
    # Some NaN values may arrive already stringified as "NaN".
    if isinstance(t, str) and t.strip().lower() == "nan":
        return ""
    # split() without arguments already discards leading/trailing whitespace.
    tokens = str(t).split()
    return " ".join(tokens)


# =========================
# CARGA DE DATOS
# =========================

# Parse both input files up front; everything below works on these
# in-memory objects.
graph = load_json(GRAPH_PATH)
corpus_raw = load_json(CORPUS_PATH)

# A missing top-level key falls back to an empty list, so the loops further
# down simply have nothing to iterate over.
docs_list = corpus_raw.get("documents", [])
nodes = graph.get("nodes", [])


# =========================
# SEPARAR ARTÍCULOS vs INFORMES
# =========================

# Both dicts map doc_id (source_id, falling back to corpus_id) -> document.
articles = {}
reports = {}

for doc in docs_list:
    doc_id = doc.get("source_id") or doc.get("corpus_id")
    if not doc_id:
        # Without an identifier the document cannot be referenced later.
        continue

    abstract = normalize_text(doc.get("abstract"))
    full_text = normalize_text(doc.get("full_text"))

    # A usable abstract wins; full_text is only the fallback. Documents with
    # neither usable abstract nor usable full_text are dropped.
    bucket = articles if abstract else (reports if full_text else None)
    if bucket is not None:
        bucket[doc_id] = doc

print(f"Artículos con abstract: {len(articles)}")
print(f"Informes con full_text: {len(reports)}")


# =========================
# ÍNDICE DE ABSTRACTS
# =========================

# Load the sentence-embedding model once; the matchers below reuse it.
model = SentenceTransformer(MODEL_NAME)

# doc_id -> whitespace-normalised abstract for every article.
abstract_texts = {
    doc_id: normalize_text(doc.get("abstract"))
    for doc_id, doc in articles.items()
}

# doc_id -> embedding vector. All abstracts are encoded in ONE batched
# model.encode() call (the same way the report chunks are encoded further
# down) instead of one call per document, which lets the model batch the
# forward passes. The resulting mapping has the same shape as before.
_abstract_ids = list(abstract_texts)
if _abstract_ids:
    _abstract_vectors = model.encode(
        [abstract_texts[doc_id] for doc_id in _abstract_ids]
    )
    abstract_embeddings = dict(zip(_abstract_ids, _abstract_vectors))
else:
    # Nothing to encode: keep an empty index rather than calling the model.
    abstract_embeddings = {}


def match_against_abstracts(fragment_text, min_score=0.8):
    """Find the article whose abstract embedding is closest to *fragment_text*.

    Args:
        fragment_text: Raw reference text; it is passed through
            ``normalize_text`` before being embedded.
        min_score: Minimum cosine similarity required to accept the match.

    Returns:
        ``(doc_id, score)`` for the best-scoring abstract when its cosine
        similarity is at least *min_score*; ``(None, None)`` when the text is
        empty after normalisation, when there are no abstract embeddings, or
        when the best score is below the threshold.
    """
    fragment_text_norm = normalize_text(fragment_text)
    if not fragment_text_norm or not abstract_embeddings:
        return None, None

    emb = model.encode(fragment_text_norm)

    # Score the query against every abstract with a single cos_sim call
    # instead of one call (and one tensor conversion) per abstract.
    doc_ids = list(abstract_embeddings)
    matrix = np.stack([np.asarray(abstract_embeddings[d]) for d in doc_ids])
    scores = util.cos_sim(emb, matrix)[0]

    best_idx = int(scores.argmax())
    best_score = scores[best_idx].item()

    if best_score >= min_score:
        return doc_ids[best_idx], best_score
    return None, None


# =========================
# ÍNDICE DE FULL_TEXT POR CHUNKS (INFORMES)
# =========================

def chunk_text(text, size=400, overlap=80):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``size - overlap`` words apart and contain at
    most *size* words each. An empty (or whitespace-only) text yields ``[]``.

    Args:
        text: The text to split.
        size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than *size*.

    Returns:
        List of chunk strings, words joined by single spaces.

    Raises:
        ValueError: if *size* is not positive or ``overlap >= size``. With
            such values the window would never advance and the loop would
            never terminate.
    """
    step = size - overlap
    if size <= 0 or step <= 0:
        raise ValueError(
            f"size must be positive and greater than overlap "
            f"(got size={size}, overlap={overlap})"
        )

    words = text.split()
    return [
        " ".join(words[start:start + size])
        for start in range(0, len(words), step)
    ]


# Parallel lists: report_metadata[i] describes where report_chunks[i] came from.
report_chunks = []
report_metadata = []

for doc_id, doc in reports.items():
    full_text_norm = normalize_text(doc.get("full_text"))
    if full_text_norm:
        for idx, ch in enumerate(chunk_text(full_text_norm, size=400, overlap=80)):
            report_chunks.append(ch)
            report_metadata.append({"doc_id": doc_id, "chunk_idx": idx})

# The index stays None when no report produced any chunk; the full-text
# matcher checks for that before searching.
index = None
if report_chunks:
    # FAISS is fed a float32 matrix with one row per chunk.
    report_emb_matrix = np.array(model.encode(report_chunks)).astype("float32")

    dim = report_emb_matrix.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(report_emb_matrix)


def match_against_fulltext(fragment_text, k=5, min_score=0.65):
    """Find the report chunk closest to *fragment_text* in the FAISS index.

    The *k* nearest chunks are retrieved and each distance ``d`` returned by
    the index is mapped to a score ``1 / (1 + d)``; the highest-scoring chunk
    is kept.

    Returns:
        ``(doc_id, score, chunk_idx)`` when the best score is at least
        *min_score*; ``(None, None, None)`` when there is no index, the text
        is empty after normalisation, or the best score is below the
        threshold.
    """
    no_match = (None, None, None)
    if index is None:
        return no_match

    query = normalize_text(fragment_text)
    if not query:
        return no_match

    query_vec = model.encode(query).astype("float32")
    distances, positions = index.search(np.array([query_vec]), k)

    top_doc, top_score, top_chunk = None, -1, None
    for dist, pos in zip(distances[0], positions[0]):
        # -1 marks an empty result slot (fewer than k hits available).
        if pos != -1:
            # float() turns the numpy scalar into a plain Python float.
            candidate = float(1 / (1 + dist))
            hit = report_metadata[pos]
            if candidate > top_score:
                top_score = candidate
                top_doc = hit["doc_id"]
                top_chunk = hit["chunk_idx"]

    if top_score >= min_score:
        return top_doc, top_score, top_chunk
    return no_match


# =========================
# ORQUESTADOR
# =========================

def match_reference_text(ref_text):
    """Resolve *ref_text* to a corpus document, trying articles before reports.

    Returns:
        A dict with keys ``doc_id``, ``score``, ``type`` ("article" or
        "report") and ``chunk_idx`` (``None`` for articles), or ``None`` when
        neither matcher accepts the text.
    """
    # Stage 1: article abstracts (cosine similarity threshold 0.8).
    article_id, article_score = match_against_abstracts(ref_text, min_score=0.8)
    if article_id is None:
        # Stage 2: fall back to the chunked full text of the reports.
        report_id, report_score, report_chunk = match_against_fulltext(
            ref_text, k=5, min_score=0.65
        )
        if report_id is None:
            return None
        return {
            "doc_id": report_id,
            "score": report_score,
            "type": "report",
            "chunk_idx": report_chunk,
        }

    return {
        "doc_id": article_id,
        "score": article_score,
        "type": "article",
        "chunk_idx": None,
    }


# =========================
# ENRIQUECER EL GRAFO
# =========================

def enrich_graph(nodes, docs_list, *, matcher=None):
    """Attach corpus metadata to every reference of every node, in place.

    For each reference dict found under ``node["properties"]["references"]``
    that has a non-empty ``"text"``, the text is resolved to a corpus
    document. Matched references receive the document's bibliographic fields
    (``source``, ``source_id``, ``doi``, ``url``, ``title``, ``authors``,
    ``year``, ``pdf_url``), the match details (``match_type``,
    ``match_score``, ``match_chunk_idx``) and traceability fields; unmatched
    ones are only marked ``traceability_status = "unmatched"``. References
    without text are left untouched.

    Args:
        nodes: List of node dicts; mutated in place.
        docs_list: Corpus documents, indexed here by ``source_id`` (falling
            back to ``corpus_id``).
        matcher: Optional callable ``text -> match dict | None`` with the
            same contract as ``match_reference_text``, which is used when
            this argument is omitted.

    Returns:
        None.
    """
    if matcher is None:
        matcher = match_reference_text

    # One timezone-aware UTC timestamp shared by every reference of this run.
    now = datetime.now(timezone.utc).isoformat()

    # Fast lookup doc_id -> document.
    corpus_by_id = {}
    for d in docs_list:
        doc_id = d.get("source_id") or d.get("corpus_id")
        if doc_id:
            corpus_by_id[doc_id] = d

    # Identical reference texts always resolve to the same document, so each
    # distinct text is matched only once per call.
    match_cache = {}

    for node in nodes:
        # `or` also covers keys that are present but null in the JSON.
        props = node.get("properties") or {}
        refs = props.get("references") or []

        for ref in refs:
            text = ref.get("text", "")
            if not text:
                continue

            if isinstance(text, str):
                if text not in match_cache:
                    match_cache[text] = matcher(text)
                match = match_cache[text]
            else:
                # Non-string values may be unhashable; match them directly.
                match = matcher(text)

            if match is None:
                ref["traceability_status"] = "unmatched"
                continue

            doc_id = match["doc_id"]
            doc = corpus_by_id.get(doc_id, {})

            # Preserve the original (artificial) ID. Skip this when the
            # reference was already enriched by a previous run: its
            # "source_id" then holds the corpus id, and copying it would
            # overwrite the genuine legacy value.
            already_enriched = ref.get("traceability_status") == "matched"
            if (
                "source_id" in ref
                and not already_enriched
                and "legacy_source_id" not in ref
            ):
                ref["legacy_source_id"] = ref["source_id"]

            ref["source_doc_id"] = doc_id
            ref["source"] = doc.get("source")  # arxiv, ipcc, etc.
            ref["source_id"] = doc.get("source_id") or doc_id
            ref["doi"] = doc.get("doi")
            ref["url"] = doc.get("url")
            ref["title"] = doc.get("title")
            ref["authors"] = doc.get("authors_str") or doc.get("authors")
            ref["year"] = doc.get("year")
            ref["pdf_url"] = doc.get("pdf_url")

            ref["match_type"] = match["type"]
            ref["match_score"] = match["score"]
            ref["match_chunk_idx"] = match["chunk_idx"]
            ref["ingestion_method"] = "reconstructed_from_corpus_json"
            ref["traceability_status"] = "matched"
            ref["traceability_timestamp"] = now


# Annotate the reference dicts reachable from `nodes` in place, then write
# the whole graph object to OUTPUT_PATH.
enrich_graph(nodes, docs_list)
save_json(graph, OUTPUT_PATH)

print(f"Grafo enriquecido guardado en: {OUTPUT_PATH}")

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.8/23.8 MB 88.3 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2
Artículos con abstract: 2253
Informes con full_text: 397


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Grafo enriquecido guardado en: /content/graph_enriched.json
