## Examen Segundo Bimestre RI

Preprocesamiento

In [2]:
import nltk
import pandas as pd
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:

# Descarga de recursos necesarios de NLTK
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [4]:
df = pd.read_json('arxiv-metadata-oai-snapshot.json', lines=True)

In [5]:
df = df[['title', 'abstract']]

In [6]:
df['document'] = df['title'] + '. ' + df['abstract']

In [7]:
df[['document']].head()

Unnamed: 0,document
0,Calculation of prompt diphoton production cros...
1,Sparsity-certifying Graph Decompositions. We...
2,The evolution of the Earth-Moon system based o...
3,A determinant of Stirling cycle numbers counts...
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...


In [8]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def preprocess_documents(documents, return_type='df'):
    df = pd.DataFrame(documents, columns=['document'])
    df['regex_tokens'] = df['document'].str.lower().apply(
        lambda text: regexp_tokenize(text, pattern=r'\w[a-z]+')
    )
    df['no_stopwords'] = df['regex_tokens'].apply(remove_stopwords)
    df['lemmas'] = df['no_stopwords'].apply(lemmatize_tokens)
    df['prep_doc'] = df['lemmas'].str.join(' ')
    return df[['document', 'regex_tokens', 'no_stopwords', 'lemmas', 'prep_doc']] if return_type == 'df' else df['lemmas'].tolist()

def preprocess_both(text: str) -> tuple[str, list[str]]:
    df = preprocess_documents([text])
    clean = df['prep_doc'].iloc[0]
    tokens = preprocess_documents([text], return_type='tokens')[0]
    return clean, tokens


In [10]:
subset = df.head()['document'].tolist()
preprocessed_df = preprocess_documents(subset)

In [11]:
preprocessed_df.head()

Unnamed: 0,document,regex_tokens,no_stopwords,lemmas,prep_doc
0,Calculation of prompt diphoton production cros...,"[calculation, of, prompt, diphoton, production...","[calculation, prompt, diphoton, production, cr...","[calculation, prompt, diphoton, production, cr...",calculation prompt diphoton production cross s...
1,Sparsity-certifying Graph Decompositions. We...,"[sparsity, certifying, graph, decompositions, ...","[sparsity, certifying, graph, decompositions, ...","[sparsity, certifying, graph, decomposition, d...",sparsity certifying graph decomposition descri...
2,The evolution of the Earth-Moon system based o...,"[the, evolution, of, the, earth, moon, system,...","[evolution, earth, moon, system, based, dark, ...","[evolution, earth, moon, system, based, dark, ...",evolution earth moon system based dark matter ...
3,A determinant of Stirling cycle numbers counts...,"[determinant, of, stirling, cycle, numbers, co...","[determinant, stirling, cycle, numbers, counts...","[determinant, stirling, cycle, number, count, ...",determinant stirling cycle number count unlabe...
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,"[from, dyadic, lambda, alpha, to, lambda, alph...","[dyadic, lambda, alpha, lambda, alpha, paper, ...","[dyadic, lambda, alpha, lambda, alpha, paper, ...",dyadic lambda alpha lambda alpha paper show co...


Implementacion de TF-IDF y BM25

In [13]:
# Documentos en formato texto para TF/TF-IDF
prep_texts = preprocessed_df['prep_doc'].tolist()

In [14]:
# Documentos tokenizados para BM25
prep_tokens = preprocessed_df['lemmas'].tolist()

Contruccion de indices en TF-IDF

In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from src.perf_metrics import execute_time

import numpy as np
from rank_bm25 import BM25Okapi

In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(prep_texts)


In [53]:
def search_tfidf(query, top_k=10):
    clean_query, _ = preprocess_both(query)
    query_vec = tfidf_vectorizer.transform([clean_query])
    scores = cosine_similarity(tfidf_matrix, query_vec).flatten()
    top_indices = scores.argsort()[::-1][:top_k]

    results = []
    print(f"\n🔹 TF-IDF results for: '{query}'\n")
    for i, idx in enumerate(top_indices):
        title = df.iloc[idx]['title']
        abstract = df.iloc[idx]['abstract']
        abstract_snippet = abstract[:300].strip().replace('\n', ' ')
        print(f"{i+1}. [{idx}] {title}\n{abstract_snippet}\n")

        results.append({
            "index": idx,
            "title": title,
            "abstract": abstract
        })

    return top_indices.tolist(), results



Indices de BM25

In [None]:
bm25_model = BM25Okapi(prep_tokens)


In [58]:
def search_bm25(query, top_k=10):
    _, query_tokens = preprocess_both(query)
    scores = bm25_model.get_scores(query_tokens)
    top_indices = np.argsort(scores)[::-1][:top_k]

    results = []
    print(f"\n🔹 BM25 results for: '{query}'\n")
    for i, idx in enumerate(top_indices):
        title = df.iloc[idx]['title']
        abstract = df.iloc[idx]['abstract']
        abstract_snippet = abstract[:300].strip().replace('\n', ' ')
        print(f"{i+1}. [{idx}] {title}\n{abstract_snippet}\n")

        results.append({
            "index": idx,
            "title": title,
            "abstract": abstract
        })

    return top_indices.tolist(), results


FAISS

In [22]:

import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

model = SentenceTransformer('all-MiniLM-L6-v2')  


embeddings = model.encode(prep_texts, convert_to_numpy=True)


dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings) 

In [60]:
def search_faiss(query, top_k=10):
    clean_query, _ = preprocess_both(query)
    query_vec = model.encode([clean_query], convert_to_numpy=True)
    distances, indices = index.search(query_vec, top_k)

    results = []
    print(f"\n🔹 FAISS results for: '{query}'\n")
    for i, idx in enumerate(indices[0]):
        title = df.iloc[idx]['title']
        abstract = df.iloc[idx]['abstract']
        abstract_snippet = abstract[:300].strip().replace('\n', ' ')
        print(f"{i+1}. [{idx}] {title}\n{abstract_snippet}\n")

        results.append({
            "index": idx,
            "title": title,
            "abstract": abstract
        })

    return indices[0].tolist(), results


Recuperacion

In [77]:
query = "Sparsity-certifying Graph Decompositions"

In [78]:
search_tfidf(query, top_k=10)




🔹 TF-IDF results for: 'Sparsity-certifying Graph Decompositions'

1. [1] Sparsity-certifying Graph Decompositions
We describe a new algorithm, the $(k,\ell)$-pebble game with colors, and use it obtain a characterization of the family of $(k,\ell)$-sparse graphs and algorithmic solutions to a family of problems concerning tree decompositions of graphs. Special instances of sparse graphs appear in rigidity theo

2. [4] From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\alpha}$
In this paper we show how to compute the $\Lambda_{\alpha}$ norm, $\alpha\ge 0$, using the dyadic grid. This result is a consequence of the description of the Hardy spaces $H^p(R^N)$ in terms of dyadic and special atoms.

3. [3] A determinant of Stirling cycle numbers counts unlabeled acyclic
  single-source automata
We show that a determinant of Stirling cycle numbers counts unlabeled acyclic single-source automata. The proof involves a bijection from these automata to certain marked lattice paths and a sign-reversing 

([1, 4, 3, 2, 0],
 [{'index': np.int64(1),
   'title': 'Sparsity-certifying Graph Decompositions',
   'abstract': '  We describe a new algorithm, the $(k,\\ell)$-pebble game with colors, and use\nit obtain a characterization of the family of $(k,\\ell)$-sparse graphs and\nalgorithmic solutions to a family of problems concerning tree decompositions of\ngraphs. Special instances of sparse graphs appear in rigidity theory and have\nreceived increased attention in recent years. In particular, our colored\npebbles generalize and strengthen the previous results of Lee and Streinu and\ngive a new proof of the Tutte-Nash-Williams characterization of arboricity. We\nalso present a new decomposition that certifies sparsity based on the\n$(k,\\ell)$-pebble game with colors. Our work also exposes connections between\npebble game algorithms and previous sparse graph algorithms by Gabow, Gabow and\nWestermann and Hendrickson.\n'},
  {'index': np.int64(4),
   'title': 'From dyadic $\\Lambda_{\\alpha}

In [61]:
search_bm25(query, top_k=10)



🔹 BM25 results for: 'machine learning'

1. [4] From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\alpha}$
In this paper we show how to compute the $\Lambda_{\alpha}$ norm, $\alpha\ge 0$, using the dyadic grid. This result is a consequence of the description of the Hardy spaces $H^p(R^N)$ in terms of dyadic and special atoms.

2. [3] A determinant of Stirling cycle numbers counts unlabeled acyclic
  single-source automata
We show that a determinant of Stirling cycle numbers counts unlabeled acyclic single-source automata. The proof involves a bijection from these automata to certain marked lattice paths and a sign-reversing involution to evaluate the determinant.

3. [2] The evolution of the Earth-Moon system based on the dark matter field
  fluid model
The evolution of Earth-Moon system is described by the dark matter field fluid model proposed in the Meeting of Division of Particle and Field 2004, American Physical Society. The current behavior of the Earth-Moon system agrees with this mod

([4, 3, 2, 1, 0],
 [{'index': np.int64(4),
   'title': 'From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\alpha}$',
   'abstract': '  In this paper we show how to compute the $\\Lambda_{\\alpha}$ norm, $\\alpha\\ge\n0$, using the dyadic grid. This result is a consequence of the description of\nthe Hardy spaces $H^p(R^N)$ in terms of dyadic and special atoms.\n'},
  {'index': np.int64(3),
   'title': 'A determinant of Stirling cycle numbers counts unlabeled acyclic\n  single-source automata',
   'abstract': '  We show that a determinant of Stirling cycle numbers counts unlabeled acyclic\nsingle-source automata. The proof involves a bijection from these automata to\ncertain marked lattice paths and a sign-reversing involution to evaluate the\ndeterminant.\n'},
  {'index': np.int64(2),
   'title': 'The evolution of the Earth-Moon system based on the dark matter field\n  fluid model',
   'abstract': "  The evolution of Earth-Moon system is described by the dark matter field\nfluid model pro

In [62]:
search_faiss(query, top_k=10)


🔹 FAISS results for: 'machine learning'

1. [1] Sparsity-certifying Graph Decompositions
We describe a new algorithm, the $(k,\ell)$-pebble game with colors, and use it obtain a characterization of the family of $(k,\ell)$-sparse graphs and algorithmic solutions to a family of problems concerning tree decompositions of graphs. Special instances of sparse graphs appear in rigidity theo

2. [0] Calculation of prompt diphoton production cross sections at Tevatron and
  LHC energies
A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as a

3. [2] The evolution of the Earth-Moon system based on the dark matter field
  fluid model
The evolution of Earth-Moon system is described by the dark matter field fluid model proposed in the Meeting of Division of 

([1, 0, 2, 4, 3, -1, -1, -1, -1, -1],
 [{'index': np.int64(1),
   'title': 'Sparsity-certifying Graph Decompositions',
   'abstract': '  We describe a new algorithm, the $(k,\\ell)$-pebble game with colors, and use\nit obtain a characterization of the family of $(k,\\ell)$-sparse graphs and\nalgorithmic solutions to a family of problems concerning tree decompositions of\ngraphs. Special instances of sparse graphs appear in rigidity theory and have\nreceived increased attention in recent years. In particular, our colored\npebbles generalize and strengthen the previous results of Lee and Streinu and\ngive a new proof of the Tutte-Nash-Williams characterization of arboricity. We\nalso present a new decomposition that certifies sparsity based on the\n$(k,\\ell)$-pebble game with colors. Our work also exposes connections between\npebble game algorithms and previous sparse graph algorithms by Gabow, Gabow and\nWestermann and Hendrickson.\n'},
  {'index': np.int64(0),
   'title': 'Calculation

RETRIEVAL AUMENTED GENERATION

In [92]:
from dotenv import load_dotenv
from openai import OpenAI
import os

In [93]:
# Cargar las variables de entorno desde el archivo .env
load_dotenv()
# Objeto tipo cliente de OpenAi
api_key = os.getenv("CHATGPT_API_KEY")
client = OpenAI(api_key=api_key)

In [94]:
indices, context_items = search_tfidf(query, top_k=3)
context_tfidf = "\n\n---\n\n".join(
    f"{item['title']}\n{item['abstract']}" for item in context_items
)

print("🔹 Contexto generado para LLM:\n")
print(context_tfidf)



🔹 TF-IDF results for: 'Sparsity-certifying Graph Decompositions'

1. [1] Sparsity-certifying Graph Decompositions
We describe a new algorithm, the $(k,\ell)$-pebble game with colors, and use it obtain a characterization of the family of $(k,\ell)$-sparse graphs and algorithmic solutions to a family of problems concerning tree decompositions of graphs. Special instances of sparse graphs appear in rigidity theo

2. [4] From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\alpha}$
In this paper we show how to compute the $\Lambda_{\alpha}$ norm, $\alpha\ge 0$, using the dyadic grid. This result is a consequence of the description of the Hardy spaces $H^p(R^N)$ in terms of dyadic and special atoms.

3. [3] A determinant of Stirling cycle numbers counts unlabeled acyclic
  single-source automata
We show that a determinant of Stirling cycle numbers counts unlabeled acyclic single-source automata. The proof involves a bijection from these automata to certain marked lattice paths and a sign-reversing 

Contexto BM25

In [95]:
indices_bm25, context_items_bm25 = search_bm25(query, top_k=3)

context_bm25 = "\n\n---\n\n".join(
    f"{item['title']}\n{item['abstract']}" for item in context_items_bm25
)

print("🔹 Contexto BM25 para LLM:\n")
print(context_bm25)



🔹 BM25 results for: 'Sparsity-certifying Graph Decompositions'

1. [1] Sparsity-certifying Graph Decompositions
We describe a new algorithm, the $(k,\ell)$-pebble game with colors, and use it obtain a characterization of the family of $(k,\ell)$-sparse graphs and algorithmic solutions to a family of problems concerning tree decompositions of graphs. Special instances of sparse graphs appear in rigidity theo

2. [4] From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\alpha}$
In this paper we show how to compute the $\Lambda_{\alpha}$ norm, $\alpha\ge 0$, using the dyadic grid. This result is a consequence of the description of the Hardy spaces $H^p(R^N)$ in terms of dyadic and special atoms.

3. [3] A determinant of Stirling cycle numbers counts unlabeled acyclic
  single-source automata
We show that a determinant of Stirling cycle numbers counts unlabeled acyclic single-source automata. The proof involves a bijection from these automata to certain marked lattice paths and a sign-reversing in

Contexto FAISS

In [96]:
indices_faiss, context_items_faiss = search_faiss(query, top_k=3)

context_faiss = "\n\n---\n\n".join(
    f"{item['title']}\n{item['abstract']}" for item in context_items_faiss
)

print("🔹 Contexto FAISS para LLM:\n")
print(context_faiss)



🔹 FAISS results for: 'Sparsity-certifying Graph Decompositions'

1. [1] Sparsity-certifying Graph Decompositions
We describe a new algorithm, the $(k,\ell)$-pebble game with colors, and use it obtain a characterization of the family of $(k,\ell)$-sparse graphs and algorithmic solutions to a family of problems concerning tree decompositions of graphs. Special instances of sparse graphs appear in rigidity theo

2. [3] A determinant of Stirling cycle numbers counts unlabeled acyclic
  single-source automata
We show that a determinant of Stirling cycle numbers counts unlabeled acyclic single-source automata. The proof involves a bijection from these automata to certain marked lattice paths and a sign-reversing involution to evaluate the determinant.

3. [0] Calculation of prompt diphoton production cross sections at Tevatron and
  LHC energies
A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders

In [97]:
print(context_faiss)

Sparsity-certifying Graph Decompositions
  We describe a new algorithm, the $(k,\ell)$-pebble game with colors, and use
it obtain a characterization of the family of $(k,\ell)$-sparse graphs and
algorithmic solutions to a family of problems concerning tree decompositions of
graphs. Special instances of sparse graphs appear in rigidity theory and have
received increased attention in recent years. In particular, our colored
pebbles generalize and strengthen the previous results of Lee and Streinu and
give a new proof of the Tutte-Nash-Williams characterization of arboricity. We
also present a new decomposition that certifies sparsity based on the
$(k,\ell)$-pebble game with colors. Our work also exposes connections between
pebble game algorithms and previous sparse graph algorithms by Gabow, Gabow and
Westermann and Hendrickson.


---

A determinant of Stirling cycle numbers counts unlabeled acyclic
  single-source automata
  We show that a determinant of Stirling cycle numbers counts un

In [98]:
tfidf_prompt = f"""
Tu rol es el de una aplicación de tipo Retrieval-Augmented Generation (RAG). Siempre debes responder en **español**, siguiendo las instrucciones con precisión.

## Instrucciones:
- Usa **solo** el contexto proporcionado.
- Si encuentras artículos relevantes, incluye:
  - Título: en español si es posible
  - Resumen: traducido al español de forma clara
- Si **no** hay información relevante en el contexto, responde exactamente con:
  "Lo siento, no encontré una publicación relacionada con tu consulta."

## Contexto:
{context_tfidf}

## Pregunta del usuario:
{query}
"""




In [99]:
tdifd_response = client.responses.create(
    model="gpt-4.1",
    input= tfidf_prompt
)

print(tdifd_response.output_text)

Título: Descomposiciones de grafos que certifican la esparsidad

Resumen: El artículo describe un nuevo algoritmo llamado el "juego de guijarros $(k,\ell)$ con colores", que se utiliza para caracterizar la familia de grafos $(k,\ell)$-dispersos (esparsos) y para resolver algorítmicamente problemas relacionados con descomposiciones tipo árbol de grafos. Los grafos dispersos tienen aplicaciones en la teoría de rigidez y últimamente han recibido mayor atención. El método de guijarros coloreados generaliza y mejora resultados previos, especialmente los de Lee y Streinu, y proporciona una nueva demostración de la caracterización de la arboricidad de Tutte-Nash-Williams. Además, presentan una nueva descomposición que certifica la esparsidad basada en su juego de guijarros coloreados, mostrando también conexiones con algoritmos previos de Gabow, Gabow y Westermann, y Hendrickson para grafos dispersos.


In [100]:
bm25_prompt = f"""
Tu rol es el de una aplicación de tipo Retrieval-Augmented Generation (RAG). Siempre debes responder en **español**, siguiendo las instrucciones con precisión.

## Instrucciones:
- Usa **solo** el contexto proporcionado.
- Si encuentras artículos relevantes, incluye:
  - Título: en español si es posible
  - Resumen: traducido al español de forma clara
- Si **no** hay información relevante en el contexto, responde exactamente con:
  "Lo siento, no encontré una publicación relacionada con tu consulta."

## Contexto:
{context_bm25}

## Pregunta del usuario:
{query}
"""




In [101]:
bm25_response = client.responses.create(
    model="gpt-4.1",
    input= bm25_prompt
)

print(bm25_response.output_text)

Título: Descomposiciones de grafos certificadoras de esparsidad

Resumen: Se presenta un nuevo algoritmo llamado el "juego de fichas $(k,\ell)$ con colores", que permite caracterizar la familia de grafos $(k,\ell)$-dispersos (esparsos) y proporciona soluciones algorítmicas a problemas relacionados con descomposiciones arbóreas de grafos. Los grafos esparsos tienen aplicaciones en la teoría de la rigidez y han recibido atención creciente. El uso de fichas coloreadas generaliza y mejora resultados previos, incluyendo una nueva demostración de la caracterización de arborecidad de Tutte-Nash-Williams. Además, se introduce una nueva descomposición que certifica la esparsidad utilizando este juego de fichas, y se revelan conexiones entre estos algoritmos y algoritmos anteriores para grafos esparsos.


In [102]:
faiss_prompt = f"""
Tu rol es el de una aplicación de tipo Retrieval-Augmented Generation (RAG). Siempre debes responder en **español**, siguiendo las instrucciones con precisión.

## Instrucciones:
- Usa **solo** el contexto proporcionado.
- Si encuentras artículos relevantes, incluye:
  - Título: en español si es posible
  - Resumen: traducido al español de forma clara
- Si **no** hay información relevante en el contexto, responde exactamente con:
  "Lo siento, no encontré una publicación relacionada con tu consulta."

## Contexto:
{context_faiss}

## Pregunta del usuario:
{query}
"""




In [103]:
faiss_response = client.responses.create(
    model="gpt-4.1",
    input= faiss_prompt
)

print(faiss_response.output_text)

Título: Descomposiciones de grafos que certifican la esparcidad

Resumen: Se describe un nuevo algoritmo denominado "el juego de fichas $(k,\ell)$ con colores", utilizado para caracterizar la familia de grafos $(k,\ell)$-esparsos y proporcionar soluciones algorítmicas a problemas relacionados con descomposiciones arbóreas de grafos. Este tipo de grafos esparcidos tienen aplicaciones en la teoría de rigidez y han sido objeto de atención reciente. El método propuesto generaliza y refuerza resultados previos de Lee y Streinu, además de ofrecer una nueva demostración de la caracterización de la arboricidad por Tutte-Nash-Williams. También se presenta una nueva descomposición que certifica la esparcidad con base en el juego de fichas coloreadas, mostrando conexiones entre este enfoque y algoritmos previos sobre grafos esparsos.


Comparar documentos en común y diferencias de ordenamiento

In [109]:
def comparar_resultados(indices_tfidf, indices_bm25, indices_faiss):
    set_tfidf = set(indices_tfidf)
    set_bm25 = set(indices_bm25)
    set_faiss = set(indices_faiss)

    print("\n Documentos en común (intersecciones):\n")
    print(f"TF-IDF ∩ BM25: {set_tfidf & set_bm25}")
    print(f"TF-IDF ∩ FAISS: {set_tfidf & set_faiss}")
    print(f"BM25 ∩ FAISS: {set_bm25 & set_faiss}")

    print("\n Cantidad de coincidencias en top-k:\n")
    print(f"TF-IDF vs BM25: {len(set_tfidf & set_bm25)} coincidencias")
    print(f"TF-IDF vs FAISS: {len(set_tfidf & set_faiss)} coincidencias")
    print(f"BM25 vs FAISS: {len(set_bm25 & set_faiss)} coincidencias")

    print("\n Orden de documentos por método:\n")
    print(f"TF-IDF: {indices_tfidf}")
    print(f"BM25: {indices_bm25}")
    print(f"FAISS: {indices_faiss}")


In [110]:
comparar_resultados(indices, indices_bm25, indices_faiss)



 Documentos en común (intersecciones):

TF-IDF ∩ BM25: {1, 3, 4}
TF-IDF ∩ FAISS: {1, 3}
BM25 ∩ FAISS: {1, 3}

 Cantidad de coincidencias en top-k:

TF-IDF vs BM25: 3 coincidencias
TF-IDF vs FAISS: 2 coincidencias
BM25 vs FAISS: 2 coincidencias

 Orden de documentos por método:

TF-IDF: [1, 4, 3]
BM25: [1, 4, 3]
FAISS: [1, 3, 0]


Medicion de similitud de Rankings

In [111]:
def jaccard_sim(set1, set2):
    return len(set1 & set2) / len(set1 | set2)

print("\n Similitud de Jaccard entre rankings:")
print("TF-IDF vs BM25:", jaccard_sim(set(indices), set(indices_bm25)))
print("TF-IDF vs FAISS:", jaccard_sim(set(indices), set(indices_faiss)))
print("BM25 vs FAISS:", jaccard_sim(set(indices_bm25), set(indices_faiss)))



 Similitud de Jaccard entre rankings:
TF-IDF vs BM25: 1.0
TF-IDF vs FAISS: 0.5
BM25 vs FAISS: 0.5


Evaluacion de respuestas del LLM

In [135]:
def evaluar_respuesta_llm(respuesta, context_items, query):
    from difflib import SequenceMatcher

 
    titulos = [item["title"].lower() for item in context_items]
    titulo_mencionado = any(titulo in respuesta.lower() for titulo in titulos)


    palabras_clave = query.lower().split()
    tokens_resp = respuesta.lower().split()
    coincidencias = sum(1 for w in palabras_clave if w in tokens_resp)

    print("\n Evaluación básica de la respuesta:")
    print("¿Se menciona algún título del contexto?:", "SI" if titulo_mencionado else "NO")
    print("¿Coincidencias con palabras clave de la query?:", f"{coincidencias}/{len(palabras_clave)}")


In [None]:
print("\n Evaluación de la respuesta TF-IDF:")
evaluar_respuesta_llm(tdifd_response.output_text, context_items, query)
print("\n Evaluación de la respuesta BM25:")
evaluar_respuesta_llm(bm25_response.output_text, context_items_bm25, query)
print("\n Evaluación de la respuesta FAISS:")
evaluar_respuesta_llm(faiss_response.output_text, context_items_faiss, query)



 Evaluación de la respuesta TF-IDF:

 Evaluación básica de la respuesta:
¿Se menciona algún título del contexto?: NO
¿Coincidencias con palabras clave de la query?: 0/3

 Evaluación de la respuesta BM25:

 Evaluación básica de la respuesta:
¿Se menciona algún título del contexto?: NO
¿Coincidencias con palabras clave de la query?: 0/3

 Evaluación de la respuesta FAISS:

 Evaluación básica de la respuesta:
¿Se menciona algún título del contexto?: NO
¿Coincidencias con palabras clave de la query?: 0/3


Tabla comparativa

In [None]:
def ranking_comparison_simple(query, top_k=10):

    indices_tfidf, _ = search_tfidf(query, top_k=top_k)
    indices_bm25, _ = search_bm25(query, top_k=top_k)
    indices_faiss, _ = search_faiss(query, top_k=top_k)

 
    df_simple = pd.DataFrame({
        'Rank': list(range(1, top_k + 1)),
        'TF-IDF': indices_tfidf,
        'BM25': indices_bm25,
        'FAISS': indices_faiss
    })


    set_tfidf = set(indices_tfidf)
    set_bm25 = set(indices_bm25)
    set_faiss = set(indices_faiss)

    print(f"\n🔎 Coincidencias entre modelos para la query: '{query}'")
    print(f"- TF-IDF & BM25: {len(set_tfidf & set_bm25)} coincidencias")
    print(f"- TF-IDF & FAISS: {len(set_tfidf & set_faiss)} coincidencias")
    print(f"- BM25 & FAISS: {len(set_bm25 & set_faiss)} coincidencias")
    print(f"- Común a los tres: {len(set_tfidf & set_bm25 & set_faiss)} coincidencias")

    return df_simple



In [134]:
tabla= ranking_comparison_simple(query, top_k=5)
tabla


🔹 TF-IDF results for: 'Sparsity-certifying Graph Decompositions'

1. [1] Sparsity-certifying Graph Decompositions
We describe a new algorithm, the $(k,\ell)$-pebble game with colors, and use it obtain a characterization of the family of $(k,\ell)$-sparse graphs and algorithmic solutions to a family of problems concerning tree decompositions of graphs. Special instances of sparse graphs appear in rigidity theo

2. [4] From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\alpha}$
In this paper we show how to compute the $\Lambda_{\alpha}$ norm, $\alpha\ge 0$, using the dyadic grid. This result is a consequence of the description of the Hardy spaces $H^p(R^N)$ in terms of dyadic and special atoms.

3. [3] A determinant of Stirling cycle numbers counts unlabeled acyclic
  single-source automata
We show that a determinant of Stirling cycle numbers counts unlabeled acyclic single-source automata. The proof involves a bijection from these automata to certain marked lattice paths and a sign-reversing 

Unnamed: 0,Rank,TF-IDF,BM25,FAISS
0,1,1,1,1
1,2,4,4,3
2,3,3,3,0
3,4,2,2,4
4,5,0,0,2
