# Tarea 1

## Recuperación ranqueada y vectorización de documentos (RRDV) usando GENSIM

In [17]:
!pip install gensim



In [19]:
pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/77/85/bff3a1e818ec6aa3dd466ff4f4b0a727db9fdb41f2e849747ad902ddbe95/scikit_learn-1.3.0-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.3.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=2.0.0 from https://files.pythonhosted.org/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b712eea58ca779a72865/threadpoolctl-3.2.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.0-cp311-cp311-win_amd64.whl (9.2 MB)
   ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
    --------------------------------------- 0.1/9.2 MB 1.8 MB/s eta 0:00:06
   ---- ----------------------------------- 1.1/9.2 MB 10.3 MB

In [1]:
import os
import zipfile
import xml.etree.ElementTree as ET
import numpy as np
import nltk
from gensim.parsing.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models, similarities  # Importar Gensim

In [2]:
# Zip archives to unpack; the relative paths are resolved against the current
# working directory.
compressed_files = [
    'docs-raw-texts.zip',
    'queries-raw-texts.zip',
]

In [3]:
# Extract each archive into a folder named after the archive (without ".zip").
import shutil  # needed only for cleaning up an interrupted extraction

for compressed_file in compressed_files:
    folder_name = os.path.splitext(compressed_file)[0]  # Remove the ".zip" extension
    target_folder = folder_name

    # An existing folder means the archive was already extracted; skip it
    # without touching (or even requiring) the zip file.
    if os.path.exists(target_folder):
        continue

    # Extract into a scratch folder and rename it only once extraction has
    # finished, so an interrupted run never leaves a half-filled target folder
    # that later runs would mistake for a complete one.
    staging_folder = target_folder + ".partial"
    shutil.rmtree(staging_folder, ignore_errors=True)  # leftovers from a failed run
    with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
        zip_ref.extractall(staging_folder)
    os.replace(staging_folder, target_folder)

print("Extracción completada")

Extracción completada


In [4]:
# Locations of the input data; change them here if needed.
xml_files_directory = 'docs-raw-texts'  # folder scanned for document .naf files
queries_directory = "queries-raw-texts"  # folder scanned for query .naf files
# Despite the "_directory" suffix this is the path of a file, opened directly later on.
relevance_judgments_directory = "relevance-judgments.tsv"

In [5]:
# NLTK setup
# Tokens are maximal runs of word characters (regex \w+); punctuation and
# whitespace act as separators and are discarded.
tokenizer = RegexpTokenizer(r'\w+')

# The stopword corpus is distributed separately from the nltk package. On an
# environment where it was never downloaded the lookup raises LookupError, so
# fetch it once and retry instead of failing on a fresh kernel.
try:
    stop_words = set(nltk.corpus.stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(nltk.corpus.stopwords.words('english'))

def extract_raw_text(xml_path: str, title: bool = False) -> str:
    """Extract the raw text from a .naf (XML) file.

    Args:
        xml_path (str): Path to the .naf file.
        title (bool): When True, the ``title`` attribute of the
            ``nafHeader/fileDesc`` element is prepended to the raw text,
            separated by ", ".

    Returns:
        str: The text of the ``raw`` element, optionally prefixed by the title.
        A missing or empty ``raw`` element contributes an empty string, and a
        missing ``fileDesc`` element or ``title`` attribute means no prefix is
        added, so the result is always a string.

    Raises:
        OSError: If the file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Parse once; both the title and the body come from the same tree.
    root = ET.parse(xml_path).getroot()

    raw_element = root.find('raw')
    # ``.text`` is None for an empty element such as <raw/>.
    raw_text = raw_element.text if raw_element is not None and raw_element.text else ""

    if not title:
        return raw_text

    file_desc = root.find(".//nafHeader/fileDesc")
    document_title = file_desc.get("title") if file_desc is not None else None
    if document_title is None:
        return raw_text

    return document_title + ", " + raw_text

In [6]:
# Preprocessing shared by every model input (queries and documents)
def preprocess_text(text: str) -> list[str]:
    """Lower-case, tokenize, drop stopwords and stem a text.

    Args:
        text (str): The text to preprocess.

    Returns:
        List: The stemmed, non-stopword tokens in their original order.
    """
    # Normalize: trim surrounding whitespace and lower-case everything.
    normalized = text.strip().lower()

    # Tokenize with the module-level regex tokenizer, discarding stopwords
    # before stemming.
    kept_words = []
    for candidate in tokenizer.tokenize(normalized):
        if candidate in stop_words:
            continue
        kept_words.append(candidate)

    # Stem each remaining token with Gensim's Porter stemmer.
    return PorterStemmer().stem_documents(kept_words)

In [7]:
# Inverted index: term -> list of the files that contain it
inverted_index = {}

# Term frequencies: term -> {file name: number of occurrences in that file}
term_freq_per_document = {}

# Iterate over the .naf files in the documents directory
for filename in os.listdir(xml_files_directory):
    if not filename.endswith('.naf'):
        continue

    xml_path = os.path.join(xml_files_directory, filename)
    content = extract_raw_text(xml_path, title=True)
    # Preprocess the content
    preprocessed_tokens = preprocess_text(content)

    # Count every term once per document. This replaces a linear
    # "filename not in postings" scan for every single token, and keeps the
    # terms in order of first appearance.
    term_counts = {}
    for term in preprocessed_tokens:
        term_counts[term] = term_counts.get(term, 0) + 1

    # Each file is visited exactly once, so it can be appended to the
    # postings list without a membership check.
    for term, count in term_counts.items():
        inverted_index.setdefault(term, []).append(filename)
        term_freq_per_document.setdefault(term, {})[filename] = count

print("Inverted index created.")

Inverted index created.


In [8]:
# Build the Gensim dictionary, the bag-of-words corpus and the TF-IDF model.
# Every .naf file in the documents directory contributes one token list
# (title included), in os.listdir order.
documents = [
    preprocess_text(
        extract_raw_text(os.path.join(xml_files_directory, entry), title=True)
    )
    for entry in os.listdir(xml_files_directory)
    if entry.endswith('.naf')
]
dictionary = corpora.Dictionary(documents)
corpus = list(map(dictionary.doc2bow, documents))
tfidf = models.TfidfModel(corpus)

In [9]:
def preprocess_input(query_string):
    """Preprocess a query (or document) and count its terms.

    Args:
        query_string: Either an iterable of term strings, which are joined
            with single spaces before preprocessing, or a single string, which
            is preprocessed as is.

    Returns:
        dict: Maps each preprocessed term to its number of occurrences, with
        keys in order of first appearance.
    """
    # A bare string must not be joined: ' '.join("abc") would yield "a b c"
    # and every character would become its own token.
    if not isinstance(query_string, str):
        query_string = ' '.join(query_string)

    # Single pass instead of calling list.count() once per token.
    term_counts = {}
    for term in preprocess_text(query_string):
        term_counts[term] = term_counts.get(term, 0) + 1
    return term_counts

# Function to calculate TF-IDF vector for a query
def calculate_tfidf_vector(input, dictionary, tfidf_model):
    """Turn an input into a sparse TF-IDF vector.

    Args:
        input: Passed unchanged to ``preprocess_input``, which preprocesses it
            and returns a term -> count mapping.
        dictionary: Gensim dictionary; its ``token2id`` mapping supplies the
            term ids.
        tfidf_model: Gensim TF-IDF model applied to the bag-of-words vector.

    Returns:
        The result of ``tfidf_model[bow]``, where ``bow`` is the list of
        ``(term_id, count)`` pairs, sorted by id, for the terms known to
        ``dictionary``.
    """
    term_counts = preprocess_input(input)

    # Build the bag-of-words from the counts themselves. Handing the dict to
    # ``dictionary.doc2bow`` would iterate over its keys only, so every term
    # would be counted once and the term frequencies would be lost.
    token2id = dictionary.token2id
    query_bow = sorted(
        (token2id[term], count)
        for term, count in term_counts.items()
        if term in token2id  # terms outside the vocabulary are ignored
    )
    return tfidf_model[query_bow]

# Function to calculate cosine similarity between two vectors
def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity between two sparse vectors.

    The similarity is computed directly from the ``(term_id, weight)`` pairs.
    This avoids building a similarity index whose feature count was taken from
    the number of non-zero entries of ``vector1`` rather than from the
    vocabulary size, which left larger term ids outside the index.

    Args:
        vector1: Iterable of ``(term_id, weight)`` pairs.
        vector2: Iterable of ``(term_id, weight)`` pairs.

    Returns:
        float: The cosine of the angle between both vectors, or 0.0 when
        either vector is empty or has zero norm.
    """
    weights1 = dict(vector1)
    weights2 = dict(vector2)
    if not weights1 or not weights2:
        return 0.0

    # Iterate over the shorter vector; only shared term ids contribute.
    if len(weights2) < len(weights1):
        weights1, weights2 = weights2, weights1
    dot_product = sum(
        weight * weights2.get(term_id, 0.0) for term_id, weight in weights1.items()
    )

    norm1 = sum(weight * weight for weight in weights1.values()) ** 0.5
    norm2 = sum(weight * weight for weight in weights2.values()) ** 0.5
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0

    return float(dot_product / (norm1 * norm2))

In [10]:
# Function to retrieve and rank documents based on cosine similarity scores
def retrieve_and_rank_documents(query_string, dictionary, tfidf_model):
    """Score every .naf document against a query and rank the matches.

    Args:
        query_string: The query, passed unchanged to
            ``calculate_tfidf_vector`` (which preprocesses it).
        dictionary: Gensim dictionary used to vectorize query and documents.
        tfidf_model: Gensim TF-IDF model used to weight query and documents.

    Returns:
        tuple: ``(ranked_documents, similarity_scores)``.
        ``similarity_scores`` maps each file name without its ".naf"
        extension to its cosine similarity, keeping only scores above zero.
        ``ranked_documents`` lists those names from highest to lowest score.
    """
    similarity_scores = {}  # Dictionary to store cosine similarity scores
    query_vector = calculate_tfidf_vector(query_string, dictionary, tfidf_model)

    # Sorted so that documents with equal scores are ranked in a reproducible
    # order instead of whatever order the file system returns.
    # NOTE(review): every call re-reads and re-preprocesses all documents;
    # consider caching the document vectors if many queries are run.
    for document in sorted(os.listdir(xml_files_directory)):
        if not document.endswith('.naf'):
            continue

        document_path = os.path.join(xml_files_directory, document)
        document_tokens = preprocess_text(extract_raw_text(document_path, title=True))

        # The tokens are already preprocessed, so vectorize them directly.
        # Sending them through calculate_tfidf_vector would preprocess (and
        # stem) them a second time, producing terms that need not match the
        # vocabulary the dictionary was built from.
        document_vector = tfidf_model[dictionary.doc2bow(document_tokens)]
        similarity = calculate_cosine_similarity(query_vector, document_vector)

        if similarity > 0:
            similarity_scores[document[:-4]] = similarity  # Remove the ".naf" extension

    ranked_documents = sorted(similarity_scores, key=similarity_scores.get, reverse=True)
    return ranked_documents, similarity_scores

In [11]:
output_filename = "GENSIM-consultas_resultados.tsv"

if os.path.exists(output_filename):
    print("Results in {} already exist".format(output_filename))
else:
    # Write to a scratch file and rename it only after every query has been
    # processed. If a query fails midway, no partial results file is left
    # behind to be mistaken for finished results by the check above.
    partial_filename = output_filename + ".partial"

    with open(partial_filename, "w") as results_file:
        # Iterate over queries (sorted for a reproducible line order)
        for query_file in sorted(os.listdir(queries_directory)):
            if not query_file.endswith('.naf'):
                continue

            query_path = os.path.join(queries_directory, query_file)
            # Pass the raw words: retrieve_and_rank_documents preprocesses its
            # input itself, so preprocessing here would stem the query twice.
            query_text = extract_raw_text(query_path).split()

            ranked_documents, similarity_scores = retrieve_and_rank_documents(query_text, dictionary, tfidf)

            # One line per query: "<query id>\t<doc id>:<score>,<doc id>:<score>,..."
            # [8:-4] drops the first 8 characters and the ".naf" extension of
            # the file name; [8:] drops the first 8 characters of the document
            # name. NOTE(review): presumably an 8-character corpus prefix --
            # confirm against the actual file names.
            result_line = query_file[8:-4] + "\t" + ",".join(
                f"{doc[8:]}:{similarity_scores[doc]:.6f}" for doc in ranked_documents
            )
            results_file.write(result_line + "\n")
            print("finished query {}".format(query_file))

    os.replace(partial_filename, output_filename)
    print("Results written to " + output_filename)

: 

: 

In [None]:
# Import metrics from sklearn
from sklearn.metrics import precision_score, recall_score


def _read_id_lists(tsv_path):
    """Read ``key<TAB>item,item,...`` lines into a ``{key: [item id, ...]}`` dict.

    Only the part of each item before its first ':' is kept. The results file
    stores items as "<doc id>:<score>", and with the suffix left in place the
    retrieved entries could never equal the judged document ids.
    NOTE(review): judgment entries are assumed to be "<doc id>" or
    "<doc id>:<grade>", with every listed document counted as relevant --
    confirm against relevance-judgments.tsv.

    Blank lines are skipped, and a key with nothing after the tab maps to an
    empty list (a query for which no document was retrieved).
    """
    id_lists = {}
    with open(tsv_path, "r") as tsv_file:
        for line in tsv_file:
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            # partition() tolerates a missing or empty item list, where
            # unpacking a split('\t') would raise ValueError.
            key, _, items = line.partition('\t')
            id_lists[key.strip()] = [
                item.split(':', 1)[0].strip()
                for item in items.split(',')
                if item.strip()
            ]
    return id_lists


# Load the relevance judgments from the file
relevance_judgments = _read_id_lists(relevance_judgments_directory)

# Load the query results from the file
query_results = _read_id_lists("GENSIM-consultas_resultados.tsv")

In [None]:
# Define function to calculate Precision at K
def precision_at_k(k, query, query_results, relevance_judgments):
    """Compute Precision@k for one query.

    Entries are compared by document id: anything after the first ':' in an
    entry (a "<doc id>:<score>" result, for instance) is ignored, so scored
    results can match the judged ids.
    NOTE(review): every document listed in the judgments is treated as
    relevant, whatever suffix it carries -- confirm that is intended.

    Args:
        k: Number of top-ranked results to consider.
        query: Key of the query in both mappings.
        query_results: Maps query -> ranked list of result entries.
        relevance_judgments: Maps query -> list of judged-relevant entries.

    Returns:
        float: Number of distinct relevant documents among the top ``k``
        results divided by ``k``. The divisor stays ``k`` even when fewer
        than ``k`` results exist.
    """
    relevant_documents = {doc.split(':', 1)[0] for doc in relevance_judgments[query]}
    retrieved_documents = {doc.split(':', 1)[0] for doc in query_results[query][:k]}

    return len(relevant_documents & retrieved_documents) / k

# Define function to calculate Recall at K
def recall_at_k(k, query, query_results, relevance_judgments):
    """Compute Recall@k for one query.

    Entries are compared by document id: anything after the first ':' in an
    entry (a "<doc id>:<score>" result, for instance) is ignored, so scored
    results can match the judged ids.
    NOTE(review): every document listed in the judgments is treated as
    relevant, whatever suffix it carries -- confirm that is intended.

    Args:
        k: Number of top-ranked results to consider.
        query: Key of the query in both mappings.
        query_results: Maps query -> ranked list of result entries.
        relevance_judgments: Maps query -> list of judged-relevant entries.

    Returns:
        float: Number of distinct relevant documents among the top ``k``
        results divided by the number of relevant documents, or 0.0 when the
        query has no relevant documents.
    """
    relevant_documents = {doc.split(':', 1)[0] for doc in relevance_judgments[query]}
    if not relevant_documents:
        # Nothing to recall; avoid dividing by zero.
        return 0.0
    retrieved_documents = {doc.split(':', 1)[0] for doc in query_results[query][:k]}

    return len(relevant_documents & retrieved_documents) / len(relevant_documents)

In [None]:
# Mean Precision@K and Recall@K over the queries that have relevance judgments
k_values = [1, 3, 5, 10]
for k in k_values:
    precision_scores = []
    recall_scores = []

    for query in query_results:
        # Queries without judgments cannot be evaluated.
        if query not in relevance_judgments:
            continue
        precision_scores.append(precision_at_k(k, query, query_results, relevance_judgments))
        recall_scores.append(recall_at_k(k, query, query_results, relevance_judgments))

    # Report 0.0 when no query could be evaluated.
    mean_precision = sum(precision_scores) / len(precision_scores) if precision_scores else 0.0
    mean_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0.0

    print(f"Precision@{k}: {mean_precision:.4f}")
    print(f"Recall@{k}: {mean_recall:.4f}")

In [None]:
# Define function to calculate NDCG at K
def ndcg_at_k(k, query, query_results, relevance_judgments):
    """Compute NDCG@k for one query using binary relevance.

    Entries are compared by document id: anything after the first ':' in an
    entry (a "<doc id>:<score>" result, for instance) is ignored.
    NOTE(review): relevance is binary (listed in the judgments or not); if the
    judgments carry graded relevance, it is not used here -- confirm that is
    intended.

    Args:
        k: Number of top-ranked results to consider.
        query: Key of the query in both mappings.
        query_results: Maps query -> ranked list of result entries.
        relevance_judgments: Maps query -> list of judged-relevant entries.

    Returns:
        DCG of the top ``k`` results divided by the DCG of an ideal ranking
        that places ``min(k, number of relevant documents)`` relevant
        documents first; 0 when the query has no relevant documents.
    """
    relevant_documents = {doc.split(':', 1)[0] for doc in relevance_judgments[query]}
    top_k_results = [doc.split(':', 1)[0] for doc in query_results[query][:k]]
    relevance_levels = [1 if doc in relevant_documents else 0 for doc in top_k_results]

    def _dcg(levels):
        # Rank 1 is not discounted; rank r >= 2 is discounted by log2(r).
        # Iterating over the list itself copes with fewer than k results.
        return sum(
            level if rank == 1 else level / np.log2(rank)
            for rank, level in enumerate(levels, start=1)
        )

    # Calculate DCG
    dcg = _dcg(relevance_levels)

    # Calculate IDCG from the ideal ranking over ALL relevant documents, not
    # from a re-sorted copy of what was retrieved; otherwise a run that finds
    # a single relevant document at rank 1 would score a perfect 1.0.
    idcg = _dcg([1] * min(k, len(relevant_documents)))

    # Calculate NDCG
    if idcg == 0:
        return 0
    return dcg / idcg

# Mean NDCG@K over the queries that have relevance judgments
k_values = [1, 3, 5, 10]
for k in k_values:
    ndcg_scores = []

    for query in query_results:
        # Queries without judgments cannot be evaluated.
        if query not in relevance_judgments:
            continue
        ndcg_scores.append(ndcg_at_k(k, query, query_results, relevance_judgments))

    # Report 0.0 when no query could be evaluated.
    mean_ndcg = sum(ndcg_scores) / len(ndcg_scores) if ndcg_scores else 0.0

    print(f"NDCG@{k}: {mean_ndcg:.4f}")