# Exploring Ranking Models in Information Retrieval

## Objective
Understand the practical implementation and differences between the Vector Space Model and the Binary Independence Model in ranking documents relative to a user query.

### Step 1: Data Preprocessing

Ensure that the documents are still loaded and preprocessed from the previous task. The data should be clean and ready for advanced querying.
Write a function to load and preprocess the text documents from a specified directory. This step involves reading each file, converting the text to lowercase for uniform processing, and storing the results in a dictionary.

In [85]:
!pip install beautifulsoup4
!pip install requests



In [86]:
import os
import time
import requests
import re
from collections import defaultdict
from bs4 import BeautifulSoup
import operator

# --- Text normalisation ---------------------------------------------------
# Matches every run of characters that is not an ASCII letter, a digit or
# whitespace; the functions below pass it to re.sub to strip such runs.
regExpresion = r'[^0-9a-zA-Z\s]+'

# --- Corpus location ------------------------------------------------------
# Path components; the loaders join them two directory levels above the
# current working directory.
books_folder = 'week01'
CORPUS_DIR = 'data'
book_list = 'notebooks'
names_file_name = 'libros_nombres.txt'

# --- Shared state filled in by the functions below ------------------------
libros = []            # lines read from the book-name list file
inverted_index = {}    # term -> set of book positions in `libros`
matriz_booleana = []   # one row per query term, one bool per book
matriz_pesos = []      # one row per query term, one count per book

def load_book_names():
    """Read the book-name list file and return its lines.

    The file is looked up two directory levels above the current working
    directory, at ``<books_folder>/<book_list>/<names_file_name>``.

    Returns:
        list[str]: One entry per line of the file, line endings included.
    """
    listing_dir = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir, books_folder, book_list)
    )
    listing_path = os.path.abspath(os.path.join(listing_dir, names_file_name))
    with open(listing_path, 'r', encoding='utf-8') as listing:
        return list(listing)

In [87]:
def obtener_indice_invertido():
    """Populate the global ``inverted_index`` from the files named in ``libros``.

    Each entry of ``libros`` is stripped and resolved two directory levels
    above the current working directory, under
    ``<books_folder>/<CORPUS_DIR>/``. The file text has every match of
    ``regExpresion`` removed, is lowercased and split on whitespace; each
    resulting word is mapped to the set of positions (indices into
    ``libros``) of the books that contain it.

    Files that cannot be opened or decoded are reported on stdout and
    skipped; the remaining books are still indexed.

    Returns:
        None. The result is accumulated in the module-level ``inverted_index``.
    """
    print("Obteniendo índice invertido...")

    corpus_dir = os.path.abspath(
        os.path.join(os.getcwd(), os.pardir, os.pardir, books_folder, CORPUS_DIR)
    )

    for libro_id, libro in enumerate(libros):
        file = os.path.abspath(os.path.join(corpus_dir, libro.strip()))

        # Only the open/read can legitimately fail here. OSError covers a
        # missing or unreadable file; ValueError covers undecodable bytes
        # (UnicodeDecodeError) and invalid path strings. Anything else, such
        # as KeyboardInterrupt or a programming error, must propagate instead
        # of being reported as a missing file.
        try:
            with open(file, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, ValueError):
            print(f"Error al abrir el archivo {file}. No existe o no se puede leer.")
            continue

        content = re.sub(regExpresion, '', content).lower()
        # A set avoids re-adding the same book id for every repeated word.
        for word in set(content.split()):
            inverted_index.setdefault(word, set()).add(libro_id)

In [88]:
def obtener_matriz_booleana(tokens, cols):
    """Rebuild the global term/book incidence matrix for ``tokens``.

    One row is produced per entry of ``tokens`` (in order), each holding
    ``cols`` booleans. Position ``j`` of a row is True when ``j`` is in the
    posting set stored for that token in ``inverted_index``; tokens missing
    from the index yield an all-False row. The finished matrix is printed.

    Args:
        tokens: Query terms, one matrix row each.
        cols: Number of columns (books) per row.

    Returns:
        None. The matrix is stored in the module-level ``matriz_booleana``.
    """
    global matriz_booleana
    matriz_booleana = []

    for term in tokens:
        row = [False] * cols
        for book_pos in inverted_index.get(term, ()):
            row[book_pos] = True
        matriz_booleana.append(row)

    print(matriz_booleana)

### Step 2: Vector Space Model (VSM)

Task: Implement a simple Vector Space Model using term frequency.

Requirements:
* _Document and Query Representation:_ Convert each document and the query into a vector where each dimension corresponds to a term from the corpus. Use simple term frequency for weighting.
* _Cosine Similarity Calculation:_ Calculate the cosine similarity between the query vector and each document vector.
* _Ranking:_ Rank the documents based on their cosine similarity scores from highest to lowest.

In [121]:

def rankearLibroUniGrama(book, token):
    """Return the term frequency of ``token`` in one book file.

    The book is resolved two directory levels above the current working
    directory, under ``<books_folder>/<CORPUS_DIR>/``. Its text has every
    match of ``regExpresion`` removed, is lowercased and split on
    whitespace, and the occurrences of the lowercased ``token`` are counted.

    Lowercasing mirrors what the inverted index and the query parser do.
    Without it, a term that appears only capitalised in the text would be
    marked as present by the index but counted as zero here.

    Args:
        book: File name of the book; surrounding whitespace is stripped.
        token: Term to count; compared case-insensitively.

    Returns:
        int: Number of occurrences, or 0 when the file cannot be opened or
        decoded (a message is printed in that case).
    """
    file = os.path.abspath(
        os.path.join(
            os.getcwd(), os.pardir, os.pardir, books_folder, CORPUS_DIR, book.strip()
        )
    )

    # Keep the try body to the I/O only. OSError covers a missing or
    # unreadable file; ValueError covers undecodable bytes and invalid path
    # strings. Other exceptions are real bugs and must not be hidden.
    try:
        with open(file, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError):
        print(f"Error al abrir el archivo {file}. No existe o no se puede leer.")
        return 0

    words = re.sub(regExpresion, '', content).lower().split()
    return words.count(token.lower())

def obtener_matriz_rankeada(tokens):
    """Rebuild the global term-frequency matrix for ``tokens``.

    The matrix has one row per token and one column per entry of
    ``libros``, initialised to 0. For every cell whose counterpart in
    ``matriz_booleana`` is truthy, the value is replaced by the result of
    ``rankearLibroUniGrama(book, token)``. Both matrices are printed.

    Args:
        tokens: Query terms, in the same order used to build
            ``matriz_booleana``.

    Returns:
        None. The matrix is stored in the module-level ``matriz_pesos``.
    """
    global libros
    global matriz_pesos
    global matriz_booleana

    book_count = len(libros)
    matriz_pesos = [[0] * book_count for _ in range(len(tokens))]

    print("matriz booleana: ", matriz_booleana)

    for term_idx, term in enumerate(tokens):
        weights = matriz_pesos[term_idx]
        for book_idx, book in enumerate(libros):
            if not matriz_booleana[term_idx][book_idx]:
                continue
            weights[book_idx] = rankearLibroUniGrama(book, term)

    print("matriz pesos: ", matriz_pesos)

In [96]:
def busqueda_matricial_con_operadores():
    """Prompt for a boolean query and return the books that satisfy it.

    The query is read from stdin, lowercased, and stripped of every
    character that is not a word character, whitespace or a parenthesis.
    Words other than ``and``, ``or`` and ``not`` are the search terms. The
    boolean and weight matrices are rebuilt for those terms, then the
    expression is evaluated once per book with each term replaced by its
    presence flag for that book.

    Returns:
        list[str]: Names (trailing newline removed) of the books for which
        the expression is true. An empty list is returned, after printing a
        message, when the expression is not a valid boolean expression.
    """
    print("Realizando búsqueda matricial con operadores...")
    print("ingrese las palabras a buscar separadas por espacios junto los operadores AND, OR, NOT (recuerde que NOT debe precederse con un OR o un AND)")

    query = input("Ingrese la expresion a buscar (ej: juan and pedro or not zapato): ")
    query = query.lower()
    # Parentheses are kept so grouping survives; previously they were
    # stripped here, which made the parenthesis branch of the tokenizer dead.
    query = re.sub(r'[^\w\s()]', '', query)
    tokens = re.findall(r'\b(?!and\b|or\b|not\b)\w+\b', query)
    # Words (terms and operators alike) and parentheses, in query order.
    content = re.findall(r'\b\w+\b|[()]', query)

    cols = len(libros)

    obtener_matriz_booleana(tokens, cols)

    obtener_matriz_rankeada(tokens)

    found_books = []

    for i in range(cols):
        # Presence flag of every term for book i.
        token_map = {name: row[i] for name, row in zip(tokens, matriz_booleana)}
        expression = ' '.join(str(token_map.get(part, part)) for part in content)

        # NOTE(security): eval() on user input. After the substitution above
        # the string can only contain True, False, and, or, not and
        # parentheses, and builtins are disabled, but a dedicated boolean
        # parser would be safer if this ever handles untrusted queries.
        try:
            result = eval(expression, {"__builtins__": {}}, {})
        except (SyntaxError, TypeError):
            # Empty query, missing operator, unbalanced parentheses, etc.
            print("La expresión ingresada no es válida.")
            return []

        if result:
            found_books.append(libros[i].rstrip("\n"))

    print("Los libros que cumplen con la expresión son:", found_books)

    return found_books

### Step 3: Binary Independence Model (BIM)

Task: Implement a basic Binary Independence Model to rank documents.

Requirements:
* _Binary Representation:_ Represent the corpus and the query in binary vectors (1 if the term is present, 0 otherwise).
* _Probability Estimation:_ Assume arbitrary probabilities for the presence of each term in relevant and non-relevant documents.
* _Relevance Scoring:_ Calculate the relevance score for each document based on the product of probabilities for terms present in the query.
* _Ranking:_ Rank the documents based on their relevance scores from highest to lowest.

In [122]:

# Driver cell: load the book list, index the corpus, then run one
# interactive boolean query.
libros = load_book_names()
obtener_indice_invertido()
found_books = busqueda_matricial_con_operadores()


Obteniendo índice invertido...
Error al abrir el archivo c:\Users\AlexanderSaavedra\Documents\Repositorio_Politecnica\RI\Codigo\ir24a\week01\data\Spoon River Anthology.txt. No existe o no se puede leer.
Error al abrir el archivo c:\Users\AlexanderSaavedra\Documents\Repositorio_Politecnica\RI\Codigo\ir24a\week01\data\Tractatus LogicoPhilosophicus.txt. No existe o no se puede leer.
Error al abrir el archivo c:\Users\AlexanderSaavedra\Documents\Repositorio_Politecnica\RI\Codigo\ir24a\week01\data\Calculus Made Easy.txt. No existe o no se puede leer.
Realizando búsqueda matricial con operadores...
ingrese las palabras a buscar separadas por espacios junto los operadores AND, OR, NOT (recuerde que NOT debe precederse con un OR o un AND)
[[False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 