# Rerank Approaches to Improve Retrieval Performance
After an initial retrieval of data, reranking is a crucial step for improving the relevance and quality of the results. Reranking reorders the initially retrieved items using more sophisticated criteria or models.

In [None]:
import torch
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from db_connection import create_db_connection
from io import StringIO


In [11]:
# Open a database connection via the project's db_connection helper; the
# table-setup and data-load cells below run their statements on it.
conn = create_db_connection()

In [None]:
# Database setup: make sure the aidb and pgfs extensions exist, then drop and
# recreate the `books` table so the load below starts from an empty table.
# The statements are executed in order inside one transaction.
setup_statements = (
    "CREATE EXTENSION IF NOT EXISTS aidb cascade;",
    "CREATE EXTENSION IF NOT EXISTS pgfs;",
    "DROP TABLE IF EXISTS books;",
    """CREATE TABLE books (
    bookID INT,
    title TEXT,
    authors TEXT,
    average_rating FLOAT,
    isbn TEXT,
    isbn13 BIGINT,
    language_code TEXT,
    num_pages INT,
    ratings_count INT,
    text_reviews_count INT,
    publication_date DATE,
    publisher TEXT
);""",
)

with conn.cursor() as cur:
    for statement in setup_statements:
        cur.execute(statement)

# Persist the extension/table changes.
conn.commit()

In [None]:
# Read sample_data/books.csv into a pandas DataFrame, skipping malformed lines.
# ISBNs are identifiers, not numbers: parse them as strings so that leading
# zeros are never lost to integer inference before the value is loaded into
# the TEXT column `isbn`.
df = pd.read_csv('sample_data/books.csv', on_bad_lines="skip", dtype={'isbn': str})

In [None]:
# Create the in-memory buffer that the load step serialises the data into
output = StringIO()
# Work on a copy so the DataFrame read from disk stays untouched
df_copy = df.copy()

# Convert data types; values that cannot be parsed become missing values and
# are loaded as NULL.
#
# Columns that are INT/BIGINT in the `books` table use pandas' nullable Int64
# dtype. A plain to_numeric() result turns into float64 as soon as a single
# value is coerced to NaN, and floats are serialised as e.g. "123.0", which
# PostgreSQL rejects for integer columns. Int64 keeps "123" and a real missing
# value side by side. (A genuinely fractional value in one of these columns
# raises here instead of failing later inside COPY.)
integer_columns = ['bookID', 'isbn13', 'num_pages', 'ratings_count', 'text_reviews_count']
for column in integer_columns:
    df_copy[column] = pd.to_numeric(df_copy[column], errors='coerce').astype('Int64')

# average_rating is FLOAT in the table, so a float64 column with NaN is fine
df_copy['average_rating'] = pd.to_numeric(df_copy['average_rating'], errors='coerce')

# Convert publication_date to datetime if it's not already
if not pd.api.types.is_datetime64_any_dtype(df_copy['publication_date']):
    df_copy['publication_date'] = pd.to_datetime(df_copy['publication_date'], errors='coerce')

# No NaN -> None replacement is needed: missing values (NaN, pd.NA, NaT) are
# written with the NULL marker by DataFrame.to_csv(na_rep=...) when the frame
# is serialised, and replacing them with None would only turn the typed
# columns above into object columns.

In [None]:
# Serialise the DataFrame to tab-separated CSV in memory.
# Rewind and clear the buffer first so that re-running this cell cannot append
# a second copy of the rows to whatever the buffer already holds.
output.seek(0)
output.truncate()
df_copy.to_csv(output, sep='\t', header=False, index=False, na_rep='\\N')
output.seek(0)

# to_csv() applies CSV quoting: a field containing a double quote, a tab or a
# newline is wrapped in quotes, with embedded quotes doubled. COPY's default
# text format does not understand that quoting (and treats backslashes as
# escapes), so such titles would be stored with stray quote characters.
# Telling COPY the stream is CSV makes both sides agree. Columns are matched
# by position, so the DataFrame column order must match the table definition.
copy_sql = r"COPY books FROM STDIN WITH (FORMAT csv, DELIMITER E'\t', NULL '\N')"

try:
    with conn.cursor() as cur:
        # Use COPY to insert data
        cur.copy_expert(copy_sql, output)
    conn.commit()
except Exception:
    # Leave the connection usable (not stuck in an aborted transaction) so the
    # cell can be fixed and re-run, then surface the original error.
    conn.rollback()
    raise
else:
    # Load succeeded: this connection is no longer needed.
    conn.close()

In [None]:
# Open a fresh connection (the data-load step closes the one it used), then:
#   1. register the sentence-transformers model under the name 'paraphrase',
#   2. create the retriever 'book_retriever_aidb' over books.title keyed by bookid,
#   3. run aidb.bulk_embedding for that retriever.
conn = create_db_connection()

create_model_sql = """SELECT aidb.create_model('paraphrase', 
                            'bert_local', 
                            '{"model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
                             "revision": "main"}'::JSONB);"""

create_retriever_sql = """SELECT aidb.create_retriever_for_table(
            name => 'book_retriever_aidb',
            model_name => 'paraphrase',
            source_key_column => 'bookid',
            source_table => 'books',
            source_data_column => 'title',
            source_data_type => 'Text');"""

bulk_embedding_sql = "SELECT aidb.bulk_embedding('book_retriever_aidb');"

with conn.cursor() as cur:
    cur.execute(create_model_sql)
    cur.execute(create_retriever_sql)
    cur.execute(bulk_embedding_sql)

# Commit only: the connection is left open because the retrieval cell reuses it.
conn.commit()


In [19]:
# First-stage retrieval: ask the 'book_retriever_aidb' retriever for the
# closest titles to the query and rank them by ascending distance.
#
# The query text and the number of candidates are named values passed as bound
# parameters instead of being baked into the SQL literal, so they can be
# changed in one place without editing (or having to escape text inside) the
# statement.
query = "books with dragon and magic"
top_k = 40  # number of candidates handed to the reranking stage

retrieval_sql = """SELECT
        key AS id,
        value AS title,
        distance,
        rank() OVER (ORDER BY distance ASC) AS rank
    FROM aidb.retrieve_text('book_retriever_aidb', %s, %s);
"""

with conn.cursor() as cur:
    cur.execute(retrieval_sql, (query, top_k))
    embeddings = cur.fetchall()

# One row per retrieved title; `rank` is the position by vector distance.
df_embeddings = pd.DataFrame(embeddings, columns=['id', 'title', 'distance', 'rank'])
    

## Using reranker model
- Retrieve-then-rerank is a well-known practice for improving retriever output.
- One approach is to use a reranking model that scores each retrieved item against the query, so that the most relevant items are ranked first.

In [None]:
# Second stage: score every (query, title) pair with a reranker model loaded
# from the Hugging Face Hub, then order the retrieved titles by that score.
reranker_name = "BAAI/bge-reranker-v2-m3"
tokenizer = AutoTokenizer.from_pretrained(reranker_name)
model = AutoModelForSequenceClassification.from_pretrained(reranker_name)

query = "books with dragon and magic"
titles = df_embeddings['title'].tolist()
# The model is given the query together with each candidate title.
pairs = [[query, title] for title in titles]

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    inputs = tokenizer(
        pairs,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512,
    )
    outputs = model(**inputs, return_dict=True)
    # Flatten the logits to one float score per pair.
    reranking_scores = outputs.logits.view(-1).float()

# Attach the scores to the retrieval results (adds a column in place)
df_embeddings['reranking_score'] = reranking_scores.numpy()

# Highest reranking score first, with a fresh 0..n-1 index
df_embeddings_sorted = (
    df_embeddings
    .sort_values(by='reranking_score', ascending=False)
    .reset_index(drop=True)
)


In [29]:
# Show the ten highest-scoring titles after reranking; the `rank` column still
# holds each title's position from the distance-based retrieval.
df_embeddings_sorted.head(10)

Unnamed: 0,id,title,distance,rank,reranking_score
0,27880,The Girl the Dragon and the Wild Magic (Rhia...,0.656513,4,1.804317
1,23460,Dragon's Treasure,0.739054,10,1.20513
2,34899,The Book of the Dragon,0.517602,1,1.144251
3,17727,The Books of Magic,0.589481,2,0.96578
4,28539,The Magic of Krynn (DragonLance: Tales I #1),0.623367,3,0.635804
5,18127,The Eyes of the Dragon,0.796344,35,0.589727
6,25286,The Dragon Prince: A Chinese Beauty & the Beas...,0.804866,39,0.221783
7,19819,Harrowing the Dragon,0.781455,26,0.175849
8,28532,Realms of Dragons: The Universes of Margaret W...,0.750788,12,0.074729
9,5374,The Dragon's Eye (Dragonology Chronicles #1),0.729348,9,0.02159
