## Load data

In [1]:
import warnings
import pandas as pd

from datasets import get_dataset_config_names

warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
for subset in get_dataset_config_names('rungalileo/ragbench'):
    print(subset)

covidqa
cuad
delucionqa
emanual
expertqa
finqa
hagrid
hotpotqa
msmarco
pubmedqa
tatqa
techqa


In [2]:
df = pd.read_parquet(f'hf://datasets/rungalileo/ragbench/techqa')

df['question_id'] = df.index
df['n_relevant'] = df['all_relevant_sentence_keys'].apply(len)

df.sort_values(by=['id', 'n_relevant'], ascending=[False, False], inplace=True)
df.drop_duplicates(subset=['id'], keep='first', inplace=True)

df.head(3)

Unnamed: 0,id,question,documents,response,generation_model_name,annotating_model_name,dataset_name,documents_sentences,response_sentences,sentence_support_information,unsupported_response_sentence_keys,adherence_score,overall_supported_explanation,relevance_explanation,all_relevant_sentence_keys,all_utilized_sentence_keys,trulens_groundedness,trulens_context_relevance,ragas_faithfulness,ragas_context_relevance,gpt3_adherence,gpt3_context_relevance,gpt35_utilization,relevance_score,utilization_score,completeness_score,question_id,n_relevant
490,techqa_TRAIN_Q599,ICC configuration manager error: This implemen...,[ SUBSCRIBE\nYou can track all active APARs fo...,The error is caused by an incompatible Windows...,gpt-3.5-turbo-0125,gpt-4-turbo-2024-04-09,techqa_train,"[[[0a, SUBSCRIBE You can track all active APA...","[[a, The error is caused by an incompatible Wi...",[{'explanation': 'This sentence is supported b...,[],True,The response accurately describes the cause of...,The relevant information pertaining to the pro...,"[4a, 4b, 4c, 4g, 4q, 4r, 4s, 4t, 4u, 4v, 4w, 4...","[4a, 4s, 4q, 4r, 4g, 4y, 4aa, 4ac]",,,,,,,,0.071713,0.031873,0.444444,490,18
385,techqa_TRAIN_Q598,Is Internet Explorer 11 supported by Lombardi ...,[browser; update; event; details; security eve...,"According to the provided information, IBM Bus...",gpt-3.5-turbo-0125,gpt-4-turbo-2024-04-09,techqa_train,"[[[0a, browser; update; event; details; securi...","[[a, According to the provided information, IB...",[{'explanation': 'This sentence partially leve...,"[b, c]",False,The response implies a lack of support for Int...,The question asks whether Internet Explorer 11...,"[3ak, 3am, 3an]","[3ak, 3an]",,,,,,,,0.020408,0.013605,0.666667,385,3
1069,techqa_TRAIN_Q597,How do I change the default 'fit content by' b...,[fit by height; fit by width; fit; default fit...,"To change the default ""fit content by"" behavio...",claude-3-haiku-20240307,gpt-4-turbo-2024-04-09,techqa_train,"[[[0a, fit by height; fit by width; fit; defau...","[[a, To change the default ""fit content by"" be...",[{'explanation': 'This is a general introducto...,[],True,The response is supported by sentences from Do...,Document 0 contains useful information for ans...,"[0b, 0d, 0e, 0f, 0h, 0i, 0j, 0k, 0m, 0n, 0o, 0...","[0d, 0e, 0f, 0h, 0i, 0j, 0k, 0m, 0n, 0o, 0p, 0...",,,,,,,,0.079096,0.073446,0.928571,1069,14


In [3]:
df['n_relevant'].value_counts(bins=25)

(-0.158, 6.28]      238
(6.28, 12.56]       184
(12.56, 18.84]      149
(18.84, 25.12]      129
(25.12, 31.4]        76
(31.4, 37.68]        42
(37.68, 43.96]       30
(43.96, 50.24]       20
(50.24, 56.52]       12
(56.52, 62.8]         9
(62.8, 69.08]         4
(81.64, 87.92]        3
(75.36, 81.64]        2
(69.08, 75.36]        2
(87.92, 94.2]         1
(94.2, 100.48]        1
(100.48, 106.76]      1
(113.04, 119.32]      1
(150.72, 157.0]       1
(106.76, 113.04]      0
(119.32, 125.6]       0
(125.6, 131.88]       0
(131.88, 138.16]      0
(138.16, 144.44]      0
(144.44, 150.72]      0
Name: count, dtype: int64

## Embedder

In [4]:
from pydantic import BaseModel


class EmbedderSettings(BaseModel):
    batch_size: int = 16
    model_name: str
    type_model: str
    dimension: int
    prefix_query: str
    prefix_document: str

In [5]:
e5_embedder_settings = EmbedderSettings(batch_size=16, 
                                     model_name='intfloat/multilingual-e5-base', 
                                     type_model="", 
                                     dimension=768, 
                                     prefix_query="query: {}",
                                     prefix_document="passage: {}")

In [6]:
import abc
from typing import List

import more_itertools
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, XLMRobertaModel, XLMRobertaTokenizer


class IEmbedder(abc.ABC):
    def __init__(self):
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

    @abc.abstractmethod
    def encode(self, sentences: List[str], doc_type: str) -> np.ndarray:
        """Calculate sentences embedding(s)"""


class Embedder(IEmbedder):
    def __init__(self, settings: EmbedderSettings):
        super().__init__()
        self._settings = settings
        self.batch_size = self._settings.batch_size
        self.model_type = self._settings.type_model
        self.prefix_query = self._settings.prefix_query
        self.prefix_document = self._settings.prefix_document

        if self.model_type == 'e5':
            self.model = XLMRobertaModel.from_pretrained(self._settings.model_name).to(self.device)
            self.tokenizer = XLMRobertaTokenizer.from_pretrained(self._settings.model_name)
        else:
            self.model = AutoModel.from_pretrained(self._settings.model_name).to(self.device)
            self.tokenizer = AutoTokenizer.from_pretrained(self._settings.model_name)

    @staticmethod
    def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def encode(self, sentences: List[str], doc_type: str) -> np.ndarray:
        sentences = self.preprocess_sentences(sentences, doc_type)
        embeddings = torch.tensor([]).to(self.device)

        for batch in more_itertools.chunked(sentences, self.batch_size):
            tokenized_batch = self.tokenizer(batch, max_length=512, padding=True,
                                             truncation=True, return_tensors='pt').to(self.device)

            with torch.no_grad():
                outputs = self.model(**tokenized_batch).last_hidden_state
                embed = self.average_pool(outputs, tokenized_batch['attention_mask'])

            torch.cuda.empty_cache()

            for tensor in embed:
                embeddings = torch.cat((embeddings, tensor.unsqueeze(0)), 0)

        return np.array([torch.Tensor.cpu(emb) for emb in F.normalize(embeddings, dim=-1)])

    def preprocess_sentences(self, sentences: List[str], doc_type: str) -> List[str]:
        if doc_type == 'query':
            return [self.prefix_query.format(sentence) for sentence in sentences]
        elif doc_type == 'document':
            return [self.prefix_document.format(sentence) for sentence in sentences]
        return sentences

In [7]:
embedder = Embedder(e5_embedder_settings)

## Vector Database

In [11]:
import yaml
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker


SQLALCHEMY_DATABASE_URL = "postgresql://user:password@localhost:5434/ugragdb"

engine = create_engine(SQLALCHEMY_DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()

In [12]:
import yaml
from pgvector.sqlalchemy import Vector
from sqlalchemy import Column, Date, ForeignKey, Integer, String, Text


dimension = 768

class DocumentDB(Base):
    __tablename__ = "techqa_dataset"

    id = Column(String, primary_key=True, index=True)
    doc_id = Column(String, nullable=False)
    question_id = Column(String, nullable=False)
    text = Column(Text, nullable=False)
    vector = Column(Vector(dimension))

Base.metadata.create_all(bind=engine)

ProgrammingError: (psycopg2.errors.UndefinedObject) type "vector" does not exist
LINE 7:  vector VECTOR(768), 
                ^

[SQL: 
CREATE TABLE techqa_dataset (
	id VARCHAR NOT NULL, 
	doc_id VARCHAR NOT NULL, 
	question_id VARCHAR NOT NULL, 
	text TEXT NOT NULL, 
	vector VECTOR(768), 
	PRIMARY KEY (id)
)

]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [9]:
db = SessionLocal()


for question_id, document_sentences in tqdm(zip(df['question_id'].values, df['documents_sentences'].values), total=len(df)):
    for doc_group in document_sentences:
        for doc in doc_group:
            doc_id = doc[0]
            doc_text = doc[1]
            key_id = f"{question_id}_{doc_id}"

            # Should rewrite this to multiple documents at encoding if one use GPU
            embedding = embedder.encode([doc_text], doc_type="document")[0]

            db_doc = DocumentDB(id=key_id, doc_id=doc_id,
                                  question_id=str(question_id),
                                  text=doc_text,
                                  vector=embedding)
            db.add(db_doc)

    db.commit()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 905/905 [9:07:35<00:00, 36.30s/it]


## ElasticSearch

In [33]:
import logging
from typing import Any, Dict, Iterator, List

import more_itertools
from elasticsearch import Elasticsearch, ElasticsearchException
from elasticsearch.helpers import bulk

logger = logging.getLogger(__name__)


def create_index(index_name: str, es_client: Elasticsearch) -> None:
    mapping: Dict = {
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "doc_id": {"type": "keyword"},
                "question_id": {"type": "keyword"}
            }
        }
    }

    if not es_client.indices.exists(index=index_name):
        es_client.indices.create(index=index_name, body=mapping)
        logger.info(f"Successfully created index {index_name}")


def load(df: pd.DataFrame) -> Iterator[Any]:
    for _, row in df.iterrows():
        try:
            yield generate_document_source(row.to_dict())
        except Exception:
            raise

def generate_document_source(row: Dict) -> Dict:
    result = {
        "id": row['key_id'],
        "text": row['doc_text'],
        "doc_id": row['doc_id'],
        "question_id": row['question_id']
    }

    return result


def update_search(df: pd.DataFrame, es_client: Elasticsearch, batch_size: int = 500) -> None:
    total_inserted_docs: int = 0
    total_errors: int = 0

    for chunk in more_itertools.ichunked(load(df), batch_size):
        bucket_data = []
        for document in chunk:
            cur = {
                "_index": "techqa_data",
                "_source": document,
            }
            if 'id' in document:
                cur['_id'] = str(document['id'])
            bucket_data.append(cur)
        try:
            inserted, errors = bulk(es_client, bucket_data, max_retries=4, raise_on_error=False)
            errors_num = len(errors) if isinstance(errors, list) else errors  # type: ignore
            logger.debug(f"{inserted} docs successfully inserted by bulk with {errors_num} errors")
            total_inserted_docs += inserted
            total_errors += errors_num
            if isinstance(errors, list):  # type: ignore
                for error in errors:  # type: ignore
                    logger.error(f"Doc was not inserted with error: {error}")
        except ElasticsearchException as e:
            logger.exception(f"Error while pushing data to elasticsearch: {e}")
            raise


In [27]:
es_client = Elasticsearch(([{"host": "localhost", "port": 9200}]))

In [18]:
doc_ids, doc_texts, key_ids, question_ids = [], [], [], []


for question_id, document_sentences in tqdm(zip(df['question_id'].values, df['documents_sentences'].values), total=len(df)):
    for doc_group in document_sentences:
        for doc in doc_group:
            doc_ids.append(doc[0])
            doc_texts.append(doc[1])
            key_ids.append(f"{question_id}_{doc[0]}")
            question_ids.append(str(question_id))

fulltext_df = pd.DataFrame({'key_id': key_ids, 'question_id': question_ids, 'doc_id': doc_ids, 'doc_text': doc_texts})

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 905/905 [00:00<00:00, 9127.19it/s]


In [34]:
update_search(fulltext_df, es_client)

## Retrieval

In [87]:
def retrieve_semantic_search(query: str, question_id: str, top_k: int = 40, similarity_threshold: float = 0.8) -> list:
    query_vector = embedder.encode([query], doc_type="query")[0].tolist()

    results = (
        db.query(
            DocumentDB,
            DocumentDB.vector.cosine_distance(query_vector).label("distance"),
        )
        .filter(
            DocumentDB.vector.cosine_distance(query_vector) < similarity_threshold,
            DocumentDB.question_id == str(question_id)
        )
        .order_by("distance")
        .limit(top_k)
        .all()
    )
    return [{"text": result.DocumentDB.text, 
             "doc_id": result.DocumentDB.doc_id,
             "score": result[1]} for result in results]

In [88]:
def retrieve_fulltext_search(query: str, question_id: str, top_k: int = 40) -> list:
    query: Dict = {"query": {
        "bool": {
          "must": [
            {
              "match": {
                "text": query
              }
            },
            {
              "term": {
                "question_id": {
                  "value": str(question_id)
                }
              }
            }
          ]
        }
      }, "size": top_k}
    response: Dict = es_client.search(index="techqa_data", body=query)

    return [{"text": hit["_source"]['text'], 
             "doc_id": hit["_source"]['doc_id'],
             "score": hit["_score"]} for hit in response['hits']['hits']]

In [89]:
semantic_search_results = {}
fulltext_search_results = {}

for question_id, query in tqdm(zip(df['question_id'].values, df['question'].values), total=len(df)):
    semantic_search_results[str(question_id)] = retrieve_semantic_search(query, question_id)

    fulltext_search_results[str(question_id)] = retrieve_fulltext_search(query, question_id)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 905/905 [01:03<00:00, 14.32it/s]


## Reranker

In [150]:
from sentence_transformers.cross_encoder import CrossEncoder

model = CrossEncoder("cross-encoder/stsb-distilroberta-base")

In [179]:
combined_search_results = {}

for question_id in tqdm(df['question_id'].values):
    values = fulltext_search_results[str(question_id)] + semantic_search_results[str(question_id)]

    unique_data = {entry['doc_id']: entry for entry in values}
    docs = list(unique_data.values())

    query = df.loc[df['question_id']==question_id]['question'].values[0]
    corpus = [val['text'] for val in docs]
    
    sentence_combinations = [[query, sentence] for sentence in corpus]
    scores = model.predict(sentence_combinations)
    
    for score, doc in zip(scores, docs):
        doc['ml_score'] = score
    
    combined_search_results[str(question_id)] = sorted(docs, key=lambda x: x['ml_score'], reverse=True)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 905/905 [2:14:35<00:00,  8.92s/it]


In [194]:
model = CrossEncoder("cross-encoder/stsb-roberta-base")

In [195]:
combined_search_results_v1 = {}

for question_id in tqdm(df['question_id'].values):
    values = fulltext_search_results[str(question_id)] + semantic_search_results[str(question_id)]

    unique_data = {entry['doc_id']: entry for entry in values}
    docs = list(unique_data.values())

    query = df.loc[df['question_id']==question_id]['question'].values[0]
    corpus = [val['text'] for val in docs]
    
    sentence_combinations = [[query, sentence] for sentence in corpus]
    scores = model.predict(sentence_combinations)
    
    for score, doc in zip(scores, docs):
        doc['ml_score'] = score
    
    combined_search_results_v1[str(question_id)] = sorted(docs, key=lambda x: x['ml_score'], reverse=True)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 905/905 [55:45<00:00,  3.70s/it]


## Metrics

In [233]:
def calculate_precision_recall(true_relevance: list[str], retrieved_documents: list[str], at_k: int = 10) -> tuple[float, float, float]:
    true_relevance = set(true_relevance)
    retrieved_documents = set(retrieved_documents[:at_k])
    
    true_positives = true_relevance.intersection(retrieved_documents)
    
    precision = len(true_positives) / len(retrieved_documents)
    recall = len(true_positives) / len(true_relevance)
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1_score


def calculate_mrr(true_relevance: list[str], retrieved_documents: list[str], ks: list[int] = [1, 3, 5, 10]) -> dict[int, float]:
    true_relevance = set(true_relevance)
    mrr_scores = []
    
    for k in ks:
        rank_found = 0
        for i, doc_id in enumerate(retrieved_documents[:k], start=1):
            if doc_id in true_relevance:
                rank_found = i
                break
        mrr_scores.append(1 / rank_found if rank_found > 0 else 0)
    
    return mrr_scores


def calculate_metrics(df: pd.DataFrame, search_results: dict, top_k: int = 10) -> None:
    mrr_1, mrr_3, mrr_5, mrr_10 = [], [], [], []
    precisions, recalls, f1s = [], [], []
    
    for question_id, true_relevance in zip(df['question_id'].values, df['all_relevant_sentence_keys'].values):
        if len(true_relevance) == 0:
            continue
    
        relevances = [val['doc_id'] for val in search_results[str(question_id)]]
        
        mrr = calculate_mrr(true_relevance, relevances)
        mrr_1.append(mrr[0]), mrr_3.append(mrr[1]), mrr_5.append(mrr[2]), mrr_10.append(mrr[3])
        
        precision, recall, f1_score = calculate_precision_recall(true_relevance, relevances, top_k)
        precisions.append(precision), recalls.append(recall), f1s.append(f1_score)

    print(f"precision@{top_k}: {round(np.mean(precisions), 3)}, recall@{top_k}: {round(np.mean(recalls), 3)}, f1@{top_k}: {round(np.mean(f1s), 3)}")
    print(f"mrr@1: {round(np.mean(mrr_1), 3)}, mrr@3: {round(np.mean(mrr_3), 3)}, mrr@5: {round(np.mean(mrr_5), 3)}, mrr@10: {round(np.mean(mrr_10), 3)}")
    

In [234]:
calculate_metrics(df, fulltext_search_results)

precision@10: 0.222, recall@10: 0.177, f1@10: 0.169
mrr@1: 0.388, mrr@3: 0.469, mrr@5: 0.489, mrr@10: 0.505


In [235]:
calculate_metrics(df, semantic_search_results)

precision@10: 0.271, recall@10: 0.218, f1@10: 0.207
mrr@1: 0.436, mrr@3: 0.539, mrr@5: 0.556, mrr@10: 0.569


In [236]:
calculate_metrics(df, combined_search_results)

precision@10: 0.199, recall@10: 0.149, f1@10: 0.146
mrr@1: 0.283, mrr@3: 0.369, mrr@5: 0.392, mrr@10: 0.415


In [237]:
calculate_metrics(df, combined_search_results_v1)

precision@10: 0.184, recall@10: 0.141, f1@10: 0.135
mrr@1: 0.298, mrr@3: 0.374, mrr@5: 0.391, mrr@10: 0.412


In [207]:
query = df.loc[df['question_id']==847]['question'].values[0]
query

'How can I configure a DataPower MQ client to use mutual authentication with SSL? I am looking to setup a DataPower MQ Client to use SSL in mutual authentication mode with a Remote WebSphere MQ Manager. '

In [211]:
df.loc[df['question_id']==847]

Unnamed: 0,id,question,documents,response,generation_model_name,annotating_model_name,dataset_name,documents_sentences,response_sentences,sentence_support_information,unsupported_response_sentence_keys,adherence_score,overall_supported_explanation,relevance_explanation,all_relevant_sentence_keys,all_utilized_sentence_keys,trulens_groundedness,trulens_context_relevance,ragas_faithfulness,ragas_context_relevance,gpt3_adherence,gpt3_context_relevance,gpt35_utilization,relevance_score,utilization_score,completeness_score,question_id,n_relevant
847,techqa_TRAIN_Q595,How can I configure a DataPower MQ client to u...,[datapower; mq; websphere mq; ssl; mutual auth...,To configure a DataPower MQ client to use mutu...,gpt-3.5-turbo-0125,gpt-4-turbo-2024-04-09,techqa_train,"[[[0a, datapower; mq; websphere mq; ssl; mutua...","[[a, To configure a DataPower MQ client to use...",[{'explanation': 'This is a general introducto...,[],True,The overall response is supported by Document ...,Document 0 is the primary document of relevanc...,"[0a, 0b, 0c, 0d, 0f, 0g, 0h, 0i, 0j, 0k, 0l, 0...","[0j, 0q, 0r, 0s, 0w, 0ag, 0ah, 0ai, 0aj, 0ak, ...",,,,,,,,0.215385,0.071795,0.333333,847,84


In [210]:
combined_search_results['847']

[{'text': ' The WebSphere DataPower MQ Client can be configured to use SSL in mutual authentication mode with a Remote WebSphere MQ Manager.',
  'doc_id': '0b',
  'score': 0.05614918135707325,
  'ml_score': 0.789327},
 {'text': ' How do I secure communications between a WebSphere Application Server JMS client and a WebSphere MQ server, using SSL repertoires? CAUSE',
  'doc_id': '3b',
  'score': 0.13019458887640944,
  'ml_score': 0.7973819},
 {'text': 'Configure WebSphere MQ Queue Manager (Remote MQ Server) to use SSL authentication (SSLCAUTH as "required") using URL: http://publib.boulder.ibm.com/infocenter/wmqv7/v7r0/topic/com.ibm.mq.explorer.doc/e_ssl.htm [http://publib.boulder.ibm.com/infocenter/wmqv7/v7r0/topic/com.ibm.mq.explorer.doc/e_ssl.htm].  2.',
  'doc_id': '0n',
  'score': 0.10133565680213008,
  'ml_score': 0.7765573},
 {'text': 'In order to utilize server and client authentication a CA signer certificate and a signed personal certificate for each connection you wish to sec