# Evaluation
In order to experiment with new models, we first need a way to evaluate the results we achieved.

In [1]:
import pandas as pd

# We're using a document retriever class, to make it easier to perform the evaluator code on different models
class DocumentRetriever:
    """Common base class for all retrievers, so the evaluator can treat every model alike.

    The constructor loads the document collection and the dev queries; subclasses
    provide the actual ranking logic in `retrieve_documents`.
    """

    def __init__(self):
        # Both frames are read once here and are available to every subclass
        self.all_docs = pd.read_csv("data/all_docs.csv")
        self.all_queries = pd.read_csv("data/dev_queries.csv")

    def retrieve_documents(self, query_number, n):
        """Return the n best recommendations (ordered by decreasing relevance) for the given query.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        # `assert Exception(...)` can never fail (an exception instance is truthy), so a
        # missing override used to return None silently. Raise explicitly instead.
        raise NotImplementedError("Function not implemented in subclass")

In [2]:
class LuceneRetriever(DocumentRetriever):
    """Retriever that replays the Lucene retrievals stored in a CSV file."""

    def __init__(self):
        super().__init__()

        # Raw Lucene retrievals for the dev queries, read once up front
        self.lucene_raw_retrievals = pd.read_csv("data/raw_dev_Lucene_retrievals.csv")

    def retrieve_documents(self, query_number, n):
        """Return the doc numbers of the first n stored rows for `query_number`.

        Rows are taken in file order; presumably that order is Lucene's ranking — TODO confirm.
        """
        retrievals = self.lucene_raw_retrievals
        belongs_to_query = retrievals['Query_number'] == query_number
        return retrievals.loc[belongs_to_query, 'doc_number'].head(n).to_list()

# Sanity check: build the Lucene retriever and display the top 10 doc numbers for one query.
# `luceneRetriever` is a notebook-level variable that the rating cell further down reuses.
luceneRetriever = LuceneRetriever()
results = luceneRetriever.retrieve_documents(1089071, 10)
results

[476602, 432658, 429474, 346632, 122086, 362869, 60461, 417115, 29215, 467667]

In [3]:
class GroundTruthRetriever(DocumentRetriever):
    """Retriever that answers straight from data/dev_data.csv instead of from a model."""

    def __init__(self):
        super().__init__()
        self.ground_truth = pd.read_csv("data/dev_data.csv")

    def retrieve_documents(self, query_number, n=None):
        """Return every doc number listed for `query_number` in the ground-truth file.

        `n` exists only to stay call-compatible with the base class and must be left at None.
        """
        # The truth has no limits: it is never truncated, so a cut-off is rejected
        assert n is None
        # NOTE(review): dev_data.csv also carries a `label` column (the training cell reads it);
        # rows are not filtered by it here — confirm that every listed doc counts as relevant.
        truth = self.ground_truth
        rows_for_query = truth['Query_number'] == query_number
        return truth.loc[rows_for_query, 'doc_number'].to_list()

In [6]:
class ModelRater:
    """Rates a retriever by its average recall and precision over all dev queries."""

    def __init__(self):
        self.ground_truth_retriever = GroundTruthRetriever()
        # One row per query that the retrievers are probed with
        self.probe_data = pd.read_csv("data/dev_queries.csv")

    def single_query_recall_precission(self, query_nr, n, retriever):
        """Compute recall and precision of `retriever` for a single query.

        Args:
            query_nr: query number handed to both the retriever and the ground truth.
            n: number of results requested from `retriever`.
            retriever: object offering `retrieve_documents(query_nr, n)`.

        Returns:
            Tuple `(recall, precision)`. A metric whose denominator would be zero
            (nothing retrieved, or no ground truth for the query) is reported as 0.0
            instead of raising ZeroDivisionError.
        """
        query_results = set(retriever.retrieve_documents(query_nr, n))
        ground_truth = set(self.ground_truth_retriever.retrieve_documents(query_nr))

        # Some of this code is based on https://stackoverflow.com/questions/55952408/how-to-calculate-precision-and-recall-of-2-lists-in-python
        intersect_length = len(query_results & ground_truth)
        precision = intersect_length / len(query_results) if query_results else 0.0
        recall = intersect_length / len(ground_truth) if ground_truth else 0.0

        return recall, precision

    def get_rating(self, retriever: "DocumentRetriever", n=10) -> "tuple[float, float]":
        """Average recall and precision of `retriever` over every query in the probe data.

        Args:
            retriever: the retriever to rate.
            n: number of results requested per query.

        Returns:
            Tuple `(avg_recall, avg_precision)`; `(0.0, 0.0)` when there are no queries.
            The averages are also printed.
        """
        # Imported here because this cell runs before the notebook cell that imports tqdm,
        # which otherwise breaks a fresh "Restart & Run All".
        from tqdm import tqdm

        query_numbers = self.probe_data['Query_number']
        query_count = len(query_numbers)

        precision_sum = 0.0
        recall_sum = 0.0
        # For every query in probe data, determine the results
        for query_nr in tqdm(query_numbers, total=query_count):
            recall, precision = self.single_query_recall_precission(query_nr, n, retriever)
            precision_sum += precision
            recall_sum += recall

        # Guard the averages so an empty probe set does not divide by zero
        avg_precision = precision_sum / query_count if query_count else 0.0
        avg_recall = recall_sum / query_count if query_count else 0.0

        print(f"Recall: {avg_recall}, Precision: {avg_precision}")

        return avg_recall, avg_precision

# Shared rater instance used by all rating cells; constructing it reads the ground truth
# and the dev queries from disk.
rater = ModelRater()

In [7]:
# Rate the Lucene baseline with the default cut-off (n=10)
recall, precision = rater.get_rating(luceneRetriever)

  0%|          | 0/644 [00:00<?, ?it/s]

Recall: 0.32744769762105247, Precision: 1.0


It's strange that the precision is always exactly 1.0. This is probably because the ground truth (`dev_data.csv`) is based on the Lucene retrievals (`raw_dev_Lucene_retrievals.csv`).

## BM25 retrieval
Another lexical search, which was mentioned in the SBERT-Documentation.

We first define a helper function that returns the query text for a given query number.

In [8]:
# Dev queries, loaded once at notebook level; query_nr_to_text reads this frame as a global
all_queries = pd.read_csv('data/dev_queries.csv')
def query_nr_to_text(query_nr):
    """Return the 'Query' text of the first row in the global `all_queries` whose 'Query_number' equals `query_nr`.

    Fails with an IndexError when no row matches.
    """
    matching_texts = all_queries.loc[all_queries['Query_number'] == query_nr, 'Query']
    return matching_texts.iat[0]
# Quick check of the helper on one dev query (the value is shown as the cell output)
query_nr_to_text(1099178)

'how does sperm develop'

In [9]:
# We also compare the results to lexical search (keyword search). Here, we use
# the BM25 algorithm which is implemented in the rank_bm25 package.

from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
# from tqdm.notebook import tqdm # or autonotebook?
from tqdm import tqdm
import numpy as np

class Bm25Retriever(DocumentRetriever):
    """Lexical retriever that scores every document with BM25 (rank_bm25's BM25Okapi)."""

    def __init__(self):
        super().__init__()

        # reverse_index[i] holds the doc number of the i-th tokenized document, so the
        # positional ids BM25 works with can be mapped back to doc numbers.
        self.reverse_index = []
        self.tokenized_corpus = []
        # Use the frame the base class already loaded instead of reading all_docs.csv twice
        for _, row in tqdm(self.all_docs.iterrows(), total=self.all_docs.shape[0]):
            passage = str(row['doc_text'])
            doc_nr = row['doc_number']
            self.tokenized_corpus.append(self.bm25_tokenizer(passage))
            self.reverse_index.append(doc_nr)

        self.bm25 = BM25Okapi(self.tokenized_corpus)

    @staticmethod
    def bm25_tokenizer(text):
        """Lower-case `text`, split it on whitespace, strip surrounding punctuation and drop stop words.

        Returns:
            List of the remaining non-empty tokens, in their original order.
        """
        stripped = (token.strip(string.punctuation) for token in text.lower().split())
        return [token for token in stripped
                if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS]

    ##### BM25 search (lexical search) #####
    def search(self, query_text, n):
        """Return the doc numbers of the `n` best BM25 matches for `query_text`, best first.

        `n` is clamped to the corpus size; a non-positive `n` yields an empty list.
        """
        # This code is based on the code from sbert-doc.ipynb
        bm25_scores = self.bm25.get_scores(self.bm25_tokenizer(query_text))

        # Without the clamp, n == 0 selected the whole corpus (`[-0:]` is a full slice)
        # and n > len(corpus) made np.argpartition raise.
        n = min(n, len(bm25_scores))
        if n <= 0:
            return []

        # argpartition finds the n highest scores without sorting the whole array;
        # only those n candidates are then ordered by score.
        top_n = np.argpartition(bm25_scores, -n)[-n:]
        ranked = sorted(top_n, key=lambda idx: bm25_scores[idx], reverse=True)
        return [self.reverse_index[idx] for idx in ranked]

    def retrieve_documents(self, query_number, n):
        """Look up the text of `query_number` and return its `n` best BM25 matches."""
        return self.search(query_nr_to_text(query_number), n)

# Build the BM25 index (tokenizes every document, hence the first progress bar) and rate it
# with the same rater as the Lucene baseline.
bm25Retriever = Bm25Retriever()
recall, precision = rater.get_rating(bm25Retriever)

  0%|          | 0/126203 [00:00<?, ?it/s]

  0%|          | 0/644 [00:00<?, ?it/s]

Recall: 0.24225742435777975, Precision: 0.9601671103223895


## MsMarco
Let's see what precision the pretrained MsMarco bi-encoder and cross-encoder achieve. This code is based on code from the `sbert-doc.ipynb` notebook.

In [19]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
import pandas as pd
from tqdm import tqdm

class BiCrossRetriever(DocumentRetriever):
    """Retrieve candidates with a pretrained bi-encoder, then re-rank them with a cross-encoder.

    Args:
        max_passages: maximum number of documents to index. The default of 51 keeps the
            small debugging subset this notebook was run with; pass None to index the
            whole collection (slow: every passage is encoded up front).
    """

    def __init__(self, max_passages=51):
        super().__init__()
        print("Super inited")

        if not torch.cuda.is_available():
            print("Warning: No GPU found. Please add GPU to your notebook")

        # We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
        # NOTE(review): this checkpoint's name suggests it is tuned for dot-product scoring, while
        # util.semantic_search is called with its default score function below — confirm.
        self.bi_encoder = SentenceTransformer('msmarco-bert-base-dot-v5')
        self.bi_encoder.max_seq_length = 256     # Truncate long passages to 256 tokens
        self.top_k = 32                          # Number of passages we want to retrieve with the bi-encoder

        # The bi-encoder retrieves top_k documents. We use a cross-encoder to re-rank that list to improve the quality
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2')

        # passages[i] and reverse_indices[i] describe the same document: its (truncated)
        # text and its doc number.
        self.passages = []
        self.reverse_indices = []
        for _, row in tqdm(self.all_docs.iterrows(), total=self.all_docs.shape[0]):
            if max_passages is not None and len(self.passages) >= max_passages:
                # Only index a subset (used when debugging)
                break
            first_words = " ".join(str(row['doc_text']).split()[:500])  # Only take the first 500 words
            self.passages.append(first_words)
            self.reverse_indices.append(row['doc_number'])

        print("Passages:", len(self.passages))

        # We encode all passages into our vector space. This takes about 5 minutes (depends on your GPU speed)
        self.corpus_embeddings = self.bi_encoder.encode(self.passages, convert_to_tensor=True, show_progress_bar=True)

    def get_bi_encoder_hits(self, query):
        """Return the bi-encoder's `top_k` hits for `query` as produced by util.semantic_search."""
        question_embedding = self.bi_encoder.encode(query, convert_to_tensor=True)
        # Put the query on whatever device holds the corpus embeddings; an unconditional
        # .cuda() crashed on machines without a GPU.
        question_embedding = question_embedding.to(self.corpus_embeddings.device)
        hits = util.semantic_search(question_embedding, self.corpus_embeddings, top_k=self.top_k)
        return hits[0]  # Get the hits for the first (and only) query

    def retrieve_documents(self, query_nr, n):
        """Return the doc numbers of the `n` best passages for `query_nr`, best first.

        At most `top_k` results can come back, because only the bi-encoder's hits are re-ranked.
        """
        # Find potentially relevant passages with bi_encoder
        query = query_nr_to_text(query_nr)
        hits = self.get_bi_encoder_hits(query)

        # Re-rank those matches with the cross-encoder
        cross_inp = [[query, self.passages[hit['corpus_id']]] for hit in hits]
        cross_scores = self.cross_encoder.predict(cross_inp)

        # Sort results by the cross-encoder scores
        for hit, cross_score in zip(hits, cross_scores):
            hit['cross-score'] = cross_score
        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

        return [self.reverse_indices[hit['corpus_id']] for hit in hits[0:n]]

# Build the bi-encoder/cross-encoder retriever and rate it with the shared rater.
# NOTE(review): the constructor stops collecting passages after a small debugging subset,
# so the numbers printed below are not representative of the full collection.
biCrossRetriever = BiCrossRetriever()
recall, precision = rater.get_rating(biCrossRetriever)

Super inited


  0%|          | 51/126203 [00:00<00:54, 2317.35it/s]


Passages: 51


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 644/644 [00:47<00:00, 13.70it/s]

Recall: 0.0003867636647012301, Precision: 0.0013975155279503104





  0%|          | 0/644 [06:30<?, ?it/s]


KeyboardInterrupt: 

# Training
Some additional experiments. Be sure to take a look at `sbert-doc.ipynb` before this.

This is the same as the previous example, but using the MsMarco dataset.

In [10]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2" # Since GPU 0 won't work on my laptop
# import torch
# available_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]
# available_gpus

import torch
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os

# NOTE(review): CUDA_VISIBLE_DEVICES is normally only honoured when set before CUDA is
# initialised in the process; if an earlier cell already used the GPU this may have no effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
# Fall back to the CPU instead of failing later when no GPU is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Training configuration: base model, batch size, epochs and a timestamped output directory
model_name = 'msmarco-distilbert-base-tas-b'
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_continue_training-' + model_name + '-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name, device=device)

# The datasets are read and wrapped in a DataLoader further down
logging.info("Read the train dataset")

from sentence_transformers import InputExample
import pandas as pd


def get_sample_list(file_path, test_data=False):
    """Read a CSV file and turn every row into an InputExample of (query, doc_text) plus a label.

    Args:
        file_path: CSV file with 'Query' and 'doc_text' columns, and a 'label' column
            unless `test_data` is set.
        test_data: when True the file has no labels, so one is improvised: 0.99 for the
            first row of a query, multiplied by 0.99 again for every directly following
            row with the same query.

    Returns:
        List of InputExample objects in file order.
    """
    frame = pd.read_csv(file_path)
    examples = []
    running_label = 0.99
    last_query = None
    for _, record in frame.iterrows():
        query_text = str(record['Query'])
        passage = str(record['doc_text'])

        if test_data:
            # Test data doesn't have a label, so we have to improvise here: a later result
            # for the same query gets a slightly lower score, a new query starts over.
            same_query = last_query == query_text
            running_label = running_label * 0.99 if same_query else 0.99
            last_query = query_text
            label = running_label
        else:
            label = float(record['label'])

        examples.append(InputExample(texts=[query_text, passage], label=label))
    return examples


# Build the example lists. The test file has no labels, so get_sample_list improvises them
# (test_data=True).
train_samples = get_sample_list("./data/training_data.csv")
dev_samples = get_sample_list("./data/dev_data.csv")
test_samples = get_sample_list("./data/test_data.csv", test_data=True)

# Convert the training examples to a shuffled DataLoader and train with a cosine-similarity loss
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Development set: Measure correlation between cosine score and gold labels
logging.info("Read dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training. The dev evaluator is passed to fit() below together with
# evaluation_steps=1000, so evaluation is not skipped here.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of all training steps (batches per epoch * epochs) for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model; the result is written to model_save_path
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

# Load the stored model and evaluate its performance on the test dataset.
# NOTE(review): the test labels were improvised by get_sample_list (test_data=True), so this
# score is measured against synthetic labels rather than gold labels.
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)