installing sentence BERT python library

In [None]:
!pip install sentence_transformers
!pip install transformers

In [59]:
import csv
from post_parser_record import PostParserRecord
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, models, util, InputExample, losses, CrossEncoder
from sklearn.model_selection import train_test_split
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import random
from scipy.stats import ttest_ind, ttest_rel


Reading the duplicate questions and xml file (Similar to Assignment-3)

In [36]:
def read_tsv_test_data(file_path, post_reader):
    dic_similar_questions = {}
    lst_all_test = []
    with open(file_path) as fd:
        rd = csv.reader(fd, delimiter="\t", quotechar='"')
        for row in rd:
            question_id = int(row[0])
            lst_similar = [int(qid) for qid in row[1:] if int(qid) in post_reader.map_questions]
            if question_id in post_reader.map_questions and lst_similar:
                dic_similar_questions[question_id] = lst_similar
                lst_all_test.append(question_id)
                lst_all_test.extend(lst_similar)
    return dic_similar_questions, lst_all_test

post_reader = PostParserRecord("Posts_law.xml")
dic_similar_questions, lst_all_test = read_tsv_test_data("duplicate_questions.tsv", post_reader)


Step One: Using pre-trained Quora duplicate question to encode questions and find similar questions

In [37]:
# in question one, we are using the pre-trained model on quora with no further fine-tuning
model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)


corpus = []
idx_to_qid = {}
idx = 0

for question_id in post_reader.map_questions:
    question = post_reader.map_questions[question_id]
    text = question.title
    corpus.append(text)
    idx_to_qid[idx] = question_id
    idx += 1

corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

def find_top_similar_questions(corpus_embeddings, query_embedding, top_k=100):
    return torch.topk(util.cos_sim(query_embedding, corpus_embeddings)[0], k=top_k)[1].tolist()

def calculate_metrics(question_id, top_indices, idx_to_qid, similar_questions_dict):
    for rank, index in enumerate(top_indices):
        if idx_to_qid[index] in similar_questions_dict[question_id]:
            return int(rank == 0), (1 / (rank + 1))
    return 0, 0

def evaluate_corpus(corpus_embeddings, test_idx_to_qid, similar_questions_dict):
    metrics = [calculate_metrics(test_idx_to_qid[index], find_top_similar_questions(corpus_embeddings, corpus_embeddings[index]), test_idx_to_qid, similar_questions_dict) for index in test_idx_to_qid if test_idx_to_qid[index] in similar_questions_dict]
    first_correct_list, reciprocal_ranks = zip(*metrics)
    return np.mean(first_correct_list), np.mean(reciprocal_ranks)

avg_first_correct, avg_mrr = evaluate_corpus(corpus_embeddings, idx_to_qid, dic_similar_questions)

print("Average First Correct:", avg_first_correct)
print("Average Mean Reciprocal Rank:", avg_mrr)

Batches:   0%|          | 0/811 [00:00<?, ?it/s]

Average First Correct: 0.01090909090909091
Average Mean Reciprocal Rank: 0.1264595001545107


Step 2: Fine-tune any Sentence-BERT Model

In [48]:
def create_pairs(dic_similar_questions, lst_all_test):
    positive_pairs = []
    negative_pairs = []
    for key, value in dic_similar_questions.items():
        for v in value:
            positive_pairs.append((key, v))
            negative_pairs.append((key, random.choice([x for x in lst_all_test if x not in value])))
    return positive_pairs, negative_pairs

# Split the data into training (90%) and testing (10%) sets
train_similar_questions, test_similar_questions = train_test_split(list(dic_similar_questions.items()), test_size=0.1, random_state=42)
train_similar_questions = dict(train_similar_questions)
test_similar_questions = dict(test_similar_questions)

# Create positive and negative pairs for training
positive_pairs, negative_pairs = create_pairs(train_similar_questions, lst_all_test)


In [50]:
pretrained_models = ['bert-base-uncased', 'distilbert-base-uncased', 'roberta-base', 'xlnet-base-cased']
best_model = None
best_mrr_score = -1

for model_name in pretrained_models:
    print(f"Training with Pre-trained Model: {model_name}")

    tokenizer = models.Transformer(model_name, max_seq_length=128)
    pooling_model = models.Pooling(tokenizer.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[tokenizer, pooling_model])

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    num_train_steps = len(train_dataloader) * 2
    warmup_steps = int(num_train_steps * 0.1)

    # Fine-tune the model
    model.fit(train_objectives=[(train_dataloader, losses.BatchHardTripletLoss(model))],
              epochs=2,
              warmup_steps=warmup_steps,
              optimizer_params={'lr': 2e-5},
              use_amp=True,
              show_progress_bar=True)

    # Create the test corpus
    test_corpus = {qid: post_reader.map_questions[qid].title for qid in lst_all_test}

    # Compute the corpus embeddings using the model
    corpus_embeddings = model.encode(list(test_corpus.values()), convert_to_tensor=True, show_progress_bar=True)

    # Evaluate the performance of the model on the test set
    avg_first_correct, avg_mrr = evaluate_corpus(corpus_embeddings, test_idx_to_qid, dic_similar_questions)
    print(f"Average First Correct for {model_name}: {avg_first_correct}")
    print(f"Average Mean Reciprocal Rank for {model_name}: {avg_mrr}")

    # If the current model has a higher MRR score, update the best_model and best_mrr_score
    if avg_mrr > best_mrr_score:
        best_model = model
        best_mrr_score = avg_mrr

print(f"Best Model: {best_model.__class__.__name__} with Average Mean Reciprocal Rank: {best_mrr_score}")

# Compute the corpus embeddings using the best model
corpus_embeddings = best_model.encode(list(test_corpus.values()), convert_to_tensor=True, show_progress_bar=True)

# Create a new idx_to_qid and qid_to_idx dictionary for the test set
test_idx_to_qid = {i: qid for i, qid in enumerate(test_corpus.keys())}
test_qid_to_idx = {qid: i for i, qid in enumerate(test_corpus.keys())}

Training with Pre-trained Model: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Average First Correct for bert-base-uncased: 0.0
Average Mean Reciprocal Rank for bert-base-uncased: 0.18612189116189454
Training with Pre-trained Model: distilbert-base-uncased


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Average First Correct for distilbert-base-uncased: 0.0
Average Mean Reciprocal Rank for distilbert-base-uncased: 0.21069340323867544
Training with Pre-trained Model: roberta-base


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Average First Correct for roberta-base: 0.0
Average Mean Reciprocal Rank for roberta-base: 0.2468782852247763
Training with Pre-trained Model: xlnet-base-cased


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Average First Correct for xlnet-base-cased: 0.0
Average Mean Reciprocal Rank for xlnet-base-cased: 0.12048633485113801
Best Model: SentenceTransformer with Average Mean Reciprocal Rank: 0.2468782852247763


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

In [53]:
# Evaluate the performance of the model on the test set (part two)
avg_first_correct, avg_mrr = evaluate_corpus(corpus_embeddings, test_idx_to_qid, dic_similar_questions)
print("Average First Correct:", avg_first_correct)
print("Average Mean Reciprocal Rank:", avg_mrr)


Average First Correct: 0.0
Average Mean Reciprocal Rank: 0.2468782852247763


Part Three: Using only 10% of the training set

In [63]:
train_examples_10_percent, _ = train_test_split(train_examples, train_size=0.1, random_state=42)

best_model_10_percent = SentenceTransformer(modules=[tokenizer, pooling_model])
model.fit(train_objectives=[(train_dataloader_10_percent, losses.BatchHardTripletLoss(model))],
          epochs=2,
          warmup_steps=warmup_steps,
          evaluation_steps=500,
          optimizer_params={'lr': 2e-5},
          use_amp=True,
          show_progress_bar=True)


corpus_embeddings_10_percent = best_model_10_percent.encode(list(test_corpus.values()), convert_to_tensor=True, show_progress_bar=True)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

In [64]:
avg_first_correct_10_percent, avg_mrr_10_percent = evaluate_corpus(corpus_embeddings_10_percent, test_idx_to_qid, dic_similar_questions)
print("Average First Correct (10% Training Set):", avg_first_correct_10_percent)
print("Average Mean Reciprocal Rank (10% Training Set):", avg_mrr_10_percent)

Average First Correct (10% Training Set): 0.0
Average Mean Reciprocal Rank (10% Training Set): 0.12048633485113801


In [65]:
# Calculate MRR for each question in the test set for both models
mrr_full_training_set = [calculate_metrics(test_idx_to_qid[index], find_top_similar_questions(corpus_embeddings, corpus_embeddings[index]), test_idx_to_qid, dic_similar_questions)[1] for index in test_idx_to_qid if test_idx_to_qid[index] in dic_similar_questions]
mrr_10_percent_training_set = [calculate_metrics(test_idx_to_qid[index], find_top_similar_questions(corpus_embeddings_10_percent, corpus_embeddings_10_percent[index]), test_idx_to_qid, dic_similar_questions)[1] for index in test_idx_to_qid if test_idx_to_qid[index] in dic_similar_questions]

# Perform a paired t-test
t_statistic, p_value = ttest_rel(mrr_full_training_set, mrr_10_percent_training_set)

print("Paired t-test statistic:", t_statistic)
print("Paired t-test p-value:", p_value)

alpha = 0.05  # Set a significance level
if p_value < alpha:
    print("The difference in MRR between the models is statistically significant.")
else:
    print("The difference in MRR between the models is not statistically significant.")


Paired t-test statistic: 10.154887438624609
Paired t-test p-value: 8.994995613411595e-21
The difference in MRR between the models is statistically significant.
