In [185]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [186]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, util, models, evaluation
from transformers import AutoTokenizer, AutoModel
import torch
import nltk
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sentence_transformers import losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import csv
import os
from zipfile import ZipFile
import random

In [187]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [188]:
## https://huggingface.co/nlpaueb/legal-bert-base-uncased
## https://www.sbert.net/examples/applications/computing-embeddings/README.html?highlight=autotokenizer


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask



#Sentences we want sentence embeddings for
sentences = ['If a company agrees to pay travel cost for a job interview, is the promise binding and enforceable?',
             'Is a job offer letter sent and accepted by email, legally binding?']

#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
cos = torch.nn.CosineSimilarity(dim=0)
output = cos(sentence_embeddings[0], sentence_embeddings[1])
print(output)

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor(0.8943)


# Step 1: Use Sentence-BERT on Quora Duplicate Question

In [189]:
import csv
from post_parser_record import PostParserRecord

def read_tsv_test_data(file_path):
  # Takes in the file path for test file and generate a dictionary
  # of question id as the key and the list of question ids similar to it
  # as value. It also returns the list of all question ids that have
  # at least one similar question
  dic_similar_questions = {}
  lst_all_test = []
  with open(file_path) as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        question_id = int(row[0])
        lst_similar = list(map(int, row[1:]))
        dic_similar_questions[question_id] = lst_similar
        lst_all_test.append(question_id)
        lst_all_test.extend(lst_similar)
  return dic_similar_questions, lst_all_test

dic_similar_questions, lst_all_test = read_tsv_test_data("duplicate_questions.tsv")
post_reader = PostParserRecord("Posts_law.xml")

In [190]:
from sentence_transformers import SentenceTransformer, util
import torch

# in question one, we are using the pre-trained model on quora with no further fine-tuning
model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

# list of text to be indexed (encoded)
corpus = []
# this dictionary is used as key: corpus index [0, 1, 2, ...] and value: corresponding question id
index_to_question_id = {}
idx = 0

# indexing all the questions in the law stack exchange -- only using the question titles
for question_id in post_reader.map_questions:
  question = post_reader.map_questions[question_id]
  text = question.title
  q_id = question.post_id
  corpus.append(text)
  index_to_question_id[idx] = question_id
  idx += 1

# Indexing (embedding) the 
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

lst_test_question_ids = list(dic_similar_questions.keys())
top_k = 100

for question_id in lst_test_question_ids:
  query_text = post_reader.map_questions[question_id].title
  query_embedding = model.encode(query_text, convert_to_tensor=True)

  # We use cosine-similarity and torch.topk to find the highest 5 scores
  cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
  top_results = torch.topk(cos_scores, k=top_k)
  #scores_dic = {}
  for score, idx in zip(top_results[0], top_results[1]):
    index = int(idx)
    #scores_dic[index] = score
    # printing question id and similarity score
    #print(index_to_question_id[index], "(Score: {:.4f})".format(score))

Batches:   0%|          | 0/756 [00:00<?, ?it/s]

In [191]:
def p_at_1(dic_similar_questions, the_model, corp_embeddings):
  p_at_1_list = []
  sum = 0
  #p_at_1 = 0
  for question_id in lst_test_question_ids:
    if question_id in dic_similar_questions:
      query_text = post_reader.map_questions[question_id].title
      query_embedding = the_model.encode(query_text, convert_to_tensor=True)
      cos_scores = util.cos_sim(query_embedding, corp_embeddings)[0]
      if len(corp_embeddings) > 99:
        k = 100
      else:
        k = len(corp_embeddings)
      top_results = torch.topk(cos_scores, k=k)

      top_indices = top_results[1].tolist()
      for j, index in enumerate(top_indices):
        if index_to_question_id[index] in dic_similar_questions[question_id]:
          sum = 1
        else:
          sum = 0
        
      p_at_1_list.append(sum)
  
  p_at_1_avg = np.mean(p_at_1_list)
  print("P@1:")
  print(sum, "matches")
  print("Average: ", p_at_1_avg)

In [192]:
def mrr(dic_similar_questions, the_model, corp_embeddings):
  mrr_list = []

  for question_id in lst_test_question_ids:
    if question_id in dic_similar_questions:
      query_text = post_reader.map_questions[question_id].title
      query_embedding = the_model.encode(query_text, convert_to_tensor=True)
      cos_scores = util.cos_sim(query_embedding, corp_embeddings)[0]
      if len(corp_embeddings) > 99:
        k = 100
      else:
        k = len(corp_embeddings)
      top_results = torch.topk(cos_scores, k=k)

      top_indices = top_results[1].tolist()
      ranks = []
      for j, index in enumerate(top_indices):
        if index_to_question_id[index] in dic_similar_questions[question_id]:
          ranks.append(1 / (j+1))
        
      if len(ranks) > 0:
        mrr_list.append(np.mean(ranks))
      else:
        mrr_list.append(0)
  mrr_avg = np.mean(mrr_list)
  print("MRR Average: ", mrr_avg)

In [193]:
p_at_1(dic_similar_questions, model, corpus_embeddings)

P@1:
0 matches
Average:  0.0


In [194]:
mrr(dic_similar_questions, model, corpus_embeddings)

MRR Average:  0.12613650434278634


# Question 2

In [195]:
import csv
from post_parser_record import PostParserRecord
import re

def read_tsv_test_data(file_path):
  # Takes in the file path for test file and generate a dictionary
  # of question id as the key and the list of question ids similar to it
  # as value. It also returns the list of all question ids that have
  # at least one similar question
  dic_similar_questions = {}
  lst_all_test = []
  with open(file_path) as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        question_id = int(row[0])
        lst_similar = list(map(int, row[1:]))
        dic_similar_questions[question_id] = lst_similar
        lst_all_test.append(question_id)
        lst_all_test.extend(lst_similar)
  return dic_similar_questions, lst_all_test


def train_model(lst_sentences):
  model = FastText(sg=1,vector_size=100,window=5,min_n=1)
  model.build_vocab(lst_sentences)
  model.train(lst_sentences, total_examples=len(lst_sentences), epochs=10)
  model.save("my_model.model")

  return model



duplicate_file = "duplicate_questions.tsv"
post_file = "Posts_law.xml"
dic_similar_questions, lst_all_test = read_tsv_test_data(duplicate_file)
post_reader = PostParserRecord(post_file)

lst_training_sentences = []
for question_id in post_reader.map_questions:
  if question_id in lst_all_test:
    continue
  question = post_reader.map_questions[question_id]
  title = question.title
  body = question.body
  # Collect sentences here
  title_sentences = nltk.sent_tokenize(question.title)
  body_sentences = nltk.sent_tokenize(question.body)

  titles = []
  bodies = []
  for sentence in title_sentences:
        sentence = re.sub('<[^<]+?>', '', sentence)
        words = nltk.word_tokenize(sentence)
        titles.append(words)
  for sentence in body_sentences:
        sentence = re.sub('<[^<]+?>', '', sentence)
        words = nltk.word_tokenize(sentence)
        bodies.append(words)
  lst_training_sentences.extend(titles)
  lst_training_sentences.extend(bodies)
  
  lst_answers = question.answers

  if lst_answers is not None:
    for answer in lst_answers:
      answer_body = answer.body
      answer_sentences = nltk.sent_tokenize(answer_body)
      answers = []

      for sentence in answer_sentences:
        sentence = re.sub('<[^<]+?>', '', sentence)
        words = nltk.word_tokenize(sentence)
        answers.append(words)
      lst_training_sentences.extend(answers)

In [196]:
print(dic_similar_questions)

{3526: [1837], 3815: [3526], 4043: [1985], 4182: [4161], 4231: [1985], 4593: [4483], 5033: [5034], 5645: [657], 5943: [4429], 6061: [887], 6113: [5130], 6726: [6510], 7610: [1334], 7705: [1330], 8264: [178], 8564: [4845], 9103: [9097], 9488: [56], 11532: [1037], 13275: [5614], 14725: [9109], 14874: [5749], 14879: [14878], 15035: [12081], 15279: [15177], 15337: [4285], 15374: [3494], 15985: [15986], 16272: [14273], 16380: [13118], 8154: [4800], 17753: [16615], 17959: [9412], 18975: [5252], 19564: [15895], 21607: [21424], 22637: [5089], 22743: [7446], 23063: [5010], 23581: [22237], 23776: [13482], 24128: [6263], 24759: [19526], 25348: [25330], 25838: [15055], 26078: [23692], 26292: [1049], 26018: [1049], 26420: [217], 26523: [26477], 27093: [27084, 9412], 27588: [8687], 27595: [8455], 27976: [27965], 28014: [27919], 28111: [1523], 28232: [28139], 28628: [14383], 29231: [22562], 5422: [5421], 30410: [1369], 30632: [24302], 27333: [6934], 30915: [247], 30927: [29001], 31065: [9620], 31077:

# Split into training/testing data

In [197]:
def generate_negative_samples(dic, lst_of_ids):
  neg_samples = {}
  for id in dic:
    cnt = len(dic[id])
    neg_samples[id] = []
    while cnt > 0:
      rand_id = random.choice(lst_of_ids)
      if rand_id not in dic[id]:
        neg_samples[id].append(rand_id)
        cnt -= 1
  return neg_samples

In [198]:
negative_ids = generate_negative_samples(dic_similar_questions, list(dic_similar_questions.keys()))

In [199]:
def split_train_test(dic_similar_questions, negative_ids):
  train_keys = []
  test_keys = []
  training_data = []
  testing_data = {}
  cnt = 0
  for id in dic_similar_questions:
      if cnt < int(len(dic_similar_questions) * 0.9):
          train_keys.append(id)
          key_text = post_reader.map_questions[id].title
          similar_id = dic_similar_questions[id][0]
          value_text = post_reader.map_questions[similar_id].title
          
          training_data.append((key_text, value_text, 1))

          similar_id = negative_ids[id][0]
          value_text = post_reader.map_questions[similar_id].title
          training_data.append((key_text, value_text, 0))
      else:
          test_keys.append(id)
          testing_data[id] = dic_similar_questions[id][0]
      cnt += 1
  return training_data, testing_data, train_keys, test_keys

In [202]:
training_data, testing_data, train_keys, test_keys = split_train_test(dic_similar_questions, negative_ids)

In [203]:
model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

batch_size = 16 
num_epochs = 10

In [204]:
def train_model(model, training_data):
  multiple_neg_rank_loss = []
  constrative_loss = []
  
  for data in training_data:
      constrative_loss.append(InputExample(texts=[data[0], data[1]], label=data[2]))
      if data[2] == 1:
          multiple_neg_rank_loss.append(InputExample(texts=[data[0], data[1]], label=1))
          multiple_neg_rank_loss.append(InputExample(texts=[data[1], data[0]], label=0))  # if A is a duplicate of B, then B is a duplicate of A
  
  neg_rank_loss_dataloader = DataLoader(SentencesDataset(multiple_neg_rank_loss, model=model), batch_size=batch_size)
  neg_rank_train_loss = losses.MultipleNegativesRankingLoss(model)
  
  constrative_loss_dataLoader = DataLoader(SentencesDataset(constrative_loss, model=model), shuffle = True, batch_size=batch_size)
  constrative_train_loss = losses.OnlineContrastiveLoss(model=model, distance_metric=util.pytorch_cos_sim, margin=.5)
  
  model.fit(train_objectives=[(neg_rank_loss_dataloader, neg_rank_train_loss), (constrative_loss_dataLoader, constrative_train_loss)],
            epochs=num_epochs,
            warmup_steps=1000
            )

In [205]:
train_model(model, training_data)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

In [206]:
corpus.clear()
for question_id in post_reader.map_questions:
  if question_id in train_keys:
    question = post_reader.map_questions[question_id]
    text = question.title
    q_id = question.post_id
    corpus.append(text)
    index_to_question_id[idx] = question_id
    idx += 1

In [207]:
corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

# Lets see how the model performs trained on the Stack Exchange data

In [208]:
print(len(test_keys))

29


In [209]:
p_at_1(dic_similar_questions, model, corpus_embeddings)

P@1:
0 matches
Average:  0.0


In [210]:
mrr(dic_similar_questions, model, corpus_embeddings)

MRR Average:  0.00047281323877068556


# Question 3

In [211]:
corpus.clear()
for question_id in post_reader.map_questions:
  if question_id in test_keys:
    question = post_reader.map_questions[question_id]
    text = question.title
    q_id = question.post_id
    corpus.append(text)
    index_to_question_id[idx] = question_id
    idx += 1

In [212]:
p_at_1(dic_similar_questions, model, corpus_embeddings)

P@1:
0 matches
Average:  0.0


In [213]:
mrr(dic_similar_questions, model, corpus_embeddings)

MRR Average:  0.00047281323877068556
