## Context Specific Quote Recommendation from Historical Text

### Downloading dependencies

In [None]:
!pip install transformers



### Importing Dependencies

In [None]:
import transformers
import pandas as pd
import numpy as np
import torch
import json
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
from transformers import BertForQuestionAnswering
from transformers import DistilBertForQuestionAnswering
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from sklearn.metrics import f1_score

### Setting the hyperparameters

In [None]:
# Number of passes over the training DataLoader performed by the fine-tuning loop.
epochs = 20

### Downloading Dataset

In [None]:
#!wget -L 'https://raw.githubusercontent.com/agnik2019/cn_project/main/qutus_dataset.json'

### Reading the dataset into a dataframe

In [None]:
# Read the JSON dataset into a DataFrame.
# NOTE(review): this is an absolute path, and the file name differs from the one in the
# commented-out download command above — confirm which file is the intended input.
qutus_dataset_df = pd.read_json('/content/fixed_data_4.json')

#### Example of Dataset

In [None]:
# Preview the first three rows of the dataset.
qutus_dataset_df.head(3)

Unnamed: 0,answers,title,context,question,id
0,"{'answer_start': [401], 'text': [['when the Un...",<OPTION>,What a different speech this could have been. ...,The presentation opened with a series of banal...,1
1,"{'answer_start': [191], 'text': [['When the ma...",<OPTION>,The presentation opened with a series of banal...,"“When the Union was turned back at Bull Run, a...",2
2,"{'answer_start': [112], 'text': [['Again, we a...",<OPTION>,"While more dark days will come, things are get...","While more dark days will come, things are get...",3


In [None]:
# Report how many rows the dataset contains.
print(f'Total Data Instances: {len(qutus_dataset_df)}')

Total Data Instances: 898


### Splitting the dataset into training and test dataset

In [None]:
# Hold out 20% of the rows as the test set. With shuffle=False the rows keep their original
# order, so the test set is the tail of the DataFrame and both parts keep their original
# index labels. NOTE(review): random_state should have no effect while shuffle=False — confirm.
training_dataset, test_dataset = train_test_split(qutus_dataset_df, test_size=0.2, random_state=42, shuffle=False)

In [None]:
# Display the held-out test rows (note that their index labels do not start at 0).
test_dataset

Unnamed: 0,answers,title,context,question,id
718,"{'answer_start': [282], 'text': [['And when hi...",<OPTION>,"open schools for them, and teach them how to r...",He becomes insane when he's under the influenc...,1347
719,"{'answer_start': [282], 'text': [['And when hi...",<OPTION>,"open schools for them, and teach them how to r...",It's certain that civilization must lead to th...,1348
720,"{'answer_start': [282], 'text': [['And when hi...",<OPTION>,"open schools for them, and teach them how to r...",A poll-tax and hut-tax have been imposed on th...,1349
721,"{'answer_start': [282], 'text': [['And when hi...",<OPTION>,"open schools for them, and teach them how to r...","If these imposts were not levied, this race of...",1350
722,"{'answer_start': [282], 'text': [['And when hi...",<OPTION>,"open schools for them, and teach them how to r...",Europeans would find it hard to get servants i...,1351
...,...,...,...,...,...
893,"{'answer_start': [245], 'text': [['There is ha...",<OPTION>,"the same pity and love like you, but I only wi...",In the exercise of brute force there is harm.,1522
894,"{'answer_start': [78], 'text': [['An instituti...",<OPTION>,"public institutions, which should not last a d...",Public institutions shouldn't last a day longe...,1523
895,"{'answer_start': [78], 'text': [['An instituti...",<OPTION>,"public institutions, which should not last a d...",An institution run with the interest of capita...,1524
896,"{'answer_start': [78], 'text': [['An instituti...",<OPTION>,"public institutions, which should not last a d...",This isn't the place to discuss the corruption...,1525


### Preparing questions, answers, and contexts of the Training and Test Datasets

#### Preparing Training Questions, Contexts, and Answers

In [None]:
# Questions and contexts as plain Python lists, in row order.
training_questions = training_dataset["question"].tolist()
training_contexts = training_dataset["context"].tolist()

# Build one {'text', 'answer_start'} dict per row by iterating over the column values
# directly. Looking rows up with `training_dataset['answers'][itr]` is a *label* lookup,
# which only matches the row position while the index happens to be 0..n-1 (it breaks
# as soon as the split is shuffled or the frame is filtered).
training_answers = [
  {'text': raw_answer['text'], 'answer_start': raw_answer['answer_start']}
  for raw_answer in training_dataset['answers']
]

# Kept because the test-set preparation cell reads this value.
len_training_dataset = len(training_dataset)

#### Preparing Test Questions, Contexts, and Answers

In [None]:
# Questions and contexts as plain Python lists, in row order.
test_questions = test_dataset["question"].tolist()
test_contexts = test_dataset["context"].tolist()

# Build one {'text', 'answer_start'} dict per row by iterating over the column values
# directly. This removes the need to offset index labels by the size of the training set,
# which only worked because the unshuffled split left the test rows labelled
# len(training_dataset)..len(dataset)-1.
test_answers = [
  {'text': raw_answer['text'], 'answer_start': raw_answer['answer_start']}
  for raw_answer in test_dataset['answers']
]

#### Helper method to add end-index of the answers

In [None]:
# Helper function to add end-index of the answers
def add_end_idx(answers, contexts):
  """Add an exclusive character end index to every answer dict, in place.

  For each answer the key ``'answer_end'`` is set to
  ``answer_start + len(answer_text)``.

  Args:
    answers: list of dicts with keys ``'text'`` (a list whose first element is the
      answer string, possibly wrapped in further lists) and ``'answer_start'``
      (a list whose first element is the start character offset, or ``None``).
    contexts: list of context strings, parallel to ``answers``. Only used to bound
      the iteration (answers beyond ``len(contexts)`` are left untouched).

  Returns:
    None. The dicts inside ``answers`` are mutated.
  """
  for answer, _context in zip(answers, contexts):
    answer_text = answer['text'][0]
    # The dataset preview shows 'text' stored as a nested list ([['...']]). Taking len()
    # of the inner list would count its elements, not characters, so unwrap down to the
    # actual string first.
    while isinstance(answer_text, (list, tuple)):
      answer_text = answer_text[0] if answer_text else ""

    answer_start = answer['answer_start']
    # A missing start offset is treated as the beginning of the context.
    start_idx = 0 if answer_start is None else answer_start[0]

    # Write to the dict that was passed in rather than to a module-level list, so the
    # helper works for any answers list (training or test).
    answer['answer_end'] = start_idx + len(answer_text)

In [None]:
# Add the 'answer_end' key to every training answer dict (the dicts are modified in place).
add_end_idx(training_answers, training_contexts)

In [None]:
# Helper function to add end-index of the answers
def add_end_idx(answers, contexts):
  """Add an exclusive character end index to every answer dict, in place.

  For each answer the key ``'answer_end'`` is set to
  ``answer_start + len(answer_text)``.

  Args:
    answers: list of dicts with keys ``'text'`` (a list whose first element is the
      answer string, possibly wrapped in further lists) and ``'answer_start'``
      (a list whose first element is the start character offset, or ``None``).
    contexts: list of context strings, parallel to ``answers``. Only used to bound
      the iteration (answers beyond ``len(contexts)`` are left untouched).

  Returns:
    None. The dicts inside ``answers`` are mutated.
  """
  for answer, _context in zip(answers, contexts):
    answer_text = answer['text'][0]
    # The dataset preview shows 'text' stored as a nested list ([['...']]). Taking len()
    # of the inner list would count its elements, not characters, so unwrap down to the
    # actual string first.
    while isinstance(answer_text, (list, tuple)):
      answer_text = answer_text[0] if answer_text else ""

    answer_start = answer['answer_start']
    # A missing start offset is treated as the beginning of the context.
    start_idx = 0 if answer_start is None else answer_start[0]

    # Write to the dict that was passed in rather than to a module-level list, so the
    # helper works for any answers list (training or test).
    answer['answer_end'] = start_idx + len(answer_text)

In [None]:
# Add the 'answer_end' key to every test answer dict (the dicts are modified in place).
add_end_idx(test_answers, test_contexts)

#### Creating the encodings of the Train and Test Dataset

In [None]:
# Load the fast DistilBERT tokenizer for the 'distilbert-base-uncased' checkpoint.
# The encodings it produces are later queried with char_to_token in add_token_positions.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Tokenize the training (context, question) pairs, context first, with truncation and
# padding enabled.
train_encodings = tokenizer(training_contexts, training_questions, truncation=True, padding=True)

In [None]:
# Tokenize the test (context, question) pairs, context first, with truncation and
# padding enabled.
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)

#### Creating the token positions

In [None]:
def add_token_positions(encodings, answers, truncated_position=None):
    """Convert character-level answer spans into token-level start/end positions.

    The encodings object is updated in place with two new entries,
    ``'start_positions'`` and ``'end_positions'``, each a list with one token index per
    answer.

    Args:
        encodings: tokenizer output that provides ``char_to_token(batch_index, char_index)``
            (returning ``None`` when the character maps to no token) and ``update(dict)``.
        answers: list of dicts with ``'answer_start'`` (a list whose first element is the
            start character offset, or ``None``) and ``'answer_end'`` (exclusive end
            character offset).
        truncated_position: token index stored for both start and end when the answer's
            start character has no token (e.g. the answer was truncated away). Defaults
            to ``tokenizer.model_max_length`` of the notebook-level tokenizer.

    Returns:
        None.
    """
    if truncated_position is None:
        # Resolved lazily so the notebook-level tokenizer is only needed when the caller
        # does not supply a value.
        truncated_position = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        answer_start = answer['answer_start']
        # Use the first start offset, the same element add_end_idx uses to derive
        # 'answer_end'; a missing offset is treated as the beginning of the context.
        start_char = answer_start[0] if answer_start else 0
        end_char = answer['answer_end']

        start_token = encodings.char_to_token(i, start_char)
        if start_token is None:
            # The start of the answer has no token, so no part of the span can be
            # located reliably: mark both ends with the sentinel position.
            start_positions.append(truncated_position)
            end_positions.append(truncated_position)
            continue

        # 'answer_end' is exclusive, so the last answer character is end_char - 1.
        # Walk backwards past characters without a token (whitespace, truncated tail),
        # never going before the start character so the loop always terminates.
        end_token = None
        probe = end_char - 1
        while end_token is None and probe >= start_char:
            end_token = encodings.char_to_token(i, probe)
            probe -= 1
        if end_token is None:
            # Empty span: fall back to a single-token answer.
            end_token = start_token

        start_positions.append(start_token)
        end_positions.append(end_token)

    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [None]:
# Add 'start_positions' / 'end_positions' to the training encodings (updated in place).
add_token_positions(train_encodings, training_answers)

In [None]:
# Add 'start_positions' / 'end_positions' to the test encodings (updated in place).
add_token_positions(test_encodings, test_answers)

#### Dataset class that feeds the model from the encodings

In [None]:
class QuotusDataset(torch.utils.data.Dataset):
    """Dataset wrapper that serves tokenizer encodings one example at a time."""

    def __init__(self, encodings):
        # Keep a reference to the encodings object; nothing is copied.
        self.encodings = encodings

    def __len__(self):
        # One example per entry of the encodings' input_ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every encoding field and wrap it in a tensor.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        return example

In [None]:
# Wrap the training encodings so they can be consumed by a DataLoader.
encoded_train_dataset = QuotusDataset(train_encodings)

In [None]:
# Wrap the test encodings so they can be consumed by a DataLoader.
encoded_test_dataset = QuotusDataset(test_encodings)

#### Initialization and Training of the model

In [None]:
# Load DistilBERT with a question-answering head from the base checkpoint. The recorded
# output below reports that the qa_outputs weights are newly initialized, i.e. the head
# still has to be trained.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [None]:
# Selection of device GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model parameters to the selected device
model.to(device)

# AdamW optimizer with decoupled weight decay, which helps to limit overfitting.
# torch.optim.AdamW replaces the deprecated transformers.AdamW, whose default
# weight_decay of 0.0 meant no weight decay was actually applied.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

# Creation of training dataloader from the training dataset
train_loader = DataLoader(encoded_train_dataset, batch_size=16, shuffle=True)

for epoch in range(epochs):
    # Enable training behavior at the start of every epoch
    model.train()
    loop = tqdm(train_loader, leave=True)
    # The description only depends on the epoch, so set it once per epoch
    loop.set_description(f'Epoch {epoch}')
    for batch in loop:
        optim.zero_grad()

        # Move the batch to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Passing the gold start/end positions makes the model return the training loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)

        loss = outputs.loss
        loss.backward()
        optim.step()

        # Show the loss of the current batch in the progress bar
        loop.set_postfix(loss=loss.item())


Epoch 0: 100%|██████████| 45/45 [00:25<00:00,  1.78it/s, loss=1.12]
Epoch 1: 100%|██████████| 45/45 [00:25<00:00,  1.74it/s, loss=0.399]
Epoch 2: 100%|██████████| 45/45 [00:26<00:00,  1.68it/s, loss=0.131]
Epoch 3: 100%|██████████| 45/45 [00:27<00:00,  1.61it/s, loss=0.0836]
Epoch 4: 100%|██████████| 45/45 [00:28<00:00,  1.56it/s, loss=0.0499]
Epoch 5: 100%|██████████| 45/45 [00:28<00:00,  1.59it/s, loss=0.0397]
Epoch 6: 100%|██████████| 45/45 [00:28<00:00,  1.59it/s, loss=0.155]
Epoch 7: 100%|██████████| 45/45 [00:28<00:00,  1.59it/s, loss=0.0188]
Epoch 8: 100%|██████████| 45/45 [00:28<00:00,  1.59it/s, loss=0.00615]
Epoch 9: 100%|██████████| 45/45 [00:28<00:00,  1.59it/s, loss=0.0213]
Epoch 10: 100%|██████████| 45/45 [00:28<00:00,  1.59it/s, loss=0.0014]
Epoch 11: 100%|██████████| 45/45 [00:28<00:00,  1.59it/s, loss=0.00198]
Epoch 12: 100%|██████████| 45/45 [00:28<00:00,  1.58it/s, loss=0.734]
Epoch 13: 100%|██████████| 45/45 [00:28<00:00,  1.59it/s, loss=0.0136]
Epoch 14: 100%|█████

#### Saving the Model

In [None]:
# Save the fine-tuned model and its tokenizer into the same directory.
# The path is relative to the notebook's working directory.
model_path = 'models/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('models/distilbert-custom/tokenizer_config.json',
 'models/distilbert-custom/special_tokens_map.json',
 'models/distilbert-custom/vocab.txt',
 'models/distilbert-custom/added_tokens.json',
 'models/distilbert-custom/tokenizer.json')

In [None]:
# Tolerance (in token positions) used by calculate_accuracy when comparing a predicted
# position with the true one.
error_range = 10
# Within-batch position of every evaluated example; appended to by calculate_accuracy.
index = []

# True and predicted start token positions; appended to by calculate_accuracy.
actual_starts = []
predicted_starts = []

# True and predicted end token positions; appended to by calculate_accuracy.
actual_ends = []
predicted_ends = []

In [None]:
def calculate_accuracy(s_pred, s_true, e_pred, e_true):
  """Return the fraction of examples in a batch whose span is predicted within tolerance.

  A prediction counts as correct when both the start and the end token position satisfy
  ``true - error_range <= pred < true + error_range`` (the window is half-open, so a
  prediction exactly ``error_range`` above the true position is not counted).

  Side effects: for every example, the within-batch index and the true/predicted
  start/end positions are appended to the module-level lists ``index``,
  ``actual_starts``, ``predicted_starts``, ``actual_ends`` and ``predicted_ends``.

  Args:
    s_pred: 1-D integer tensor of predicted start positions.
    s_true: 1-D integer tensor of true start positions.
    e_pred: 1-D integer tensor of predicted end positions.
    e_true: 1-D integer tensor of true end positions.

  Returns:
    The share of correct predictions in the batch; 0 for an empty batch.
  """
  length_of_tensor = len(s_pred)
  if length_of_tensor == 0:
    return 0

  correct_predictions = 0
  for i in range(length_of_tensor):
    # Convert once to plain ints; comparing ints is much cheaper than testing a tensor
    # for membership in a range object.
    start_true, start_pred = s_true[i].item(), s_pred[i].item()
    end_true, end_pred = e_true[i].item(), e_pred[i].item()

    index.append(i)
    actual_starts.append(start_true)
    predicted_starts.append(start_pred)
    actual_ends.append(end_true)
    predicted_ends.append(end_pred)

    start_ok = -error_range <= start_pred - start_true < error_range
    end_ok = -error_range <= end_pred - end_true < error_range
    if start_ok and end_ok:
      correct_predictions += 1

  return correct_predictions / length_of_tensor
      

In [None]:
# Turning on the eval mode of the model
model.eval()

# Creating the test data loader from test dataset
test_loader = DataLoader(encoded_test_dataset, batch_size=16)
acc = []

# calculate_accuracy appends to these module-level lists; empty them first so that
# re-running this cell does not accumulate duplicate entries.
for accumulator in (index, actual_starts, predicted_starts, actual_ends, predicted_ends):
    accumulator.clear()

loop = tqdm(test_loader)

# No gradients are needed during evaluation
with torch.no_grad():
    for batch in loop:
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)

        # the predicted position is the token with the highest logit
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # per-batch accuracy for the combined start/end prediction
        acc.append(calculate_accuracy(start_pred, start_true, end_pred, end_true))



100%|██████████| 12/12 [00:01<00:00,  7.17it/s]


In [None]:
# Compute the accuracy over all evaluated examples from the positions recorded by
# calculate_accuracy, using the same half-open tolerance window. Averaging the per-batch
# accuracies would over-weight a smaller final batch, and deriving the value from `acc`
# itself would make this cell fail when it is run a second time (acc would no longer be
# a list).
num_examples = len(actual_starts)
num_correct = sum(
    (-error_range <= start_hat - start_ref < error_range)
    and (-error_range <= end_hat - end_ref < error_range)
    for start_ref, start_hat, end_ref, end_hat
    in zip(actual_starts, predicted_starts, actual_ends, predicted_ends)
)
acc = num_correct / num_examples if num_examples else 0.0
print(f'Accuracy: {acc}')

Accuracy: 0.828125


In [None]:
# Display every recorded true start position.
# NOTE(review): this prints the whole list; a slice such as actual_starts[:20] would keep
# the output short.
actual_starts

[60,
 60,
 60,
 60,
 60,
 60,
 60,
 60,
 60,
 60,
 60,
 60,
 60,
 90,
 90,
 90,
 90,
 90,
 90,
 90,
 90,
 90,
 90,
 61,
 61,
 61,
 61,
 61,
 61,
 100,
 100,
 100,
 100,
 100,
 100,
 100,
 152,
 152,
 152,
 152,
 152,
 152,
 152,
 178,
 178,
 178,
 178,
 178,
 178,
 178,
 178,
 178,
 178,
 178,
 178,
 178,
 178,
 178,
 135,
 135,
 135,
 135,
 135,
 135,
 135,
 135,
 135,
 135,
 135,
 135,
 27,
 27,
 27,
 21,
 21,
 21,
 95,
 95,
 95,
 95,
 95,
 1,
 1,
 1,
 1,
 1,
 202,
 202,
 202,
 202,
 202,
 202,
 202,
 202,
 202,
 202,
 202,
 202,
 202,
 202,
 81,
 81,
 81,
 81,
 81,
 81,
 81,
 81,
 81,
 12,
 12,
 9,
 9,
 9,
 59,
 59,
 59,
 59,
 59,
 59,
 59,
 59,
 59,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 512,
 37,
 37,
 37,
 37,
 37,
 37,
 37,
 37,
 37,
 12,
 12,
 12,
 16,
 16,
 16,
 16,
 1,
 1,
 1,
 1,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 115,
 3,
 3,
 184,
 184,
 184,
 184,
 184,
 184,
 184,
 56,
 56,
 16,
 16,
 16,
 16,
 60,
 60,
 60,
 60,
 60,
 60,
