In [None]:
!pip install transformers 



In [None]:
import os
import requests
import json
import tensorflow as tf

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory holding the SQuAD JSON files; file names are appended to it when the splits are read.
# NOTE(review): this is an absolute path at the filesystem root, not under the
# /content/drive mount point — confirm the files really live here.
path_data = '/squad'

## Load SQuAD Dataset

In [None]:
def read_squad(path, num_samples=None, include_plausible=False):
    """Flatten a SQuAD-format JSON file into parallel context/question/answer lists.

    One entry is produced per answer, so a question with several reference
    answers contributes several entries (sharing the same context and question).

    Args:
        path: Path to the SQuAD JSON file.
        num_samples: Maximum number of entries to return; ``None`` returns all.
        include_plausible: When False (default) only ``qa['answers']`` is read,
            so questions whose answer list is empty contribute nothing. When
            True, a question that carries a ``'plausible_answers'`` field
            contributes those answers instead.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists, where
        each answer is the raw answer dict taken from the file.
    """
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # The original code picked this key but never used it; it is now
                # honoured only when the caller explicitly opts in.
                if include_plausible and 'plausible_answers' in qa:
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                for answer in qa[access]:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    # Slicing with None keeps everything, so the default returns the full set.
    return contexts[:num_samples], questions[:num_samples], answers[:num_samples]

In [None]:
# Cap on how many (context, question, answer) entries are kept from each file.
num_samples = 2000*16

# Read the training and development files with the same cap.
train_contexts, train_questions, train_answers = read_squad(
    f"{path_data}/train-v2.0.json", num_samples
)
val_contexts, val_questions, val_answers = read_squad(
    f"{path_data}/dev-v2.0.json", num_samples
)

In [None]:
# Sanity check: slicing the first context at its recorded answer offset should
# reproduce the answer text. The offset is read from the data instead of being hard-coded.
_first_answer = train_answers[0]
_first_start = _first_answer['answer_start']
train_contexts[0][_first_start:_first_start+len(_first_answer['text'])], train_questions[0], len(train_answers)

('in the late 1990s', 'When did Beyonce start becoming popular?', 32000)

## Prepare Dataset

In [None]:
from transformers import DistilBertTokenizerFast
# Load the uncased DistilBERT vocabulary. The preprocessing step asks the tokenizer
# for character offsets (return_offsets_mapping=True), which is why the *Fast* variant is used.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# Number of entries in the loaded tokenizer's vocabulary (quick sanity check).
len(tokenizer.vocab)

30522

In [None]:
class Sample:
    """One question/context pair and, optionally, its answer span.

    ``preprocess`` tokenizes the pair with the module-level ``tokenizer`` and
    converts the character-level answer span into token indices.

    Attributes:
        question, context: Raw strings as read from the dataset.
        start_char_idx: Answer start as a character offset into the raw context.
        answer_text: Raw answer string, or None when no answer is supplied.
        all_answers: Optional collection of reference answers, stored as given.
        skip: Set to True by ``preprocess`` when the sample cannot be used.
        start_token_idx, end_token_idx: Inclusive token span of the answer,
            -1 until ``preprocess`` has located it.
    """

    def __init__(self, question, context, start_char_idx=None, answer_text=None, all_answers=None):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        # Previously accepted but silently dropped.
        self.all_answers = all_answers
        self.skip = False
        self.start_token_idx = -1
        self.end_token_idx = -1

    @staticmethod
    def _normalized_char_idx(text, char_idx):
        """Map an offset in ``text`` to the offset in its whitespace-collapsed form."""
        # Leading whitespace of the answer itself disappears when it is normalized.
        while char_idx < len(text) and text[char_idx].isspace():
            char_idx += 1
        prefix = text[:char_idx]
        collapsed = " ".join(prefix.split())
        # A single separator space precedes the target only when real text came
        # before it and the raw prefix ended in whitespace.
        if collapsed and prefix[-1].isspace():
            return len(collapsed) + 1
        return len(collapsed)

    def preprocess(self, max_length=256):
        """Tokenize the pair and locate the answer span in token space.

        Args:
            max_length: Pairs that tokenize to more than this many tokens are
                rejected; accepted pairs are padded to exactly this length.

        Returns:
            ``(input_ids, attention_mask, start_token_idx, end_token_idx)``, or
            None when the pair is too long or the answer cannot be located
            (``self.skip`` is set to True in those cases).
        """
        raw_context = str(self.context)
        context = " ".join(raw_context.split())
        question = " ".join(str(self.question).split())

        # First pass only measures the length so over-long pairs are dropped
        # rather than truncated (truncation could cut the answer away).
        if len(tokenizer(context, question, truncation=True).input_ids) > max_length:
            self.skip = True
            return None
        tokenized_context = tokenizer(context, question, max_length=max_length, truncation=True,
                                      padding='max_length', return_offsets_mapping=True)

        if self.answer_text is not None:
            answer = " ".join(str(self.answer_text).split())
            # The stored offset refers to the raw context; collapsing whitespace
            # shifts everything that follows a multi-character whitespace run.
            start_char_idx = self._normalized_char_idx(raw_context, self.start_char_idx)
            end_char_idx = start_char_idx + len(answer)
            # end_char_idx is exclusive, so an answer ending exactly at the end
            # of the context is still valid.
            if end_char_idx > len(context):
                self.skip = True
                return None
            # Offsets of question tokens are relative to the question string and
            # padding/special tokens carry (0, 0); only tokens that belong to the
            # first sequence (the context) may be matched against the answer.
            sequence_ids = tokenized_context.sequence_ids()
            ans_token_idx = [
                idx
                for idx, (tok_start, tok_end) in enumerate(tokenized_context.offset_mapping)
                if sequence_ids[idx] == 0
                and max(tok_start, start_char_idx) < min(tok_end, end_char_idx)
            ]
            if not ans_token_idx:
                self.skip = True
                return None
            self.start_token_idx = ans_token_idx[0]
            self.end_token_idx = ans_token_idx[-1]
        return tokenized_context.input_ids, tokenized_context.attention_mask, self.start_token_idx, self.end_token_idx

In [None]:
# Tokenize every training example and collect the model inputs plus the
# answer's start/end token indices in four parallel lists.
input_ids = []
attention_mask = []
start_positions = []
end_positions = []
for context, question, answer in zip(train_contexts, train_questions, train_answers):
    squad_eg = Sample(question, context, answer['answer_start'], answer['text'])
    features = squad_eg.preprocess()
    # preprocess() returns None for examples it rejects; those are dropped.
    if features is None:
        continue
    ids, mask, start_tok, end_tok = features
    input_ids.append(ids)
    attention_mask.append(mask)
    start_positions.append(start_tok)
    end_positions.append(end_tok)

In [None]:
# Rebind num_samples to the number of examples that survived preprocessing
# (it previously held the cap passed to read_squad).
num_samples = len(input_ids)
num_samples

29094

---

# Fine-tuning DistilBERT

In [None]:
BUFFER_SIZE = num_samples
BATCH_SIZE = 64
SHUFFLE_SEED = 42

# Shuffle once with a fixed order. By default shuffle() draws a new permutation
# every time the dataset is iterated, so a later take()/skip() split would be
# re-drawn on each pass and validation examples would leak into training.
# If per-epoch reshuffling of the training batches is wanted, apply a separate
# shuffle() to the training split after it has been carved out.
data = tf.data.Dataset.from_tensor_slices(
    (input_ids, attention_mask, start_positions, end_positions)
).shuffle(BUFFER_SIZE, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)
data = data.batch(BATCH_SIZE, drop_remainder=False)

In [None]:
# Number of batches as a float (true division); it is truncated with int()
# where the split point is computed.
SIZE = num_samples/BATCH_SIZE
# Fraction of the batches taken for training; the remainder is used for validation.
SPLIT = 0.9
def map_func(input_ids, masks, start_positions, end_positions):
    """Bundle the four positional dataset components into a dict keyed by name."""
    keys = ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
    values = (input_ids, masks, start_positions, end_positions)
    return dict(zip(keys, values))
  
# Turn each batch tuple into a dict keyed by component name.
data = data.map(map_func)

# The first int(SIZE*SPLIT) batches form the training split, the rest validation.
split_at = int(SIZE*SPLIT)
train = data.take(split_at)
val = data.skip(split_at)

# Drop the reference to the unsplit dataset; only train/val are used from here on.
del data

In [None]:
from transformers import TFDistilBertForQuestionAnswering, AdamWeightDecay
# Load the pretrained DistilBERT checkpoint into the TensorFlow question-answering model class.
# NOTE(review): "distilbert-base-uncased" is the base checkpoint, so the span-prediction
# head is presumably freshly initialised here — confirm.
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
# Mark every top-level layer as trainable so the whole network is updated during fine-tuning.
for layer in model.layers:
    layer.trainable = True
# Count of trainable variables, shown as the cell output.
len(model.trainable_variables)

102

In [None]:
# Optimizer used by train_step, with a small learning rate for fine-tuning.
# NOTE(review): no weight_decay_rate is passed — confirm the library default
# actually applies the amount of decay you intend.
optimizer = AdamWeightDecay(learning_rate=1e-5)

In [None]:
@tf.function
def train_step(inp):
    """Run one optimisation step on a batch and return the batch's mean loss.

    Args:
        inp: Dict with 'input_ids', 'attention_mask', 'start_positions' and
            'end_positions' entries for one batch.

    Returns:
        Scalar tensor holding the loss averaged over the batch.
    """
    with tf.GradientTape() as tape:
        outputs = model(
            inp['input_ids'],
            attention_mask=inp['attention_mask'],
            start_positions=inp['start_positions'],
            end_positions=inp['end_positions'],
            training=True,
        )
        # outputs[0] is the loss tensor. It can hold one value per example, so
        # indexing [0] would train on the first example of each batch only;
        # averaging uses every example and is a no-op for a single-element loss.
        loss = tf.reduce_mean(outputs[0])
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss
def val_step(inp):
    """Evaluate one batch without updating weights and return its mean loss.

    Args:
        inp: Dict with 'input_ids', 'attention_mask', 'start_positions' and
            'end_positions' entries for one batch.

    Returns:
        Scalar tensor holding the loss averaged over the batch.
    """
    outputs = model(
        inp['input_ids'],
        attention_mask=inp['attention_mask'],
        start_positions=inp['start_positions'],
        end_positions=inp['end_positions'],
        training=False,
    )
    # Average over the batch instead of reading only the first element of the loss tensor.
    loss = tf.reduce_mean(outputs[0])
    return loss

In [None]:
import time
from tqdm import tqdm
EPOCHS = 5
# Per-epoch metrics as plain Python numbers. The accuracy lists are kept for
# compatibility but are not filled by this loop.
history = {
  "epoch": [],
  "loss": [],
  "Accuracy" :[],
  "val_loss" :[],
  "val_Accuracy" :[]
}
for epoch in range(EPOCHS):
    start = time.time()

    # TRAIN LOOP
    total_loss = 0.0
    train_batches = 0
    loop = tqdm(train, leave=True)
    for inp in loop:
        batch_loss = float(train_step(inp))
        total_loss += batch_loss
        train_batches += 1
        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(loss=batch_loss)

    # VALIDATION LOOP
    val_loss = 0.0
    val_batches = 0
    for inp in val:
        val_loss += float(val_step(inp))
        val_batches += 1

    # Batches are counted explicitly so an empty split yields NaN instead of
    # relying on a loop index that may be undefined or stale.
    mean_loss = total_loss / train_batches if train_batches else float('nan')
    mean_val_loss = val_loss / val_batches if val_batches else float('nan')

    history['epoch'].append(epoch)
    history['loss'].append(mean_loss)
    history['val_loss'].append(mean_val_loss)

    print(f'Epoch {epoch+1}, Loss: {mean_loss:.4f}, Val_Loss: {mean_val_loss:.4f}')
    print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Epoch 1: 100%|██████████| 409/409 [06:11<00:00,  1.10it/s, loss=0.387]


Epoch 1, Loss: 2.0124, Val_Loss: 1.9777
Time taken for 1 epoch: 385.14 secs



Epoch 2: 100%|██████████| 409/409 [06:06<00:00,  1.12it/s, loss=0.441]


Epoch 2, Loss: 1.9975, Val_Loss: 1.9757
Time taken for 1 epoch: 379.44 secs



Epoch 3: 100%|██████████| 409/409 [06:06<00:00,  1.12it/s, loss=3.73]


Epoch 3, Loss: 1.9201, Val_Loss: 1.6532
Time taken for 1 epoch: 379.44 secs



Epoch 4: 100%|██████████| 409/409 [06:05<00:00,  1.12it/s, loss=1.32]


Epoch 4, Loss: 1.8982, Val_Loss: 2.0173
Time taken for 1 epoch: 379.28 secs



Epoch 5: 100%|██████████| 409/409 [06:06<00:00,  1.12it/s, loss=4.02]


Epoch 5, Loss: 1.8527, Val_Loss: 1.9521
Time taken for 1 epoch: 379.38 secs



In [None]:
# Exact-match accuracy of the predicted start and end token positions over the
# validation split. Hits are counted per example so a short final batch does
# not carry the same weight as a full one.
# The loop variables (input_ids, attention_mask, start_positions, end_positions,
# start_pred, end_pred) intentionally keep their names: after the loop they hold
# the last validation batch, which the inspection cells further down read.
n_correct = 0
n_total = 0
for inp in val:
    input_ids = inp['input_ids']
    attention_mask = inp['attention_mask']
    start_positions = inp['start_positions']
    end_positions = inp['end_positions']
    # Only the logits are needed here, so the labels are not passed to the model.
    outputs = model(input_ids, attention_mask=attention_mask)
    start_pred = tf.argmax(outputs['start_logits'], axis=1)
    end_pred = tf.argmax(outputs['end_logits'], axis=1)
    n_correct += int((start_pred.numpy() == start_positions.numpy()).sum())
    n_correct += int((end_pred.numpy() == end_positions.numpy()).sum())
    n_total += 2 * len(start_pred)
# Fraction of start/end positions predicted exactly; NaN when there was nothing to score.
acc = n_correct / n_total if n_total else float('nan')

In [None]:
# Display the validation accuracy computed in the previous cell.
acc

0.561945795194508

In [None]:
# Side-by-side listing of labelled vs. predicted token positions for the batch
# currently held in start_positions / end_positions / start_pred / end_pred.
print("T/F\tstart\tend\n")
for true_start, true_end, pred_start, pred_end in zip(
    start_positions, end_positions, start_pred, end_pred
):
    print(f"true\t{true_start}\t{true_end}\n"
          f"pred\t{pred_start}\t{pred_end}\n")

T/F	start	end

true	60	61
pred	60	61

true	6	201
pred	6	201

true	25	25
pred	25	25

true	26	29
pred	28	29

true	100	101
pred	99	101

true	22	24
pred	22	24

true	36	36
pred	36	36

true	84	86
pred	84	86

true	39	44
pred	39	43

true	76	76
pred	76	80

true	36	38
pred	40	43

true	84	86
pred	111	86

true	89	93
pred	83	84

true	32	40
pred	39	40

true	103	105
pred	103	105

true	14	22
pred	7	12

true	20	20
pred	28	30

true	44	45
pred	44	83

true	183	183
pred	41	41

true	68	68
pred	68	68

true	59	59
pred	59	63

true	82	83
pred	81	83

true	89	91
pred	108	110

true	35	35
pred	35	35

true	17	21
pred	52	55

true	18	18
pred	18	18

true	9	224
pred	55	56

true	17	17
pred	16	17

true	59	62
pred	1	62

true	56	58
pred	56	57

true	2	64
pred	2	2

true	33	33
pred	33	33

true	57	58
pred	56	58

true	62	63
pred	62	63

true	151	152
pred	8	26

true	41	42
pred	41	15

true	31	31
pred	31	31

true	140	144
pred	140	38



In [None]:
# Decode question, labelled answer and predicted answer for every example of the
# batch currently held in input_ids / start_positions / start_pred / end_pred.
sep_id = tokenizer.sep_token_id
pad_id = tokenizer.pad_token_id
for i in range(len(start_positions)):
    ids = [int(t) for t in input_ids[i]]
    # Layout is [CLS] context [SEP] question [SEP] [PAD]...: the question sits
    # between the first separator and the separator that precedes the padding.
    first_sep = ids.index(sep_id)
    # A sequence that fills the whole window has no padding; fall back to its length.
    first_pad = ids.index(pad_id) if pad_id in ids else len(ids)
    true_start, true_end = int(start_positions[i]), int(end_positions[i])
    pred_start, pred_end = int(start_pred[i]), int(end_pred[i])
    print(f"\nQuestion : {tokenizer.decode(ids[first_sep+1:first_pad-1])}")
    # Both answers are decoded the same way so they can be compared directly.
    print(f"Actual answer : {tokenizer.decode(ids[true_start:true_end+1])}")
    print(f"Predicted answer : {tokenizer.decode(ids[pred_start:pred_end+1])}")


Question : how many paper cups are used by americans each year?
Actual answer : 16 billion
Predicted answer : 16 billion

Question : john was deeply suspicious of who?
Actual answer : the barons , particularly those with sufficient power and wealth to potentially challenge the king . numerous barons were subjected to john ' s male ##vo ##lent ##ia , even including william marshal , a famous knight and baron normally held up as a model of utter loyalty . the most infamous case , which went beyond anything considered acceptable at the time , proved to be that of william de bra ##ose , a powerful marche ##r lord with lands in ireland . de bra ##ose was subjected to pun ##itive demands for money , and when he refused to pay a huge sum of 40 , 000 marks ( equivalent to £2 ##6 , 66 ##6 at the time ) , [ n ##b 13 ] his wife and one of his sons were imprisoned by john , which resulted in their deaths . de bra ##ose died in exile in 121 ##1 , and his grandson ##s remained in prison until 121 #