# Answering date questions with a fine-tuned BERT model via Simple Transformers

In [43]:
import torch
import json
import os
import pandas as pd
from datetime import datetime
from simpletransformers.question_answering import QuestionAnsweringModel


In [None]:
# Initiate usage of the GPU for training: pick the compute device and report it.

print(torch.__version__)

cuda_available = torch.cuda.is_available()

# Use the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if cuda_available else "cpu")

if not cuda_available:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

   




## Fine-tuning of pre-trained BERT for QA process using SQuAD 2.0 dataset

In [2]:
# Load the SQuAD 2.0 training file. JSON is UTF-8 by specification, so the encoding
# is given explicitly instead of relying on the platform default.
with open('./data/train-v2.0.json', 'r', encoding='utf-8') as f:
    train_squad = json.load(f)

# Flatten the nested layout (data -> topic -> paragraphs) into one list of
# paragraph entries, which is what is handed to the model for training.
train_data = [
    paragraph
    for topic in train_squad['data']
    for paragraph in topic['paragraphs']
]

In [3]:
# Define the model: training hyper-parameters followed by the QA model itself.

train_args = {
    'learning_rate': 3e-5,
    'optimizer': 'AdamW',
    'adam_epsilon': 1e-8,
    'manual_seed': 101,  # fixed seed so that runs are repeatable
    'num_train_epochs': 2,
    'max_seq_length': 384,  # maximum sequence length the model will support
    'doc_stride': 128,  # stride between chunks when a long document is split up
    'reprocess_input_data': False,
    'train_batch_size': 2,
    'gradient_accumulation_steps': 8,  # presumably an effective batch of 2 * 8 = 16
    'fp16': False,  # True --> fp16 mode requires the NVIDIA Apex library (per the original note)
    'overwrite_output_dir': True,  # if True, the trained model is saved over output_dir='./outputs/'
    'save_steps': 2000,  # save a model checkpoint every 2000 steps; set to -1 to disable
    # NOTE(review): the original comment says TensorBoard events go to a subfolder of
    # runs/ (e.g. runs/Dec02_09-32-58_36d9e58955b0/); confirm that False is an accepted
    # "use the default" value for this option.
    'tensorboard_dir': False,
}

# Only request CUDA when a GPU is actually visible; hard-coding use_cuda=True makes
# model creation fail on CPU-only machines.
model = QuestionAnsweringModel(
    'bert',
    'bert-base-cased',
    use_cuda=torch.cuda.is_available(),
    cuda_device=-1,
    args=train_args,
)



Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

In [4]:
# Fine-tune the model on the flattened SQuAD paragraphs loaded above.
# The recorded output of this cell is a (49371, 0.7569...) tuple; presumably
# (global step, training loss). TODO confirm against the library documentation.
model.train_model(train_data)

convert squad examples to features: 100%|██████████| 130319/130319 [00:37<00:00, 3499.84it/s] 
add example index and unique id: 100%|██████████| 130319/130319 [00:00<00:00, 1076867.54it/s]


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/16457 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/16457 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/16457 [00:00<?, ?it/s]

(49371, 0.7569055093715983)

## Evaluation of model performance on the SQuAD 2.0 dev dataset


In [12]:

# Load the SQuAD 2.0 dev file (UTF-8 JSON; the encoding is explicit so the read does
# not depend on the platform default).
with open('data/dev-v2.0.json', 'r', encoding='utf-8') as f:
    dev_squad = json.load(f)

# Flatten data -> topic -> paragraphs into a single list of paragraph entries.
dev_data = [
    paragraph
    for topic in dev_squad['data']
    for paragraph in topic['paragraphs']
]

# Run the fine-tuned model over the dev paragraphs.
preds, probs = model.predict(dev_data)

# Make sure the folder that will receive the result files exists.
os.makedirs('results', exist_ok=True)




convert squad examples to features: 100%|██████████| 11873/11873 [00:05<00:00, 2305.88it/s]
add example index and unique id: 100%|██████████| 11873/11873 [00:00<00:00, 735474.40it/s]


Running Prediction:   0%|          | 0/1513 [00:00<?, ?it/s]

In [17]:
# Save the evaluation results as an {id: answer} mapping in JSON form.
# NOTE(review): a later cell indexes row['answer'][0], which suggests each
# pred['answer'] is a list of candidates; confirm that the consumer of
# submission.json expects that shape.
submission = {}
for pred in preds:
    submission[pred['id']] = pred['answer']

with open('results/submission.json', 'w') as submission_file:
    json.dump(submission, submission_file)

## Import provided Reg data 

In [39]:
# Read the JSON file that contains the Reg data, already transformed into the layout
# the QA model consumes. The encoding is explicit so the read does not depend on the
# platform default.
with open('sample_data.json', encoding='utf-8') as data_file:
    data_loaded = json.load(data_file)

# Flatten data -> topic -> paragraphs into a single list of paragraph entries.
pred_data = [
    paragraph
    for topic in data_loaded['data']
    for paragraph in topic['paragraphs']
]

## Capturing dates via QA prediction

In [54]:
# Ask the fine-tuned model the date questions for every Reg paragraph.
# n_best_size=1 presumably limits the result to the single best answer per question
# (the recorded probability lists further down hold one value each). TODO confirm.
predictions, raw_outputs = model.predict(pred_data, n_best_size=1)

convert squad examples to features: 100%|██████████| 294/294 [01:17<00:00,  3.79it/s]
add example index and unique id: 100%|██████████| 294/294 [00:00<00:00, 63169.17it/s]


Running Prediction:   0%|          | 0/871 [00:00<?, ?it/s]

In [192]:
# Tabulate the QA outputs: one frame for the answers, one for their probabilities.
pred_df, prob_df = (
    pd.DataFrame(records) for records in (predictions, raw_outputs)
)

In [184]:
# Only the last expression of a cell is rendered automatically, so the first preview
# is shown explicitly with display() (available without import inside Jupyter);
# otherwise its result is silently discarded.
display(pred_df.head(10))
prob_df.head(10)

Unnamed: 0,id,probability
0,9841051B-7A7F-4657-98A9-117A9B36C98B_1,[0.9994059361548607]
1,9841051B-7A7F-4657-98A9-117A9B36C98B_2,[0.997699414515289]
2,9841051B-7A7F-4657-98A9-117A9B36C98B_3,[0.9999995664582505]
3,9841051B-7A7F-4657-98A9-117A9B36C98B_4,[0.9996517625834309]
4,9841051B-7A7F-4657-98A9-117A9B36C98B_5,[0.15003815075635676]
5,9841051B-7A7F-4657-98A9-117A9B36C98B_6,[3.744063148752484e-05]
6,C5FE84B7-E3B1-4677-8EDF-A60E7D358055_1,[4.9066822113834034e-05]
7,C5FE84B7-E3B1-4677-8EDF-A60E7D358055_2,[0.0004096146249521647]
8,C5FE84B7-E3B1-4677-8EDF-A60E7D358055_3,[0.8459255957363234]
9,C5FE84B7-E3B1-4677-8EDF-A60E7D358055_4,[0.0010002256285069012]


## Function to convert captured dates to the requested dd/mm/YYYY format

In [193]:

def get_date(s_date):
    """Convert a date string in one of the known layouts to ``dd/mm/YYYY``.

    The supported layouts are tried in order and the first one that parses wins:
    ``15-03-2021``, ``2021-03-15``, ``March 15, 2021``, ``March 2021``,
    ``Mar. 15, 2021`` and ``Monday, March 15, 2021``.

    Args:
        s_date: Candidate date text. Leading and trailing whitespace is ignored.

    Returns:
        The date formatted as ``dd/mm/YYYY``, or ``None`` when ``s_date`` is not a
        string or matches none of the supported layouts.
    """
    if not isinstance(s_date, str):
        return None

    date_patterns = (
        "%d-%m-%Y",
        "%Y-%m-%d",
        "%B %d, %Y",
        "%B %Y",
        "%b. %d, %Y",
        "%A, %B %d, %Y",
    )
    # Model answers may carry stray whitespace, which strptime would reject.
    candidate = s_date.strip()

    for pattern in date_patterns:
        try:
            return datetime.strptime(candidate, pattern).strftime('%d/%m/%Y')
        except ValueError:
            # Layout mismatch: try the next pattern. Only ValueError is caught so
            # that unrelated failures (e.g. KeyboardInterrupt) are not swallowed.
            continue

    return None


## Processing date prediction outcomes to comply with provided reg data

In [194]:

# Build the final table from the prediction frames WITHOUT mutating them, so this
# cell can be re-run safely. The previous version wrote into the rows yielded by
# DataFrame.iterrows(), which pandas does not guarantee to propagate back to the
# frame, and it aliased pred_df so that every run altered the input.

# Question asked for each numeric id suffix; any other suffix maps to the default.
# NOTE(review): 'Publiccation' and the 'Queston' column name look like typos, but
# they are emitted verbatim in the output table, so they are kept unchanged until
# the consumers of this table are confirmed.
QUESTION_LABELS = {
    '1': 'Publiccation/ Issue Date',
    '2': 'Compliance Date',
    '3': 'Effective Date',
    '4': 'Repeal Date',
    '5': 'Consultation Period',
}
DEFAULT_QUESTION_LABEL = 'Issuing Department'


def _format_insight(answers):
    """Return the display value for the first (best) answer of one prediction."""
    best_answer = answers[0]
    if best_answer == 'empty':
        return 'Unknown'
    # get_date returns None when the answer is not in a recognised date layout.
    return get_date(best_answer)


# Prediction ids have the form '<UID>_<question number>'.
id_parts = pred_df['id'].str.split('_')

output_df = pd.DataFrame(
    {
        'UID': id_parts.str[0],
        'Queston': id_parts.str[1].map(QUESTION_LABELS).fillna(DEFAULT_QUESTION_LABEL),
        'Insight': pred_df['answer'].map(_format_insight),
        # Each probability entry is a list; keep the value of the best answer.
        'Probability': prob_df['probability'].map(lambda values: float(values[0])),
    },
    index=pred_df.index,  # rows are matched on the index, as before
)

In [195]:
# Preview the first 50 rows of the final UID / question / insight / probability table.
output_df.head(50)

Unnamed: 0,UID,Queston,Insight,Probability
0,9841051B-7A7F-4657-98A9-117A9B36C98B,Publiccation/ Issue Date,15/03/2021,0.999406
1,9841051B-7A7F-4657-98A9-117A9B36C98B,Compliance Date,15/03/2021,0.997699
2,9841051B-7A7F-4657-98A9-117A9B36C98B,Effective Date,15/03/2021,1.0
3,9841051B-7A7F-4657-98A9-117A9B36C98B,Repeal Date,15/03/2021,0.999652
4,9841051B-7A7F-4657-98A9-117A9B36C98B,Consultation Period,Unknown,0.150038
5,9841051B-7A7F-4657-98A9-117A9B36C98B,Issuing Department,Unknown,3.7e-05
6,C5FE84B7-E3B1-4677-8EDF-A60E7D358055,Publiccation/ Issue Date,Unknown,4.9e-05
7,C5FE84B7-E3B1-4677-8EDF-A60E7D358055,Compliance Date,Unknown,0.00041
8,C5FE84B7-E3B1-4677-8EDF-A60E7D358055,Effective Date,19/01/2021,0.845926
9,C5FE84B7-E3B1-4677-8EDF-A60E7D358055,Repeal Date,Unknown,0.001


### END