In [1]:
import datasets

# Load the training split of the 'squad_v2' dataset and display its schema.
squad = datasets.load_dataset('squad_v2', split='train')
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})

In [2]:
# Look at one raw example: 'answers' holds parallel lists of answer texts and
# their character start offsets into 'context'.
squad[0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

In [3]:
# Character offset just past the end of the first answer: start offset + answer length.
squad[0]['answers']['answer_start'][0] + len(squad[0]['answers']['text'][0])

286

In [4]:
# Sanity check: slicing the context with the start offset (269) and the computed
# end offset (286) should give back the answer text.
squad[0]['context'][269:286]

'in the late 1990s'

In [5]:
from transformers import AutoTokenizer

# Tokenizer for the 'bert-base-uncased' checkpoint (the same checkpoint name the
# model is loaded from further down).
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# Tokenize every (question, context) pair to a fixed length of 384 tokens and keep
# the per-token character offsets, which are needed later to turn the answer's
# character span into token positions.
# batched=True passes lists of questions/contexts to the tokenizer in one call per
# batch instead of one call per row; the resulting columns are the same.
squad = squad.map(
    lambda x: tokenizer(x['question'], x['context'], max_length=384,
                        padding='max_length', truncation=True, return_offsets_mapping=True),
    batched=True,
)
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'],
    num_rows: 130319
})

In [7]:
# Decode the first example's ids back to text to check the encoded layout
# (question and context separated by [SEP], followed by [PAD] filler).
tokenizer.decode(squad[0]['input_ids'])

2024-04-28 09:45:48.297412: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'[CLS] when did beyonce start becoming popular? [SEP] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best - selling girl groups of all time. their hiatus saw the release of beyonce\'s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [8]:
sample_type_ids = squad[0]['token_type_ids']

# Number of leading tokens that come before the first segment-1 token
# (for this example: [CLS], the question tokens and the first [SEP]).
question_len = next(
    (pos for pos, type_id in enumerate(sample_type_ids) if type_id == 1),
    len(sample_type_ids),
)

# Summing the ids counts the tokens flagged with 1, i.e. the second segment
# (presumably including its closing [SEP] -- confirm against the tokenizer).
context_len = sum(sample_type_ids)
question_len, context_len

(9, 165)

In [None]:
# NOTE(review): this unexecuted cell repeats the end-offset computation from an
# earlier cell (start offset + answer length); it looks like a leftover.
squad[0]['answers']['answer_start'][0] + len(squad[0]['answers']['text'][0])

In [None]:
def get_pos(row):
    """Flatten a row's 'answers' into a single start/end/text triple.

    Only the first entry of the 'text' and 'answer_start' lists is used.

    Args:
        row: mapping that may contain an 'answers' mapping with list-valued
            'text' and 'answer_start' entries.

    Returns:
        ``{'answers': {'answer_start': int, 'answer_end': int, 'text': str}}``
        where 'answer_end' is 'answer_start' plus the length of the text.
        Missing or empty fields fall back to 0 / 0 / ''.
    """
    answers = row.get('answers') or {}
    texts = answers.get('text')
    starts = answers.get('answer_start')

    # First answer text, or an empty string when none is given.
    text = texts[0] if texts else ''

    # Offsets stay at 0 unless a start offset is actually present.
    if starts:
        answer_start = starts[0]
        answer_end = answer_start + len(text)
    else:
        answer_start = answer_end = 0

    flattened = {
        'answer_start': answer_start,
        'answer_end': answer_end,
        'text': text,
    }
    return {'answers': flattened}
# Replace each row's 'answers' column with the flattened start/end/text form
# returned by get_pos; the result is kept under a new name.
squad_v2 = squad.map(get_pos)

# Check which columns a mapped row exposes.
squad_v2[0].keys()

In [20]:
def char_to_id(sample):
    """Convert one example's character-level answer span into token positions.

    Args:
        sample: mapping with
            - 'answers': mapping holding integer 'answer_start' and 'answer_end'
              character offsets (end exclusive);
            - 'token_type_ids': per-token segment ids, where 1 marks the second
              segment;
            - 'offset_mapping': per-token ``(char_start, char_end)`` pairs.

    Returns:
        ``{'start_positions': int, 'end_positions': int}``. 'start_positions' is
        the index of the first token overlapping the answer and 'end_positions'
        is one past the index of the last such token. Both are 0 when the span
        is empty (no answer) or is not fully covered by the context tokens that
        were kept (e.g. the answer was truncated away).

    NOTE(review): 'end_positions' is an exclusive index, as in the previous
    version of this function. If downstream decoding expects the index of the
    last answer token itself, subtract 1 -- confirm against the inference code.
    """
    no_answer = {'start_positions': 0, 'end_positions': 0}
    char_start = sample['answers']['answer_start']
    char_end = sample['answers']['answer_end']

    # An empty span has nothing to locate; report it as "no answer" rather than
    # letting offset 0 match the first context token.
    if char_end <= char_start:
        return no_answer

    token_types = sample['token_type_ids']
    # Tokens before the first segment-1 token belong to the first segment.
    question_len = next(
        (idx for idx, type_id in enumerate(token_types) if type_id == 1),
        len(token_types),
    )
    context_len = sum(token_types)

    # Offsets of the second-segment tokens, minus the last one (presumably the
    # closing [SEP] -- confirm against the tokenizer).
    context_mappings = sample['offset_mapping'][question_len:][:max(context_len - 1, 0)]

    token_start = None
    for i, (tok_start, tok_end) in enumerate(context_mappings):
        # First token that extends past the answer's first character. The strict
        # comparison skips a token that merely ends where the answer begins, and
        # still finds a token when the start offset falls between two tokens.
        if token_start is None and tok_end > char_start:
            token_start = question_len + i
        if tok_end >= char_end:
            # This token reaches the end of the answer. If it begins at or after
            # the (exclusive) end, the answer finished on the previous token.
            last = i if tok_start < char_end else i - 1
            token_end = question_len + last + 1
            if token_end <= token_start:
                # No token actually overlaps the span.
                return no_answer
            return {'start_positions': token_start, 'end_positions': token_end}

    # No token reaches the end of the answer: the span is outside the kept context.
    return no_answer
    

In [21]:
# Add the 'start_positions' / 'end_positions' label columns to every row.
squad_v2 = squad_v2.map(char_to_id)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

In [22]:
# Inspect the first processed row (flattened 'answers' plus the tokenizer columns).
squad_v2[0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'answer_end': 286,
  'answer_start': 269,
  'text': 'in the late 1990s'},
 'input_ids': [101,
  2043,
  2106,
  20773,
  2707,
  3352,
  2759,
  1029,
  102,
  2

In [23]:
# Keep only the model inputs and the label columns.
columns_to_drop = ['id', 'title', 'context', 'question', 'answers', 'offset_mapping']
# Only ask for columns that are still present: remove_columns raises on unknown
# names, so without this filter re-running the cell would fail.
squad_v2 = squad_v2.remove_columns(
    [column for column in columns_to_drop if column in squad_v2.column_names]
)
squad_v2

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 130319
})

In [34]:
from transformers import default_data_collator
from transformers import BertForQuestionAnswering

# Load BERT with a question-answering head. As the warning printed below says, the
# 'qa_outputs' weights are newly initialized, so the model needs fine-tuning
# before it can be used for predictions.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import TrainingArguments

batch_size = 16
epochs = 1
args = TrainingArguments(
    'bert-base-uncased-squad2',  # output directory
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    num_train_epochs = epochs,
    weight_decay = 0.1,
    # Warm up over the first 10% of training. Warmup is measured in optimizer
    # steps (batches), not examples, so it is given as a fraction of the total
    # steps rather than derived from the dataset size.
    warmup_ratio = 0.1
)

In [36]:
from transformers import Trainer
import torch

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Assemble the trainer from the model, the training arguments and the processed
# dataset; default_data_collator is used to build the batches.
trainer = Trainer(
    model.to(device),
    args,
    train_dataset=squad_v2,
    data_collator=default_data_collator,
    tokenizer=tokenizer
)