In [230]:
from datasets import load_dataset

dataset = load_dataset("MU-NLPC/Calc-mawps")


In [231]:
import pandas as pd 
pd_train = pd.DataFrame(dataset['train'])
pd_val = pd.DataFrame(dataset['validation'])
pd_test = pd.DataFrame(dataset['test'])

In [232]:
import numpy as np 

ops = set(['+', '-', '*', '/'])
def process_row(row):
    exp = row['expression']
    op_count = {}
    for op in ops:
        if op in exp:
            op_count[op] = exp.count(op)

    row['num_unique_ops'] = len(op_count)
    if len(op_count) != 1:
        row['valid'] = False 
        row['operation'] = np.nan
        return row     
    
    # Only if one unique op present
    row['valid'] = True
    row['operation'] = list(op_count.keys())[0] # First one

    op = row['operation']
    num_occurances = op_count[op] # Number of occurances
    num_operands = num_occurances + 1 # Assuming binary operands 

    # Remove parantheses 
    exp = exp.replace('(', '')
    exp = exp.replace(')', '')

    splits = exp.split(op) # Split it on op
    if len(splits) == num_operands: # Once split if all clean
        row['operands'] = [float(operand) for operand in splits] 
    else:
        pass # TODO handle this


    # TODO get the tag sequence
    tag_seq = [1 if token in splits else 0 for token in row['question'].split(' ')]
    question_split = [token for token in row['question'].split(' ')]

    assert len(splits) == num_operands

    row['operand_tags'] = tag_seq 
    row['question_split'] = question_split

    # TODO Validate that the result matches. Call sympy?
    # Validate by recon structing the expression Reconstructed expr 
    reconst_expr = row['operation'].join(splits)
    assert reconst_expr == exp
    return row


def process_data(df: pd.DataFrame, name = ''):
    proc_df = df.apply(process_row, axis=1)
    proc_df_filter = proc_df[proc_df['valid'] == True]    
    print(f'For split {name}, lost {(len(proc_df_filter)/len(df)):2f}% ({len(df)} -> {len(proc_df_filter)})')
    return proc_df_filter
    

In [233]:
proc_val = process_data(pd_val, 'validation')
proc_test = process_data(pd_test, 'test')
proc_train = process_data(pd_train, 'train')

For split validation, lost 0.976923% (1040 -> 1016)
For split test, lost 0.980769% (520 -> 510)
For split train, lost 0.966942% (1089 -> 1053)


In [234]:
from datasets import Dataset, DatasetDict

# Assuming you have three pandas dataframes: pd_train, pd_val, pd_test
train_dataset = Dataset.from_pandas(proc_train, split='train')
val_dataset = Dataset.from_pandas(proc_val, split='validation')
test_dataset = Dataset.from_pandas(proc_test, split='test')

combined_dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset, 'test': test_dataset})


In [235]:
combined_dataset.push_to_hub("vishruthnath/Calc-MAWPS-CalcBERT-Tags", token='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')

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 85.63ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 162.56ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 63.34ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s]
README.md: 100%|██████████| 989/989 [00:00<00:00, 682kB/s]


In [236]:
train_dataset

Dataset({
    features: ['chain', 'equation', 'expression', 'id', 'num_unique_ops', 'operand_tags', 'operands', 'operation', 'question', 'question_split', 'result', 'result_float', 'valid', '__index_level_0__'],
    num_rows: 1053
})

## BERT 

In [319]:
from transformers import AutoModel, AutoModelForTokenClassification, AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorForTokenClassification, DataCollatorWithPadding

In [374]:
model_id = 'bert-base-uncased'
# model_id = 'roberta'

tokenizer = AutoTokenizer.from_pretrained(model_id)
special_tokens = {'additional_special_tokens': ["[OP]"]}
tokenizer.add_special_tokens(special_tokens)

1

In [375]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]',
 'additional_special_tokens': ['[OP]']}

In [None]:
# def tokenize_and_align_labels(examples):
#     tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

#     labels = []
#     for i, label in enumerate(examples[f"ner_tags"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:  # Set the special tokens to -100.
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(-100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

In [None]:
# def align_tokens_with_labels(tags,word_ids):
#     aligned_labels = []
#     current_word = None
#     for word_id in word_ids:
#         if word_id!=current_word:
#             current_word = word_id
#             label = -100 if word_id is None else tags[word_id]
#             aligned_labels.append(label)
#         elif word_id is None:
#             aligned_labels.append(-100)
#         else:
#             label = tags[word_id]
#             if label % 2 == 1:
#                 label += 1
#             aligned_labels.append(label)

#     return aligned_labels

In [371]:
operation_label2id = {
    '+': 0, 
    '-': 1,
    '*': 2,
    '/': 3
}

operation_id2label = {v: k for k,v in operation_label2id.items()}

In [402]:
def preprocess_mawps(example, tokenizer):
    # question = example['question_split'] + ' [OP]'
    question = example['question_split']
    question.append('[OP]')
    # print('Question', question)

    op_tags = example['operand_tags']   
    op_tags.append(-100) # Appending -100 for the OP token 
    # print('OP tags', op_tags)
    assert len(question) == len(op_tags)

    tokenized_inputs = tokenizer(question, is_split_into_words=True, add_special_tokens=True, truncation=True, return_length=True)
    # Returning length as need to find the rep for last [OP] token

    # labels = []
    # for i, label in enumerate(op_tags):
    # word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
    word_ids = tokenized_inputs.word_ids()  # Map tokens to their respective word.
    # print('Input_ids', tokenized_inputs.input_ids)
    # print('Word_ids', word_ids, len(word_ids))

    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:  # Set the special tokens to -100.
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:  # Only label the first token of a given word.
            label_ids.append(op_tags[word_idx])
            previous_word_idx = word_idx
        else:
            label_ids.append(op_tags[previous_word_idx])
            # label_ids.append(-100) # Making all subsequent things -100
        # previous_word_idx = word_idx
    # labels.append(label_ids)

    # print('Labels', label_ids)

    operation = example['operation']
    tokenized_inputs['operation_labels'] = operation_label2id[operation]

    tokenized_inputs["labels"] = label_ids
    assert len(tokenized_inputs['labels']) == len(tokenized_inputs['input_ids'])
    return tokenized_inputs

    # print(example['operand_tags'])
    # op_tags = example['operand_tags']
    # op_tags.append(-100) # Appending -100 for the OP token 
    # tokenized_inputs['labels'] = op_tags
    # print('Input Ids', len(tokenized_inputs['input_ids']))
    # print('Labels', len(tokenized_inputs['labels']))

    # assert len(tokenized_inputs['labels']) == len(tokenized_inputs['input_ids'])
    # return tokenized_inputs

In [243]:
# def align_tokens_with_labels(tags,word_ids):
#     aligned_labels = []
#     current_word = None
#     for word_id in word_ids:
#         if word_id!=current_word:
#             current_word = word_id
#             label = -100 if word_id is None else tags[word_id]
#             aligned_labels.append(label)
#         elif word_id is None:
#             aligned_labels.append(-100)
#         else:
#             label = tags[word_id]
#             if label % 2 == 1:
#                 label += 1
#             aligned_labels.append(label)

#     return aligned_labels

# def tokenize_and_align_labels(examples,tokenizer):
#                             #   label_name_to_id):
#     # questions = [q + ' [OP]' for q in examples['question']]
#     question = examples['question']

# #     question = example['question'] + '[OP]'
# # print(question)
# # tokenized_input = tokenizer(question, add_special_tokens=True)
# # print(tokenized_input)
# # print(tokens)

#     # tokenized_inputs = tokenizer(question, truncation=True, is_split_into_words=True,
#     #                                padding=True,max_length=512, add_special_tokens=True)
#     tokenized_inputs = tokenizer(question, add_special_tokens=True)
    
#     tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"], skip_special_tokens=False)

#     print(tokenized_inputs)
#     print(tokens)
    
#     labels = []
#     for i, tags in enumerate(examples["operand_tags"]):
#     #   id_tags = map_label_to_id(tags,label_name_to_id)
#       word_ids = tokenized_inputs.word_ids(batch_index=i)
#       aligned_labels = align_tokens_with_labels(tags,word_ids)
#       labels.append(aligned_labels)

#     tokenized_inputs['labels'] = labels
#     return tokenized_inputs

In [347]:
# combined_tokenized_dataset = combined_dataset.map(preprocess_mawps, 
#                                                   fn_kwargs={'tokenizer': tokenizer})

Map: 100%|██████████| 1053/1053 [00:01<00:00, 799.44 examples/s]
Map: 100%|██████████| 1016/1016 [00:01<00:00, 932.11 examples/s]
Map: 100%|██████████| 510/510 [00:00<00:00, 769.82 examples/s]


In [403]:
combined_tokenized_dataset = combined_dataset.map(preprocess_mawps,
                                         fn_kwargs={"tokenizer": tokenizer},
                                         remove_columns=['id', 'chain', 'equation', 'expression', 'num_unique_ops', 'operand_tags', 'operands', 'operation', 'question', 'question_split', 'valid', '__index_level_0__', 'result'])
# kept columns (id, result, 'input_ids', 'token_type_ids', 'attention_mask', 'operation_labels', 'labels')

# rename result_float to result 
combined_tokenized_dataset.rename_column('result_float', 'result')

Map: 100%|██████████| 1053/1053 [00:00<00:00, 1256.69 examples/s]
Map: 100%|██████████| 1016/1016 [00:00<00:00, 1347.93 examples/s]
Map: 100%|██████████| 510/510 [00:00<00:00, 1476.53 examples/s]


DatasetDict({
    train: Dataset({
        features: ['result', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'],
        num_rows: 1053
    })
    validation: Dataset({
        features: ['result', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'],
        num_rows: 1016
    })
    test: Dataset({
        features: ['result', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'],
        num_rows: 510
    })
})

In [404]:
# TODO Custom collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

In [405]:
train_dataset = combined_tokenized_dataset['train']
val_dataset = combined_tokenized_dataset['validation']
test_dataset = combined_tokenized_dataset['test']

In [407]:
for ex in train_dataset:
    print(ex.keys())

dict_keys(['result_float', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'])
dict_keys(['result_float', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'])
dict_keys(['result_float', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'])
dict_keys(['result_float', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'])
dict_keys(['result_float', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'])
dict_keys(['result_float', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'])
dict_keys(['result_float', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'])
dict_keys(['result_float', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels'])
dict_keys(['result_float', 'input_ids', 'token_type_ids', 'atten

In [408]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset, collate_fn=data_collator, batch_size=8)
val_dataloader = DataLoader(val_dataset, collate_fn=data_collator, batch_size=8)
test_dataloader = DataLoader(test_dataset, collate_fn=data_collator, batch_size=8)

In [409]:
for ex in test_dataloader:
    print(ex)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'result_float': tensor([152.0000,  34.0000,   6.2222, 444.0000,  16.8421, 135.0000,   5.0000,
          4.4242]), 'input_ids': tensor([[  101, 17551,  2018, 22343,  2338,  1012, 10941, 16763,  2070,  2338,
          1012,  2085, 17551,  2038,  6486,  2338,  1012,  2129,  2116,  2106,
         10941, 16763,  2015,  1029, 30522,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  4907,  2179,  5179, 11915, 18223,  2015,  2006,  1996,  3509,
          1010,  2002,  2435,  8201,  2070,  1997,  2010, 11915, 18223,  2015,
          1012,  2002,  2038,  2570, 11915, 18223,  2187,  1012,  2129,  2116,
         11915, 18223,  2015,  2106,  2002,  2507,  2000,  8201,  1029, 30522,
           102],
        [  101,  5696,  6855,  1023,  8641,  1997, 11382,  9148,  1012,  2129,
          2116, 11382,  9148,  1999,  2169,  4524,  1029,  2065,  2561,  5179,
         11382,  9148,  5696,  6855,  1012, 3052

In [410]:
token_id2label = {
    0: 'Not_Operand',
    1: 'Operand'
}

token_label2id = {v: k for k,v in id2label.items()}

In [411]:
model = AutoModelForTokenClassification.from_pretrained(
        model_id,
        id2label=token_id2label,
        label2id=token_label2id,
        ignore_mismatched_sizes=True
    )

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [424]:
model.resize_token_embeddings(len(tokenizer)) # Resize embedding to include special token [OP]

Embedding(30523, 768)

In [425]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30523, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

## Training Loop

In [413]:
import torch
import evaluate 
from tqdm import tqdm
from accelerate import Accelerator

In [449]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)
# accelerator = Accelerator()
# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#     model, optimizer, train_dataloader, val_dataloader
# )

In [415]:
num_train_epochs = 5
num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

In [None]:
# from transformers import get_scheduler
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )

In [426]:
model.config.hidden_size

768

In [450]:
operation_head = torch.nn.Linear(model.config.hidden_size, len(operation_id2label))

In [452]:
from torch.nn import functional as F
import datetime

In [451]:
def process_predictions(predictions,labels):
    predictions=predictions.detach().cpu().clone().numpy()
    labels=labels.detach().cpu().clone().numpy()

    # Only considering loss for tokens where prediction_token = -1
    true_labels = [[token_id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [[token_id2label[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    return true_labels, true_predictions

In [453]:
token_metric = evaluate.load("seqeval")
operation_metric = evaluate.load("accuracy")

progress_bar = tqdm(range(num_training_steps))
# output_dir = '/content/drive/MyDrive/SciNER/model_v0'

loss_mean = 0
for epoch in range(num_train_epochs):
    # Training
    loss_mean = 0
    model.train()
    for batch in train_dataloader:
        # ['result_float', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'operation_labels', 'labels']
        model_input_keys = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
        model_input = {k: v for k,v in batch.items() if k in model_input_keys} # Filter out keys
        outputs = model(**model_input,
                        output_hidden_states=True)

        # print(outputs)

        operand_classification_loss = outputs.loss # Token classification loss 
        # print(operand_classification_loss)

        last_hidden_state = outputs.hidden_states[-1] # Shape: (bs, max_seq_len, hidden_size)
        # print(last_hidden_state.shape)

        lengths = batch['length'] # The total length (before padding)
        # print(lengths)
        op_token_idx = lengths - 2 # Lengths-1 is </s> or [SEP] and before that is [OP]
        # print(op_token_idx)

        op_token_reps = []
        # op_input_ids = [] #For sanity check 

        for i in range(len(op_token_idx)):
            op_token_rep = last_hidden_state[i, op_token_idx[i], :] # Get the op token rep for ith in the batch
            op_token_reps.append(op_token_rep)
            # op_input_ids.append(batch['input_ids'][i, op_token_idx[i]]) #For sanity check 
    
        op_token_reps = torch.stack(op_token_reps).squeeze() # (bs, hidden_size)
        # op_input_ids = torch.stack(op_input_ids).squeeze() #For sanity check 

        # print(batch['input_ids'])
        # print(op_input_ids) #For sanity check 
        # print(op_token_reps.shape)

        # Intent of loop above is to do what's given below, but torch doesn't index as per the tokens like numpy (it gives 8 per example)
        # op_input_ids = batch['input_ids'][:, op_token_idx] # bs, seq_len -> bs 
        # op_token_reps = last_hidden_state[:, op_token_idx, :] # Shape: (bs, hidden_size)

        operation_preds = operation_head(op_token_reps)
        # print(operation_preds.shape, operation_preds)

        operation_classification_loss = F.cross_entropy(operation_preds, batch['operation_labels'])
        # print(operation_classification_loss)


        # loss = weighted_loss(
        #       outputs["logits"].permute(0, 2, 1),
        #       batch["labels"]
        #       )

        total_loss = operand_classification_loss + operation_classification_loss 
        # total_loss = total_loss.item()
        # print(total_loss)
        total_loss.backward()
        # accelerator.backward(total_loss)

        optimizer.step()
        # lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Observe training loss
    print("Step Loss: {}".format(loss_mean/train_dataloader.__len__()))

    # Evaluation
    model.eval()
    for batch in val_dataloader:
        with torch.no_grad():
            model_input_keys = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
            model_input = {k: v for k,v in batch.items() if k in model_input_keys} # Filter out keys
            outputs = model(**model_input)
            # outputs = model(**batch)

        # Token/OPerand predictions ===== 
        token_predictions = outputs.logits.argmax(dim=-1)
        token_labels = batch["labels"]
        # ==========

        # =========== Computing operation classification preds
        last_hidden_state = outputs.hidden_states[-1] # Shape: (bs, max_seq_len, hidden_size)
        lengths = batch['length'] # The total length (before padding)
        op_token_idx = lengths - 2 # Lengths-1 is </s> or [SEP] and before that is [OP]
        op_token_reps = []
        # op_input_ids = [] #For sanity check 

        for i in range(len(op_token_idx)):
            op_token_rep = last_hidden_state[i, op_token_idx[i], :] # Get the op token rep for ith in the batch
            op_token_reps.append(op_token_rep)
            # op_input_ids.append(batch['input_ids'][i, op_token_idx[i]]) #For sanity check 
    
        op_token_reps = torch.stack(op_token_reps).squeeze() # (bs, hidden_size)
        # op_input_ids = torch.stack(op_input_ids).squeeze() #For sanity check 
        operation_preds = operation_head(op_token_reps).argmax(dim=-1)
        operation_labels = batch['operation_labels']
        # =============================

        # # Necessary to pad predictions and labels for being gathered
        # predictions = accelerator.pad_across_processes(token_predictions, dim=1, pad_index=-100)
        # labels = accelerator.pad_across_processes(token_labels, dim=1, pad_index=-100)

        # predictions_gathered = accelerator.gather(token_predictions)
        # labels_gathered = accelerator.gather(token_labels)

        # true_predictions, true_labels = process_predictions(predictions_gathered, labels_gathered)
        true_predictions, true_labels = process_predictions(token_predictions, token_labels)
        token_metric.add_batch(predictions=true_predictions, references=true_labels)

        # ===== 
        operation_preds = operation_preds.detach().cpu().clone().numpy()
        operation_labels = operation_labels.detach().cpu().clone().numpy()
        op_preds_names = [operation_id2label[p] for p in operation_preds]
        op_labels_names = [operation_id2label[l] for l in operation_labels]
        operation_metric.add_batch(predictions=op_preds_names, references=op_labels_names)

    token_results = token_metric.compute()
    operation_results = operation_metric.compute()
    print(
        f"epoch {epoch}: Token",
        {
            key: token_results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )
    print(
        f"epoch {epoch}: Seq/OPeration",
        {
            key: operation_results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    # accelerator.wait_for_everyone()
    # unwrapped_model = accelerator.unwrap_model(model)
    # unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    model.save_pretrained(f'model_{epoch}_{datetime.datetime.now()}')

    # if accelerator.is_main_process:
    tokenizer.save_pretrained(f'model_{epoch}_{datetime.datetime.now()}')

 10%|▉         | 13/132 [21:35<3:17:37, 99.64s/it]


KeyboardInterrupt: 

In [161]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [162]:
special_tokens = {'additional_special_tokens': ["[OP]"]}
tokenizer.add_special_tokens(special_tokens)

1

In [166]:
example = train_dataset[0] 
question = example['question'] + '[OP]'
print(question)
tokenized_input = tokenizer(question, add_special_tokens=True)
print(tokenized_input)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"], skip_special_tokens=False)
print(tokens)

Mark had 2 Doll. Roland proffered him some more. Now Mark has 161 Doll. How many did Roland proffer him?[OP]
{'input_ids': [101, 2928, 2018, 1016, 10658, 1012, 8262, 11268, 7512, 2098, 2032, 2070, 2062, 1012, 2085, 2928, 2038, 17365, 10658, 1012, 2129, 2116, 2106, 8262, 11268, 7512, 2032, 1029, 30522, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'mark', 'had', '2', 'doll', '.', 'roland', 'prof', '##fer', '##ed', 'him', 'some', 'more', '.', 'now', 'mark', 'has', '161', 'doll', '.', 'how', 'many', 'did', 'roland', 'prof', '##fer', 'him', '?', '[OP]', '[SEP]']


In [167]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

In [164]:
# Decode [OP]

tokenizer.encode('[OP]')

[101, 30522, 102]

In [163]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]',
 'additional_special_tokens': ['[OP]']}

In [140]:
model = AutoModel.from_pretrained('bert-base-uncased')

In [125]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [126]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [127]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [106]:
example

'Mark had 2 Doll. Roland proffered him some more. Now Mark has 161 Doll. How many did Roland proffer him?'

In [128]:
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [129]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el