In [1]:
from datasets import load_dataset

# Load SQuAD and drop the "id" and "title" columns in one step.
raw_datasets = load_dataset("squad").remove_columns(["id", "title"])

def prepare_data(example):
    """Flatten the first answer of ``example`` into character offsets.

    Reads ``example["answers"]["text"][0]`` and
    ``example["answers"]["answer_start"][0]`` and stores two new keys on the
    same mapping:

    * ``answer_start`` -- start offset of the first answer.
    * ``answer_end``   -- ``answer_start`` plus the length of the answer text.

    Only the first answer is used; any further entries are ignored.
    The (mutated) ``example`` is returned.
    """
    answers = example["answers"]
    first_text = answers["text"][0]
    first_start = answers["answer_start"][0]

    example["answer_start"] = first_start
    example["answer_end"] = first_start + len(first_text)
    return example

# Add answer_start / answer_end to every example, then drop the nested "answers" column.
raw_datasets = raw_datasets.map(prepare_data, remove_columns=["answers"])
# Display the train split to confirm the remaining feature names.
raw_datasets["train"]

Dataset({
    features: ['context', 'question', 'answer_start', 'answer_end'],
    num_rows: 87599
})

In [2]:
# Fetch the first training row once, then check that slicing the context with the
# stored character offsets reproduces the answer text.
first_row = raw_datasets["train"][0]
start = first_row["answer_start"]
end = first_row["answer_end"]

print(f"Context: {first_row['context']}")
print(f"Question: {first_row['question']}")
print(f"\nAnswer: {first_row['context'][start:end]}")

Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Answer: Saint Bernadette Soubirous


In [3]:
from transformers import AutoTokenizer

# Checkpoint name used to load the tokenizer.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Encode the first training example as a (question, context) pair.
example = raw_datasets["train"][0]
inputs = tokenizer(
    example["question"],
    example["context"],
    truncation="only_second",  # presumably truncates only the second sequence (the context) -- confirm in tokenizer docs
    padding="max_length",
    max_length=384,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True  # makes inputs["offset_mapping"] available; later cells read it
)

In [10]:
# Encode a sentence pair as tensors, then decode only the ids whose token type is 0.
out = tokenizer("Output:", "speed limited 30 (km/h)", return_tensors="pt")
input_ids, token_type_ids = out["input_ids"], out["token_type_ids"]
tokenizer.decode(token_ids=input_ids[token_type_ids == 0])

'[CLS] Output : [SEP]'

In [6]:
def find_labels(offsets, answer_start, answer_end, sequence_ids):
    """Map a character-level answer span to token indices within one encoded feature.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for the feature.
        answer_start: Character index in the context where the answer begins.
        answer_end: Character index in the context where the answer ends
            (exclusive, i.e. ``context[answer_start:answer_end]`` is the answer).
        sequence_ids: Per-token sequence id; tokens whose id is ``1`` are
            treated as context tokens.

    Returns:
        ``(start_position, end_position)`` -- inclusive token indices of the
        answer. ``(0, 0)`` is returned when the answer is not fully contained
        in this feature's context tokens (including a partial overlap), or
        when the feature has no context tokens at all.
    """
    num_tokens = len(sequence_ids)

    # Locate the contiguous run of context tokens (sequence id 1). The bounds
    # checks keep the scans inside the list when there is no context or no
    # trailing special/pad token.
    idx = 0
    while idx < num_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == num_tokens:
        return (0, 0)
    context_start = idx
    while idx < num_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully in the context, return (0, 0). The window
    # must start at or before the answer start and end at or after the answer
    # end; otherwise the scans below would land on a token outside the context.
    if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
        return (0, 0)

    # Last context token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= answer_start:
        idx += 1
    start_position = idx - 1

    # First context token (scanning from the right) that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= answer_end:
        idx -= 1
    end_position = idx + 1

    return start_position, end_position

In [5]:
# Column names of the example, plus the answer's starting character offset.
list(example), example["answer_start"]

(['context', 'question', 'answer_start', 'answer_end'], 515)

In [7]:
# Sanity check: the token span found for the first feature should decode to the answer text.
start, end = find_labels(
    offsets=inputs["offset_mapping"][0],
    answer_start=example["answer_start"],
    answer_end=example["answer_end"],
    sequence_ids=inputs.sequence_ids(0),
)
# end is inclusive, hence the +1 on the slice.
tokenizer.decode(inputs["input_ids"][0][start : end + 1])

'Saint Bernadette Soubirous'

In [8]:
# Inspect the per-token (char_start, char_end) pairs; the recorded output shows (0, 0)
# entries interleaved with real spans.
inputs['offset_mapping']

[[(0, 0),
  (0, 2),
  (3, 7),
  (8, 11),
  (12, 15),
  (16, 22),
  (23, 27),
  (28, 37),
  (38, 44),
  (45, 47),
  (48, 52),
  (53, 55),
  (56, 59),
  (59, 63),
  (64, 70),
  (70, 71),
  (0, 0),
  (0, 13),
  (13, 15),
  (15, 16),
  (17, 20),
  (21, 27),
  (28, 31),
  (32, 33),
  (34, 42),
  (43, 52),
  (52, 53),
  (54, 56),
  (56, 58),
  (59, 62),
  (63, 67),
  (68, 76),
  (76, 77),
  (77, 78),
  (79, 83),
  (84, 88),
  (89, 91),
  (92, 93),
  (94, 100),
  (101, 107),
  (108, 110),
  (111, 114),
  (115, 121),
  (122, 126),
  (126, 127),
  (128, 139),
  (140, 142),
  (143, 148),
  (149, 151),
  (152, 155),
  (156, 160),
  (161, 169),
  (170, 173),
  (174, 180),
  (181, 183),
  (183, 184),
  (185, 187),
  (188, 189),
  (190, 196),
  (197, 203),
  (204, 206),
  (207, 213),
  (214, 218),
  (219, 223),
  (224, 226),
  (226, 229),
  (229, 232),
  (233, 237),
  (238, 241),
  (242, 248),
  (249, 250),
  (250, 251),
  (251, 254),
  (254, 256),
  (257, 259),
  (260, 262),
  (263, 264),
  (264, 2

In [9]:
def preprocess_training_examples(examples):
    """Tokenize a batch of question/context pairs and attach answer token labels.

    Args:
        examples: A batch (mapping of column name -> list) providing the
            ``question``, ``context``, ``answer_start`` and ``answer_end``
            columns.

    Returns:
        The tokenizer output with ``offset_mapping`` and
        ``overflow_to_sample_mapping`` removed and two added keys,
        ``start_positions`` and ``end_positions``, holding one label pair per
        produced feature (as computed by ``find_labels``).
    """
    # Strip surrounding whitespace from the questions and tokenize the
    # stripped text: truncation only trims the context, so padding-like
    # whitespace in a question would otherwise eat into the token budget.
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    # One entry per produced feature, giving the index of the example it came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        start, end = find_labels(
            offset,
            examples["answer_start"][sample_idx],
            examples["answer_end"][sample_idx],
            inputs.sequence_ids(i),
        )
        start_positions.append(start)
        end_positions.append(end)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
# Run the preprocessing over every split in batches. The original columns are removed
# so that only the keys returned by preprocess_training_examples remain.
tokenized_datasets = raw_datasets.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [11]:
# Keys present on the first tokenized training feature.
list(tokenized_datasets["train"][0])

['input_ids',
 'token_type_ids',
 'attention_mask',
 'start_positions',
 'end_positions']

In [14]:
# Decode the first tokenized training feature back to text (special and pad tokens included).
data = tokenized_datasets['train'][0]
tokenizer.decode(data['input_ids'])

'[CLS] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive ( and in a direct line that connects through 3 statues and the Gold Dome ), is a simple, modern stone statue of Mary. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [20]:
# Keep only the ids whose token_type_id is 1 and decode them; in the recorded output
# this is the context followed by its closing [SEP].
res = [
    token_id
    for token_id, type_id in zip(data["input_ids"], data["token_type_ids"])
    if type_id == 1
]
tokenizer.decode(res)


'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive ( and in a direct line that connects through 3 statues and the Gold Dome ), is a simple, modern stone statue of Mary. [SEP]'

In [21]:
from transformers import BertTokenizer

# NOTE: this rebinds `tokenizer`; any cell executed afterwards sees this BertTokenizer
# instance instead of the AutoTokenizer-loaded one from the earlier cell.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sequence_a = "HuggingFace is based in NYC"
sequence_b = "Where is HuggingFace based?"

# Encode the two sentences as a pair, then decode the ids back to text to see where
# the special tokens were inserted.
encoded_dict = tokenizer(sequence_a, sequence_b)
decoded = tokenizer.decode(encoded_dict["input_ids"])

In [25]:
# Show the decoded sentence pair, including the inserted [CLS] / [SEP] markers.
decoded

'[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]'

In [23]:
# Feed the already-decoded string (with literal "[CLS]" / "[SEP]" markers) back through
# the tokenizer as a single sequence. In the recorded output the ids begin with 101, 101
# and end with 102, 102, and every token_type_id is 0, so the sentence-pair structure is
# not recovered this way.
seq = '[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]'
tokenizer(seq)

{'input_ids': [101, 101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [1]:
from Flamingo.lora_tuning import get_tokenizer

# NOTE: rebinds `tokenizer` again (the execution counter restarts at 1 here).
# get_tokenizer is a project helper; presumably it loads the tokenizer for the given
# checkpoint name -- confirm in Flamingo.lora_tuning.
tokenizer = get_tokenizer("facebook/opt-125m")

In [3]:
# Class id -> traffic-sign label text.
classes = {
    0: "Speed limit (20km/h)",
    1: "Speed limit (30km/h)",
    2: "Speed limit (50km/h)",
    3: "Speed limit (60km/h)",
    4: "Speed limit (70km/h)",
    5: "Speed limit (80km/h)",
    6: "End of speed limit (80km/h)",
    7: "Speed limit (100km/h)",
    8: "Speed limit (120km/h)",
    9: "No passing",
    10: "No passing veh over 3.5 tons",
    11: "Right-of-way at intersection",
    12: "Priority road",
    13: "Yield",
    14: "Stop",
    15: "No vehicles",
    16: "Veh > 3.5 tons prohibited",
    17: "No entry",
    18: "General caution",
    19: "Dangerous curve left",
    20: "Dangerous curve right",
    21: "Double curve",
    22: "Bumpy road",
    23: "Slippery road",
    24: "Road narrows on the right",
    25: "Road work",
    26: "Traffic signals",
    27: "Pedestrians",
    28: "Children crossing",
    29: "Bicycles crossing",
    30: "Beware of ice/snow",
    31: "Wild animals crossing",
    32: "End speed + passing limits",
    33: "Turn right ahead",
    34: "Turn left ahead",
    35: "Ahead only",
    36: "Go straight or right",
    37: "Go straight or left",
    38: "Keep right",
    39: "Keep left",
    40: "Roundabout mandatory",
    41: "End of no passing",
    42: "End no passing veh > 3.5 tons",
}

# Encode every label padded/truncated to 10 tokens and print the round-tripped text,
# to see how much padding each label receives.
for label in classes.values():
    out = tokenizer(label, max_length=10, padding="max_length", truncation=True)
    print(tokenizer.decode(out["input_ids"]))

</s>Speed limit (20km/h)<pad><pad>
</s>Speed limit (30km/h)<pad><pad>
</s>Speed limit (50km/h)<pad><pad>
</s>Speed limit (60km/h)<pad><pad>
</s>Speed limit (70km/h)<pad><pad>
</s>Speed limit (80km/h)<pad><pad>
</s>End of speed limit (80km/h)
</s>Speed limit (100km/h)<pad><pad>
</s>Speed limit (120km/h)<pad><pad>
</s>No passing<pad><pad><pad><pad><pad><pad><pad><pad>
</s>No passing veh over 3.5 tons<pad><pad>
</s>Right-of-way at intersection<pad><pad><pad>
</s>Priority road<pad><pad><pad><pad><pad><pad><pad>
</s>Yield<pad><pad><pad><pad><pad><pad><pad><pad>
</s>Stop<pad><pad><pad><pad><pad><pad><pad><pad><pad>
</s>No vehicles<pad><pad><pad><pad><pad><pad><pad><pad>
</s>Veh > 3.5 tons prohibited<pad><pad><pad>
</s>No entry<pad><pad><pad><pad><pad><pad><pad><pad>
</s>General caution<pad><pad><pad><pad><pad><pad><pad><pad>
</s>Dangerous curve left<pad><pad><pad><pad><pad>
</s>Dangerous curve right<pad><pad><pad><pad><pad>
</s>Double curve<pad><pad><pad><pad><pad><pad><pad><pad>
</s>Bumpy r

In [10]:
# Encode one label (the text itself contains the single quotes) padded to 20 tokens and
# decode the batch back to text.
out = tokenizer("'Right-of-way at intersection'", max_length=20, padding="max_length", truncation=True, return_tensors="pt")
tokenizer.batch_decode(out['input_ids'])

["</s>'Right-of-way at intersection'<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"]

In [21]:
# Wrap each raw text in the "<image>...<|endofchunk|>" template and append the
# tokenizer's EOS token.
sample = [
    f"<image>{text.strip()}<|endofchunk|>{tokenizer.eos_token}"
    for text in ("hloow", "sort")
]
# `out` is whatever an earlier cell left bound; it is displayed next to the samples.
sample, out

(['<image>hloow<|endofchunk|></s>', '<image>sort<|endofchunk|></s>'],
 {'input_ids': tensor([[    2,   108, 13984,    12,  1116,    12,  1970,    23,  8088,   108,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])})

In [22]:
# Register the two marker strings as additional special tokens on the tokenizer.
tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]})
# Display: the last id produced for "<image>", the encoding of a full templated sample,
# and the pad token id.
tokenizer.encode("<image>")[-1], tokenizer("<image>hloow<|endofchunk|></s>"), tokenizer.pad_token_id

(50266,
 {'input_ids': [2, 50266, 298, 4082, 1722, 50265, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]},
 1)