In [None]:
# !pip install transformers datasets & pip install --upgrade accelerate

## Data: Squad
- five columns: `'id', 'title', 'context', 'question', 'answers'`

In [2]:
from datasets import load_dataset
raw_datasets = load_dataset("squad")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

check data format

In [3]:
raw_datasets['train']['id'][0], raw_datasets['train']['title'][0]

('5733be284776f41900661182', 'University_of_Notre_Dame')

In [4]:
raw_datasets['train']['question'][0], raw_datasets['train']['answers'][0]

('To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]})

In [5]:
raw_datasets['train']['context'][0]

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [6]:
### 有可能有多個ans
raw_datasets["validation"][2]["answers"]

{'text': ['Santa Clara, California',
  "Levi's Stadium",
  "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."],
 'answer_start': [403, 355, 355]}

## tokenizer: 將word分割成片段並轉換為有意義的表示法

In [7]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-cased" # "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

check data format

In [8]:
context = raw_datasets["train"][1]["context"]
question = raw_datasets["train"][1]["question"]

inputs = tokenizer(question, context)
inputs

{'input_ids': [101, 1327, 1110, 1107, 1524, 1104, 1103, 10360, 8022, 4304, 4334, 136, 102, 22182, 1193, 117, 1103, 1278, 1144, 170, 2336, 1959, 119, 1335, 4184, 1103, 4304, 4334, 112, 188, 2284, 10945, 1110, 170, 5404, 5921, 1104, 1103, 6567, 2090, 119, 13301, 1107, 1524, 1104, 1103, 4304, 4334, 1105, 4749, 1122, 117, 1110, 170, 7335, 5921, 1104, 4028, 1114, 1739, 1146, 14089, 5591, 1114, 1103, 7051, 107, 159, 21462, 1566, 24930, 2508, 152, 1306, 3965, 107, 119, 5893, 1106, 1103, 4304, 4334, 1110, 1103, 19349, 1104, 1103, 11373, 4641, 119, 13301, 1481, 1103, 171, 17506, 9538, 1110, 1103, 144, 10595, 2430, 117, 170, 14789, 1282, 1104, 8070, 1105, 9284, 119, 1135, 1110, 170, 16498, 1104, 1103, 176, 10595, 2430, 1120, 10111, 20500, 117, 1699, 1187, 1103, 6567, 2090, 25153, 1193, 1691, 1106, 2216, 17666, 6397, 3786, 1573, 25422, 13149, 1107, 8109, 119, 1335, 1103, 1322, 1104, 1103, 1514, 2797, 113, 1105, 1107, 170, 2904, 1413, 1115, 8200, 1194, 124, 11739, 1105, 1103, 3487, 17917, 114, 117

### decode：將編碼後的句子轉回文字

In [9]:
tokenizer.decode(inputs["input_ids"])

'[CLS] What is in front of the Notre Dame Main Building? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive ( and in a direct line that connects through 3 statues and the Gold Dome ), is a simple, modern stone statue of Mary. [SEP]'

### context可能會超過model input size限制，tokenize 時需將context 切成等長的片段。透過設定：


* max_length
* truncation
* stride

In [10]:
inputs = tokenizer(text = question,
                   text_pair = context,
                   max_length=100, # the maximum length of context
                   truncation="only_second", # 不能truncate question
                   stride=50, # move forward 50 (overlap 50 chars)
                   return_overflowing_tokens=True)

inputs.keys(), len(inputs.input_ids) #被切成4 份

(dict_keys(['input_ids', 'attention_mask', 'overflow_to_sample_mapping']), 4)

格式：[CLS] question [SEP] context

In [11]:
for id, input_ids in enumerate(inputs["input_ids"]):
    print(id, len(input_ids), tokenizer.decode(input_ids))

0 100 [CLS] What is in front of the Notre Dame Main Building? [SEP] Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the G [SEP]
1 100 [CLS] What is in front of the Notre Dame Main Building? [SEP] facing it, is a copper statue of Christ with arms upraised with the legend " Venite Ad Me Omnes ". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernade [SEP]
2 100 [CLS] What is in front of the Notre Dame Main Building? [SEP] of the Sacred Heart. Immediately behind the basili

overflow_to_sample_mapping: 當return_overflowing_tokens=True 產生，用儲存片段資料的來源句子

In [12]:
inputs['overflow_to_sample_mapping']

[0, 0, 0, 0]

In [13]:
### 三個例子，各自切成四份，可以用overflow_to_sample_mapping 對照回原資料的id
inputs = tokenizer(text = raw_datasets["train"][:3]["question"],
                   text_pair = raw_datasets["train"][:3]["context"],
                   max_length=100,
                   truncation="only_second",
                   stride=50,
                   return_overflowing_tokens=True)

inputs['overflow_to_sample_mapping']

[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]

offset_mapping：設定return_offsets_mapping=True，回傳每一個token 在原文中的起訖位置（以char 為單位）

In [14]:
inputs = tokenizer(text = question,
                   text_pair = context,
                   max_length=100, # the maximum length of context
                   truncation="only_second", # 不能truncate question
                   stride=50, # move forward 50 (overlap 50 chars)
                   return_overflowing_tokens=True,
                   return_offsets_mapping=True)
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [15]:
inputs['offset_mapping'] ##每個token在字串中的位置

[[(0, 0),
  (0, 4),
  (5, 7),
  (8, 10),
  (11, 16),
  (17, 19),
  (20, 23),
  (24, 29),
  (30, 34),
  (35, 39),
  (40, 48),
  (48, 49),
  (0, 0),
  (0, 13),
  (13, 15),
  (15, 16),
  (17, 20),
  (21, 27),
  (28, 31),
  (32, 33),
  (34, 42),
  (43, 52),
  (52, 53),
  (54, 56),
  (56, 58),
  (59, 62),
  (63, 67),
  (68, 76),
  (76, 77),
  (77, 78),
  (79, 83),
  (84, 88),
  (89, 91),
  (92, 93),
  (94, 100),
  (101, 107),
  (108, 110),
  (111, 114),
  (115, 121),
  (122, 126),
  (126, 127),
  (128, 139),
  (140, 142),
  (143, 148),
  (149, 151),
  (152, 155),
  (156, 160),
  (161, 169),
  (170, 173),
  (174, 180),
  (181, 183),
  (183, 184),
  (185, 187),
  (188, 189),
  (190, 196),
  (197, 203),
  (204, 206),
  (207, 213),
  (214, 218),
  (219, 223),
  (224, 226),
  (226, 229),
  (229, 232),
  (233, 237),
  (238, 241),
  (242, 248),
  (249, 250),
  (250, 251),
  (251, 254),
  (254, 256),
  (257, 259),
  (260, 262),
  (263, 264),
  (264, 265),
  (265, 268),
  (268, 269),
  (269, 270),
 

## Answer Alignment: 將answer 對應回片段資料作為target

In [17]:
answer = raw_datasets["train"][1]["answers"]
answer

{'text': ['a copper statue of Christ'], 'answer_start': [188]}

sequence_ids：tokenizer output 來源為function 的第幾個input string
* 0: 第一個輸入(question)
* 1: 第二個輸入(context)

In [18]:
print(inputs.sequence_ids(0)) #tokenizer的輸入來源

[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None]


定位context 在片段中的位置 (the first and last '1')


In [19]:
sequence_ids = inputs.sequence_ids(0)

ctx_start = sequence_ids.index(1) # first occurrence
ctx_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1 # last occurrence

ctx_start, ctx_end #切分後的context內容在 [13, 98](前面為question)

(13, 98)

ansswer 在原文中的位置

In [20]:
ans_start_char = answer['answer_start'][0]
ans_end_char = ans_start_char + len(answer['text'][0])
ans_start_char, ans_end_char

(188, 213)

In [21]:
## 切割後的第1筆資料當作例子
offset = inputs['offset_mapping'][0]
start_idx = 0
end_idx = 0

if (ans_start_char >= offset[ctx_start][0]) and (ans_end_char <= offset[ctx_end][1]): # ans 位置在切割後的context內
    i = ctx_start
    for start_end_char in offset[ctx_start:]:
        start, end = start_end_char
        if start == ans_start_char:
            start_idx = i

        if end == ans_end_char:
            end_idx = i
            break

        i += 1

else:
     print("target is (0, 0)")
     # nothing else to do

start_idx, end_idx

(53, 57)

In [22]:
input_ids = inputs['input_ids'][0]
input_ids[start_idx : end_idx + 1], tokenizer.decode(input_ids[start_idx : end_idx + 1])

([170, 7335, 5921, 1104, 4028], 'a copper statue of Christ')

### 打包成function

In [23]:
def find_answer_token_idx(ctx_start, ctx_end,
                          ans_start_char, ans_end_char,
                          offset):
    start_idx = 0
    end_idx = 0

    if (ans_start_char >= offset[ctx_start][0]) and (ans_end_char <= offset[ctx_end][1]): # ans 位置在切割後的context內
        i = ctx_start
        for start_end_char in offset[ctx_start:]:
            start, end = start_end_char
            if start == ans_start_char:
                start_idx = i

            if end == ans_end_char:
                end_idx = i
                break

            i += 1
    return start_idx, end_idx

In [24]:
start_idxs = []
end_idxs = []

## ans 在原文中的位置
ans_start_char = answer['answer_start'][0]
ans_end_char = ans_start_char + len(answer['text'][0])

for i, offset in enumerate(inputs['offset_mapping']):

    #找到切割後的句子arr中context 位置
    sequence_ids = inputs.sequence_ids(i)
    # print(len(offset), len(sequence_ids))

    ctx_start = sequence_ids.index(1) # first occurrence
    ctx_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1 # last occurrence


    new_s, new_e = find_answer_token_idx(ctx_start, ctx_end,
                                         ans_start_char, ans_end_char,
                                         offset)
    start_idxs.append(new_s)
    end_idxs.append(new_e)

start_idxs, end_idxs

([53, 17, 0, 0], [57, 21, 0, 0])

In [25]:
for i in range(len(start_idxs)):
    input_ids = inputs['input_ids'][i]
    start_idx, end_idx = start_idxs[i], end_idxs[i]

    print(input_ids[start_idx : end_idx + 1],
          tokenizer.decode(input_ids[start_idx : end_idx + 1]))

[170, 7335, 5921, 1104, 4028] a copper statue of Christ
[170, 7335, 5921, 1104, 4028] a copper statue of Christ
[101] [CLS]
[101] [CLS]


## Tokenizer func. for Training set & Val set

In [26]:
# Google used 384 for SQuAD
max_length = 384
stride = 128

def tokenize_fn_train(batch):

    questions = [q.strip() for q in batch['question']]
    inputs = tokenizer(text = questions,
                       text_pair = batch['context'],
                       max_length=max_length, # the maximum length of context
                       truncation='only_second', # 不能truncate question
                       stride=stride, # move forward 50 (overlap 50 chars)
                       return_overflowing_tokens=True,
                       return_offsets_mapping=True,
                       padding='max_length')

    offset_mapping = inputs.pop("offset_mapping")
    orig_sample_idxs = inputs.pop("overflow_to_sample_mapping")
    answers = batch['answers']
    start_idxs, end_idxs = [], []

    for i,  offset in enumerate(offset_mapping):

        ## ans 在原文中的位置
        sample_idx = orig_sample_idxs[i] #用overflow_to_sample_mapping 對照回原資料的id
        answer = answers[sample_idx]

        ans_start_char = answer['answer_start'][0]
        ans_end_char = ans_start_char + len(answer['text'][0])


        #找到切割後的句子arr中context 位置
        sequence_ids = inputs.sequence_ids(i)
        # print(len(offset), len(sequence_ids))

        ctx_start = sequence_ids.index(1) # first occurrence
        ctx_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1 # last occurrence

        # 找到切割後contex內ans的位置，若不包含ans, 回傳 0,0
        new_s, new_e = find_answer_token_idx(ctx_start, ctx_end,
                                             ans_start_char, ans_end_char,
                                             offset)
        start_idxs.append(new_s)
        end_idxs.append(new_e)

    inputs["start_positions"] = start_idxs
    inputs["end_positions"] = end_idxs
    return inputs

In [27]:
train_dataset = raw_datasets['train'].map(tokenize_fn_train,
                                          batched=True,
                                          remove_columns=raw_datasets['train'].column_names)
len(raw_datasets['train']), len(train_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

(87599, 88729)

In [29]:
def tokenize_fn_val(batch):

    questions = [q.strip() for q in batch['question']]
    inputs = tokenizer(text = questions,
                       text_pair = batch['context'],
                       max_length=max_length, # the maximum length of context
                       truncation='only_second', # 不能truncate question
                       stride=stride, # move forward 50 (overlap 50 chars)
                       return_overflowing_tokens=True,
                       return_offsets_mapping=True,
                       padding='max_length')

    orig_sample_idxs = inputs.pop("overflow_to_sample_mapping")
    sample_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = orig_sample_idxs[i]
        sample_ids.append(batch['id'][sample_idx]) #存原本資料的 str id

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [x if sequence_ids[j] == 1 else None for j, x in enumerate(offset)] # mask掉question的部分為None

    inputs['sample_id'] = sample_ids
    return inputs

In [30]:
val_dataset = raw_datasets['validation'].map(tokenize_fn_val,
                                          batched=True,
                                          remove_columns=raw_datasets['validation'].column_names)
len(raw_datasets['validation']), len(val_dataset)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

(10570, 10822)

## Metrics
- model output 是一連串的logits, 需要將model output 對回string

### 用內建的metric

In [31]:
from datasets import load_metric

metric = load_metric("squad")

  metric = load_metric("squad")


Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

In [32]:
## input example & format

## 預測結果須包含id, prediction_text，且每筆資料用dict表示
predicted_answers = [{'id': '1', 'prediction_text': 'Albert Einstein'},
                     {'id': '1', 'prediction_text': 'physicist'},
                     {'id': '3', 'prediction_text': 'general relativity'}]

## 實際值須包含id, answers，answer內在用dict存ans, ans_start
true_answers = [{'id': '1', 'answers': {'text': ['Albert Einstein'], 'answer_start': [100]}},
                {'id': '2', 'answers': {'text': ['physicist'], 'answer_start': [100]}},
                {'id': '3', 'answers': {'text': ['special relativity'], 'answer_start': [100]}}]

metric.compute(predictions=predicted_answers, references=true_answers)

Unanswered question 2 will receive score 0.


{'exact_match': 0.0, 'f1': 16.666666666666668}

### model output: 用HuggingFace Fintune 好的SQuAD model 舉例

In [33]:
# 01 data set
small_validation_dataset = raw_datasets["validation"].select(range(100))

# 02 pretrained tokenizer/model
trained_checkpoint = 'distilbert-base-cased-distilled-squad'
tokenizer2 = AutoTokenizer.from_pretrained(trained_checkpoint)

# 03 tokenize data
old_tokenizer = tokenizer
tokenizer = tokenizer2 #用pretrain tokenizer做

small_validation_processed = small_validation_dataset.map(tokenize_fn_val,
                                                          batched=True,
                                                          remove_columns=raw_datasets["validation"].column_names)
tokenizer = old_tokenizer #換回來

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [34]:
# 04 get the model outputs
import torch
from transformers import AutoModelForQuestionAnswering

small_model_inputs = small_validation_processed.remove_columns(['sample_id', 'offset_mapping'])
small_model_inputs.set_format('torch')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
small_model_inputs_gpu = {k: small_model_inputs[k].to(device) for k in small_model_inputs.column_names}

trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(device) #trained_checkpoint
with torch.inference_mode():
    outputs = trained_model(**small_model_inputs_gpu)

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

### start_logits 跟 end_logits 存ans在每個位置的機率

In [36]:
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ -2.2607,  -5.1783,  -5.2709,  ...,  -9.5243,  -9.5183,  -9.5288],
        [ -2.5961,  -5.5482,  -5.5313,  ...,  -9.9598,  -9.9533,  -9.9860],
        [ -3.7127,  -7.1848,  -8.5388,  ..., -11.6557, -11.6571, -11.6505],
        ...,
        [ -2.0260,  -4.4167,  -4.4980,  ...,  -8.1479,  -8.1530,  -8.1760],
        [ -4.1553,  -5.8304,  -7.1643,  ..., -10.5255, -10.5251, -10.4890],
        [ -3.2000,  -5.8162,  -6.7249,  ...,  -9.4935,  -9.5038,  -9.4871]],
       device='cuda:0'), end_logits=tensor([[ -0.7353,  -4.9236,  -5.1048,  ...,  -8.8734,  -8.8916,  -8.8550],
        [ -1.3056,  -5.3870,  -5.4945,  ...,  -9.4895,  -9.5039,  -9.4959],
        [ -2.7649,  -7.2201,  -9.0916,  ..., -11.3106, -11.3414, -11.2702],
        ...,
        [ -0.0768,  -4.8210,  -4.4374,  ...,  -8.0483,  -8.0502,  -7.9903],
        [ -2.7347,  -5.3650,  -7.2549,  ..., -10.0498, -10.0661,  -9.9886],
        [ -1.0991,  -4.2569,  -6.1267,  ...,  -8

In [35]:
# 移回cpu 做後續運算
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

### context window會的做法一筆資料會expand成多筆，要mapping 對回原本context 才能擷取答案

In [37]:
small_validation_processed['sample_id'][:5] ## 每筆資料獨立的id

['56be4db0acb8001400a502ec',
 '56be4db0acb8001400a502ed',
 '56be4db0acb8001400a502ee',
 '56be4db0acb8001400a502ef',
 '56be4db0acb8001400a502f0']

建立mapping table: {'56be4db0acb8001400a502ec': [0, 1, 2, 3], ...}

In [38]:
from collections import defaultdict

sample_id2idxs = defaultdict(list)
for i, sample_id in enumerate(small_validation_processed['sample_id']):
    sample_id2idxs[sample_id].append(i)

### 取 start_logits + end_logits機率最大的當作答案，並將位置對回文字

In [39]:
n_largest = 20 #只看前20的機率(不然會算很久)
max_answer_length = 30
predicted_answers = []

for sample in small_validation_dataset: ## 原資料集
    sample_id = sample['id']
    context = sample['context']

    ## 最佳分數 ＆ans
    best_score = float('-inf')
    best_answer = None

    for idx in sample_id2idxs[sample_id]: ## 用上一步的mapping去找expand後的資料
        start_logit = start_logits[idx] # (384,) vector
        end_logit = end_logits[idx] # (384,) vector
        offsets = small_validation_processed[idx]['offset_mapping']

        ## 取P(start)*P(end)最大的 -> log(start) + log(end) 最大
        start_indices = (-start_logit).argsort() #descending order
        end_indices = (-end_logit).argsort()

        for start_idx in start_indices[:n_largest]:
            for end_idx in end_indices[:n_largest]:
                ## 先確認是否有不合法的組合
                if offsets[start_idx] is None or offsets[end_idx] is None or end_idx < start_idx or end_idx - start_idx + 1 > max_answer_length:
                    continue


                # 計算分數並更新
                score = start_logit[start_idx] + end_logit[end_idx]
                if score > best_score:
                    best_score = score

                    ## 取得ans 字串
                    first_ch = offsets[start_idx][0]
                    last_ch = offsets[end_idx][1]
                    best_answer = context[first_ch:last_ch]

    # save best answer
    predicted_answers.append({'id': sample_id, 'prediction_text': best_answer})

In [40]:
predicted_answers[0]

{'id': '56be4db0acb8001400a502ec', 'prediction_text': 'Denver Broncos'}

In [41]:
small_validation_dataset['answers'][0]

{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],
 'answer_start': [177, 177, 177]}

In [42]:
## true answer 格式
true_answers = [{'id': x['id'], 'answers': x['answers']} for x in small_validation_dataset]

## 計算
metric.compute(predictions=predicted_answers, references=true_answers)

{'exact_match': 83.0, 'f1': 88.25000000000004}

### 打包 Metrics function

In [43]:
from tqdm.autonotebook import tqdm

def compute_metrics(start_logits, end_logits, processed_dataset, orig_dataset):
    ## 建立 sample to index 對照表
    sample_id2idxs = defaultdict(list)
    for i, sample_id in enumerate(processed_dataset['sample_id']):
        sample_id2idxs[sample_id].append(i)

    predicted_answers = []
    for sample in orig_dataset: ## 原資料集
        sample_id = sample['id']
        context = sample['context']

        ## 最佳分數 ＆ans
        best_score = float('-inf')
        best_answer = None
        for idx in sample_id2idxs[sample_id]: ## 用上一步的mapping去找expand後的資料
            start_logit = start_logits[idx] # (384,) vector
            end_logit = end_logits[idx] # (384,) vector
            offsets = processed_dataset[idx]['offset_mapping']

            ## 取P(start)*P(end)最大的 -> log(start) + log(end) 最大
            start_indices = (-start_logit).argsort() #descending order
            end_indices = (-end_logit).argsort()

            for start_idx in start_indices[:n_largest]:
                for end_idx in end_indices[:n_largest]:
                    ## 先確認是否有不合法的組合
                    if offsets[start_idx] is None or offsets[end_idx] is None or end_idx < start_idx or end_idx - start_idx + 1 > max_answer_length:
                        continue


                    # 計算分數並更新
                    score = start_logit[start_idx] + end_logit[end_idx]
                    if score > best_score:
                        best_score = score

                        ## 取得ans 字串
                        first_ch = offsets[start_idx][0]
                        last_ch = offsets[end_idx][1]
                        best_answer = context[first_ch:last_ch]

        # save best answer
        predicted_answers.append({'id': sample_id, 'prediction_text': best_answer})

    ## true answer 格式
    true_answers = [{'id': x['id'], 'answers': x['answers']} for x in orig_dataset]

    ## 計算
    return metric.compute(predictions=predicted_answers, references=true_answers)

In [44]:
# run our function on the same mini dataset as before
compute_metrics(start_logits,
                end_logits,
                small_validation_processed,
                small_validation_dataset)

{'exact_match': 83.0, 'f1': 88.25000000000004}

## Train and Evaluate
- 用trainer 訓練model

In [45]:
from transformers import TrainingArguments, Trainer

#設定參數
args = TrainingArguments(output_dir = 'distilbert-finetuned-squad',
                         evaluation_strategy = 'no',
                         save_strategy = 'epoch',
                         learning_rate = 2e-5,
                         num_train_epochs = 3,
                         weight_decay = 0.01,
                         fp16 = True)

#create new pretrained model
model_checkpoint = 'distilbert-base-cased'
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)


model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_dataset,
                #   train_dataset=train_dataset.shuffle(seed=42).select(range(1_000)),
                  eval_dataset=val_dataset,
                  tokenizer=tokenizer)
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,3.2717
1000,2.2631
1500,1.99
2000,1.7765
2500,1.679
3000,1.5702
3500,1.5006
4000,1.4514
4500,1.4006
5000,1.4244


TrainOutput(global_step=33276, training_loss=1.045443084437997, metrics={'train_runtime': 4072.5988, 'train_samples_per_second': 65.36, 'train_steps_per_second': 8.171, 'total_flos': 2.608361755366349e+16, 'train_loss': 1.045443084437997, 'epoch': 3.0})

In [47]:
trainer_output = trainer.predict(val_dataset)
trainer_output

PredictionOutput(predictions=(array([[ -7.8085938, -11.078125 , -11.0859375, ..., -11.4140625,
        -11.421875 , -11.4375   ],
       [ -8.296875 , -11.0625   , -11.0703125, ..., -11.3984375,
        -11.4140625, -11.421875 ],
       [ -8.0703125, -10.765625 , -10.8828125, ..., -11.4765625,
        -11.4765625, -11.4765625],
       ...,
       [ -5.5703125, -11.2265625, -11.4765625, ..., -11.4140625,
        -11.4296875, -11.453125 ],
       [ -3.6269531, -11.0234375, -11.015625 , ..., -11.5      ,
        -11.4921875, -11.4921875],
       [ -4.8632812, -11.1796875, -11.2890625, ..., -11.4375   ,
        -11.4453125, -11.4765625]], dtype=float32), array([[ -6.8125   , -10.875    , -10.8515625, ..., -11.5      ,
        -11.5      , -11.484375 ],
       [ -7.3164062, -10.8515625, -10.8125   , ..., -11.484375 ,
        -11.484375 , -11.46875  ],
       [ -7.7734375, -11.3359375, -11.328125 , ..., -11.5      ,
        -11.5      , -11.4921875],
       ...,
       [ -4.6132812, -10.9921

In [48]:
predictions, _, _ = trainer_output
predictions

(array([[ -7.8085938, -11.078125 , -11.0859375, ..., -11.4140625,
         -11.421875 , -11.4375   ],
        [ -8.296875 , -11.0625   , -11.0703125, ..., -11.3984375,
         -11.4140625, -11.421875 ],
        [ -8.0703125, -10.765625 , -10.8828125, ..., -11.4765625,
         -11.4765625, -11.4765625],
        ...,
        [ -5.5703125, -11.2265625, -11.4765625, ..., -11.4140625,
         -11.4296875, -11.453125 ],
        [ -3.6269531, -11.0234375, -11.015625 , ..., -11.5      ,
         -11.4921875, -11.4921875],
        [ -4.8632812, -11.1796875, -11.2890625, ..., -11.4375   ,
         -11.4453125, -11.4765625]], dtype=float32),
 array([[ -6.8125   , -10.875    , -10.8515625, ..., -11.5      ,
         -11.5      , -11.484375 ],
        [ -7.3164062, -10.8515625, -10.8125   , ..., -11.484375 ,
         -11.484375 , -11.46875  ],
        [ -7.7734375, -11.3359375, -11.328125 , ..., -11.5      ,
         -11.5      , -11.4921875],
        ...,
        [ -4.6132812, -10.9921875, -10.

In [49]:
len(start_logits), len(end_logits), len(val_dataset), len(raw_datasets["validation"])

(100, 100, 10822, 10570)

In [50]:
start_logits, end_logits = predictions

In [51]:
compute_metrics(start_logits,
                end_logits,
                val_dataset, # processed
                raw_datasets["validation"])

{'exact_match': 77.39829706717124, 'f1': 85.28333574968637}

## Save Model & predict

In [52]:
from transformers import pipeline

trainer.save_model('my_qa_model')

qa_model = pipeline(task = 'question-answering',
              model='my_qa_model',
              device=0)

In [54]:
context = "Today I went to the store to purchase a carton of milk."
question = "What did I buy?"

qa_model(context=context, question=question)

{'score': 0.5270168781280518,
 'start': 38,
 'end': 54,
 'answer': 'a carton of milk'}