# Import libs

In [2]:
import numpy as np
from tqdm.auto import tqdm
import collections
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments
from transformers import Trainer
import evaluate

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")




In [None]:
from huggingface_hub import notebook_login
notebook_login()

# Building Reader

# Configure

In [3]:
MODEL_NAME = "distilbert-base-uncased"
MAX_LENGTH = 384
STRIDE = 128

# Setup Datasets

In [4]:
DATASET_NAME = 'squad_v2'
raw_datasets = load_dataset(DATASET_NAME)

In [5]:
raw_datasets['train'][1]

{'id': '56be85543aeaaa14008c9065',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'What areas did Beyonce compete in when she was growing up?',
 'answers': {'text': ['singing and dancing'], 'answer_start': [207]}}

In [6]:
raw_datasets['validation'][22]['answers']

{'text': ['King Charles III', 'King Charles III', 'King Charles III'],
 'answer_start': [324, 324, 324]}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



# Preprocessing data

In [10]:
def preprocess_training_examples(examples):
    # Get questions from examples
    # and remove redundant spaces
    questions = [q.strip() for q in examples["question"]]

    # tokenize input data
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        stride=STRIDE,
        padding="max_length",
    )
    # Extract offset mappings from inputs
    # then pop it from inputs
    offset_mapping = inputs.pop("offset_mapping")

    # Extract sample mappings from inputs
    # then pop it from inputs
    sample_map = inputs.pop("overflow_to_sample_mapping")

    # get answers from examples
    answers = examples["answers"]

    # Initiate start end stop answer position list
    start_positions = []
    end_positions = []

    # Loop through offset_mapping
    for i, offset in enumerate(offset_mapping):
        # identify index of sample relate to the current offset
        sample_idx = sample_map[i]
        # get sequence_ids from input
        sequence_ids = inputs.sequence_ids(i)
        # Get start and end position of context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # get answer for this sample
        answer = answers[sample_idx]
        if len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])
            # if the answer is not in the context
            if (
                offset[context_start][0] > start_char
                or offset[context_end][1] < end_char
            ):
                start_positions.append(0)
                end_positions.append(0)
            else:
                # else set the start and end position
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)
                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

    # adding start, end position to inputs
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [11]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [111]:
len(raw_datasets["train"]), len(train_dataset)

(130319, 131754)

In [12]:
def preprocess_validation_examples(examples):
    # Get questions from examples
    # and remove redundant spaces
    questions = [q.strip() for q in examples["question"]]

    # tokenize input data
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        stride=STRIDE,
        padding="max_length",
    )

    # Extract sample mappings from inputs
    # then pop it from inputs
    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    # Xác định ví dụ tham chi ếu cho mỗi dòng đầu vào và
    # điều chỉnh ánh xạ offset
    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        # Loại bỏ các offset không phù hợp với sequence_ids
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]
    # Thêm thông tin ví dụ tham chi ếu vào đầu vào
    inputs["example_id"] = example_ids
    return inputs

In [13]:
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
# In ra độ dài của raw_datasets [" validation "]
# và validation_dataset để so sánh.
len(raw_datasets["validation"]), len(validation_dataset)

(11873, 12134)

# Training

In [14]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [122]:
validation_dataset[120].items()

dict_items([('input_ids', [101, 2040, 2001, 5545, 1005, 1055, 2567, 1029, 102, 2028, 1997, 1996, 4366, 11390, 1997, 1996, 2394, 6106, 10078, 2520, 1996, 25466, 1010, 9586, 2012, 16001, 2075, 1010, 2776, 6783, 2000, 3885, 1012, 2332, 8861, 3523, 1997, 3885, 2496, 9586, 1005, 1055, 2905, 5545, 1010, 1998, 2234, 2046, 4559, 2000, 2520, 2040, 2018, 2525, 11621, 3885, 1005, 1055, 2670, 6645, 1012, 2520, 10836, 3885, 1999, 10550, 2475, 1010, 5559, 2004, 2521, 2004, 14863, 26573, 10536, 2073, 2002, 2777, 2039, 2007, 2010, 4170, 1997, 3719, 1012, 8861, 7864, 1010, 3825, 14822, 2000, 2520, 1998, 10795, 2010, 2365, 7343, 2004, 1037, 13446, 1010, 2927, 1037, 2186, 1997, 9918, 2004, 2000, 3251, 1996, 4104, 4410, 12232, 14588, 2000, 1996, 2332, 1997, 2563, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [15]:
args = TrainingArguments(
    output_dir="distilbert-finetuned-squadv2",  
    evaluation_strategy="no",  # Chế độ đánh giá không tự động sau mỗi epoch
    save_strategy="epoch",  # Lưu checkpoint sau mỗi epoch
    learning_rate=2e-5,  # Tốc độ học
    num_train_epochs=5,  # Số epoch huấn luyện
    weight_decay=0.01,  # Giảm trọng lượng mô hình để tránh overfitting
    fp16=True,  # Sử dụng kiểu dữ liệu half - precision để tối ưu tài nguyên
    push_to_hub=True,  # Đẩy kết quả huấn luyện lên HuggingFace Hub
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4
)



In [None]:
# Khởi tạo một đối tượng Trainer để huấn luyện mô hình
trainer = Trainer(
    model=model,  # Sử dụng mô hình đã tạo trước đó
    args=args,  # Các tham số và cấu hình huấn luy ện
    train_dataset=train_dataset,  # Sử dụng tập dữ liệu huấn luyện
    eval_dataset=validation_dataset,  # Sử dụng tập dữ liệu đánh giá
    tokenizer=tokenizer,  # Sử dụng tokenizer để xử lý văn bản
)
# Bắt đầu quá trình huấn luy ện
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/5145 [00:00<?, ?it/s]

{'loss': 2.254, 'grad_norm': 7.672187805175781, 'learning_rate': 1.805636540330418e-05, 'epoch': 0.49}
{'loss': 1.5106, 'grad_norm': 8.735021591186523, 'learning_rate': 1.611273080660836e-05, 'epoch': 0.97}
{'loss': 1.3021, 'grad_norm': 8.502686500549316, 'learning_rate': 1.4169096209912539e-05, 'epoch': 1.46}
{'loss': 1.2261, 'grad_norm': 5.810998439788818, 'learning_rate': 1.2225461613216716e-05, 'epoch': 1.94}
{'loss': 1.0953, 'grad_norm': 9.021493911743164, 'learning_rate': 1.0285714285714285e-05, 'epoch': 2.43}
{'loss': 1.0653, 'grad_norm': 6.346558094024658, 'learning_rate': 8.342079689018465e-06, 'epoch': 2.91}
{'loss': 0.9865, 'grad_norm': 6.091553688049316, 'learning_rate': 6.3984450923226434e-06, 'epoch': 3.4}
{'loss': 0.9563, 'grad_norm': 6.82282829284668, 'learning_rate': 4.454810495626822e-06, 'epoch': 3.89}
{'loss': 0.9044, 'grad_norm': 8.198391914367676, 'learning_rate': 2.5150631681243924e-06, 'epoch': 4.37}
{'loss': 0.8978, 'grad_norm': 7.069595813751221, 'learning_rat

TrainOutput(global_step=5145, training_loss=1.2107311371116527, metrics={'train_runtime': 2725.1589, 'train_samples_per_second': 241.736, 'train_steps_per_second': 1.888, 'total_flos': 6.452355606320333e+16, 'train_loss': 1.2107311371116527, 'epoch': 4.997571636716853})

In [None]:
trainer.push_to_hub(commit_message="Reader_Squadv2")

# Evaluation

In [18]:
metric = evaluate.load("squad_v2")


In [19]:
N_BEST = 20  # Số lượng kết quả tốt nhất được lựa chọn sau khi dự đoán
MAX_ANS_LENGTH = 30  # Độ dài tối đa cho câu trả lời dự đoán


def compute_metrics(start_logits, end_logits, features, examples):
    # Tạo một từ điển mặc định để ánh xạ mỗi ví dụ
    # với danh sách các đặc trưng tương ứng
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)
        predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []
        # Lặp qua tất cả các đặc trưng liên quan đến ví dụ đó
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]
            # Lấy các chỉ số có giá trị lớn nhất cho start và end logits
            start_indexes = np.argsort(start_logit)[-1 : -N_BEST - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -N_BEST - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Bỏ qua các câu trả lời
                    # không hoàn toàn nằm trong ngữ cảnh
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Bỏ qua các câu trả lời có độ dài > max_answer_length
                    if end_index - start_index + 1 > MAX_ANS_LENGTH:
                        continue
                    # Tạo một câu trả lời mới
                    text = context[offsets[start_index][0] : offsets[end_index][1]]
                    logit_score = start_logit[start_index] + end_logit[end_index]
                    answer = {
                        "text": text,
                        "logit_score": logit_score,
                    }
                    answers.append(answer)
        # Chọn câu trả lời có điểm số tốt nhất
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            answer_dict = {
                "id": example_id,
                "prediction_text": best_answer["text"],
                "no_answer_probability": 1 - best_answer["logit_score"],
            }
        else:
            answer_dict = {
                "id": example_id,
                "prediction_text": "",
                "no_answer_probability": 1.0,
            }
        predicted_answers.append(answer_dict)
    # Tạo danh sách câu trả lời lý thuyết từ các ví dụ
    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    # Sử dụng metric.compute để tính toán các độ đo và trả về kết quả
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [20]:
# Thực hiện dự đoán trên tập dữ liệu validation
predictions, _, _ = trainer.predict(validation_dataset)
# Lấy ra thông tin về các điểm bắt đầu và
# điểm kết thúc của câu trả lời dự đoán
start_logits, end_logits = predictions
# Tính toán các chỉ số đánh giá sử dụng hàm compute_metrics
results = compute_metrics(
    start_logits, end_logits, validation_dataset, raw_datasets["validation"]
)
results

  0%|          | 0/380 [00:00<?, ?it/s]

  0%|          | 0/11873 [00:00<?, ?it/s]

{'exact': 42.727196159353156,
 'f1': 47.19397740890768,
 'total': 11873,
 'HasAns_exact': 71.64304993252361,
 'HasAns_f1': 80.5894220269839,
 'HasAns_total': 5928,
 'NoAns_exact': 13.894028595458368,
 'NoAns_f1': 13.894028595458368,
 'NoAns_total': 5945,
 'best_exact': 62.966394340099384,
 'best_exact_thresh': -11.00390625,
 'best_f1': 64.68519386646281,
 'best_f1_thresh': -10.98828125}

# Building Retriever


In [24]:
DATASET_NAME = 'squad_v2'
raw_datasets = load_dataset(DATASET_NAME,split ='train+validation')

In [25]:
raw_datasets = raw_datasets.filter(lambda x: len(x['answers']['text']) > 0)

In [26]:
raw_datasets

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 92749
})

In [None]:
MODEL_NAME= 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

In [28]:
def get_embeddings(text_input):
    inputs = tokenizer(text_input, return_tensors="pt", padding=True, truncation=True)
    encoded_input = {key: value.to(device) for key, value in inputs.items()}
    model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0]

In [None]:
EMBEDDING_COLUMN = 'question_embedding'
embedding_dataset = raw_datasets.map(lambda x: {EMBEDDING_COLUMN: get_embeddings(x['question']).detach().cpu().numpy()[0]})

In [None]:
import faiss

embedding_dataset.add_faiss_index(column=EMBEDDING_COLUMN)

In [31]:
input_question = "Why is this a bad practice?"
input_quest_embedding = get_embeddings([input_question])
input_quest_embedding = input_quest_embedding.cpu().detach().numpy()
TOP_K = 3
scores, samples = embedding_dataset.get_nearest_examples(
    EMBEDDING_COLUMN, input_quest_embedding, k=TOP_K
)

for idx, score in enumerate(scores):
    print(f"Top {idx + 1}\tScore : {score}")
    print(f'Question: {samples ["question"][ idx ]}')
    print(f'Context: {samples ["context"][ idx ]}')
    print()

Top 1	Score : 0.0
Question: Why is this a bad practice?
Context: However, performance enhancements cannot be generalized as the benefits and limitations of the system are dependent on many factors. One problem is that the system is subject to gaming. Sometimes, one person enters the destination for a large group of people going to the same floor. The dispatching algorithm is usually unable to completely cater for the variation, and latecomers may find the elevator they are assigned to is already full. Also, occasionally, one person may press the floor multiple times. This is common with up/down buttons when people believe this to be an effective way to hurry elevators. However, this will make the computer think multiple people are waiting and will allocate empty cars to serve this one person.

Top 2	Score : 4.230165481567383
Question: Why would one want to give a speech?
Context: Some civil disobedience defendants choose to make a defiant speech, or a speech explaining their actions, i

## Reader-Retriever QA

In [3]:
from transformers import pipeline

PIPELINE_NAME = "question-answering"
MODEL_NAME = "binhphap5/distilbert-finetuned-squadv2"
pipe = pipeline(PIPELINE_NAME, model=MODEL_NAME, device=device)

## FAISS

In [None]:
print(f"Input question : {input_question}")
for idx, score in enumerate(scores):
    question = samples["question"][idx]
    context = samples["context"][idx]
    answer = pipe(question=question, context=context)
    print(f"Top {idx + 1}\t Score : {score}")
    print(f"Context: {context}")
    print(f"Answer: {answer}")
    print()

Input question : Why is this a bad practice?
Top 1	 Score : 0.0
Context: However, performance enhancements cannot be generalized as the benefits and limitations of the system are dependent on many factors. One problem is that the system is subject to gaming. Sometimes, one person enters the destination for a large group of people going to the same floor. The dispatching algorithm is usually unable to completely cater for the variation, and latecomers may find the elevator they are assigned to is already full. Also, occasionally, one person may press the floor multiple times. This is common with up/down buttons when people believe this to be an effective way to hurry elevators. However, this will make the computer think multiple people are waiting and will allocate empty cars to serve this one person.
Answer: {'score': 0.036891911178827286, 'start': 622, 'end': 683, 'answer': 'this will make the computer think multiple people are waiting'}

Top 2	 Score : 4.230165481567383
Context: Some

## duckduckgo search api

In [46]:
from duckduckgo_search import DDGS

def get_context_from_duckduckgo(question):
    results = DDGS().text(question, max_results=2)
    snippets = [result['body'] for result in results]
    context = " ".join(snippets)
    return context

In [47]:
question = "Who is Donald Trump ?"
context = get_context_from_duckduckgo(question)

result = pipe(question=question, context=context)
print(result)

{'score': 0.14308825135231018, 'start': 42, 'end': 100, 'answer': 'an American politician, media personality, and businessman'}
