In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers
from transformers import (TrainingArguments,
                          pipeline,
                          DistilBertForQuestionAnswering,
                          DistilBertTokenizer,
                          DistilBertTokenizerFast,
                          AutoTokenizer,
                          AutoModelForQuestionAnswering,
                          Trainer,
                          logging)
from datasets import Dataset, load_dataset
import evaluate
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [4]:
df_train = pd.read_csv("/content/drive/MyDrive/data/train.csv")
df_test = pd.read_csv("/content/drive/MyDrive/data/test.csv")

In [5]:
import re
def clean_answer(answer):
    # Remove annotations like <<>> and final '#### ...'
    answer = re.sub(r'<<.*?>>', '', answer)
    answer = re.sub(r'####.*', '', answer)
    return answer.strip()

df_train['clean_answer'] = df_train['answer'].apply(clean_answer)
df_test['clean_answer'] = df_test['answer'].apply(clean_answer)

# Split answers into stepwise reasoning
def split_steps(answer):
    return answer.split('\n')

df_train['steps'] = df_train['clean_answer'].apply(split_steps)
df_test['steps'] = df_test['clean_answer'].apply(split_steps)

# 3. Prepare dataset for BERT input
def create_input_output_pairs(row):
    return {
        'question': row['question'],
        'answer': row['clean_answer']
    }

data_pairs_train = df_train.apply(create_input_output_pairs, axis=1).tolist()
data_pairs_test = df_test.apply(create_input_output_pairs, axis=1).tolist()

print(data_pairs_train[0])
print(data_pairs_test[0])

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 'Natalia sold 48/2 = 24 clips in May.\nNatalia sold 48+24 = 72 clips altogether in April and May.'}
{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", 'answer': 'Janet sells 16 - 3 - 4 = 9 duck eggs a day.\nShe makes 9 * 2 = $18 every day at the farmer’s market.'}


In [6]:
def transform_data(data_list):
    transformed_data = []

    for data in data_list:
        try:
            # Tìm câu hỏi chính
            question_sentences = data["question"].strip().split('.')
            last_question = question_sentences[-1].strip() + '?' if question_sentences[-1] else question_sentences[-2].strip() + '?'

            # Combine remaining question sentences (without the last sentence) with the answer
            combined_context = ". ".join(question_sentences[:-1]).strip()
            if combined_context:
                combined_context += ". "
            combined_context += data["answer"]

            all_numbers = re.finditer(r"\b\d+\b", data["answer"])

            # Tìm số có liên quan nhất với câu hỏi dựa trên ý nghĩa
            relevant_number = None
            relevant_index = None
            for match in all_numbers:
                number = match.group()
                start_index = match.start()

                # Áp dụng logic bổ sung nếu cần chọn số phù hợp nhất
                # Trong trường hợp này, chúng ta lấy số xuất hiện cuối cùng
                relevant_number = number
                relevant_index = start_index

            if relevant_number is None or relevant_index is None:
                raise ValueError("No valid numeric answer found.")

            # Chuẩn hóa dữ liệu
            transformed_data.append({
                "question": last_question.strip(),
                "context": combined_context.strip(),
                "answers": {
                    "answer_start": [relevant_index],
                    "text": [relevant_number.strip()]
                }
            })
        except Exception as e:
            print(f"Skipping entry due to error: {e}")
            continue

    return transformed_data

dataset_train_transform = transform_data(data_pairs_train)
dataset_test_transform = transform_data(data_pairs_test)
print(dataset_train_transform[0])
print(dataset_test_transform[0])



{'question': 'How many clips did Natalia sell altogether in April and May??', 'context': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. Natalia sold 48/2 = 24 clips in May.\nNatalia sold 48+24 = 72 clips altogether in April and May.', 'answers': {'answer_start': [58], 'text': ['72']}}
{'question': "How much in dollars does she make every day at the farmers' market??", 'context': "Janet’s ducks lay 16 eggs per day.  She eats three for breakfast every morning and bakes muffins for her friends every day with four.  She sells the remainder at the farmers' market daily for $2 per fresh duck egg. Janet sells 16 - 3 - 4 = 9 duck eggs a day.\nShe makes 9 * 2 = $18 every day at the farmer’s market.", 'answers': {'answer_start': [63], 'text': ['18']}}


In [7]:
# Convert to Hugging Face Datasets
dataset_train = Dataset.from_list(dataset_train_transform)
dataset_test = Dataset.from_list(dataset_test_transform)

print(dataset_train)
print(dataset_test)

Dataset({
    features: ['question', 'context', 'answers'],
    num_rows: 14946
})
Dataset({
    features: ['question', 'context', 'answers'],
    num_rows: 2638
})


In [8]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


# Tokenize datasets
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
tokenized_dataset_train = dataset_train.map(preprocess_function, batched=True)
tokenized_dataset_test = dataset_test.map(preprocess_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/14946 [00:00<?, ? examples/s]

Map:   0%|          | 0/2638 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# 4. Define and train BERT model
model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    eval_steps=500,  # Match the evaluation frequency to save steps
    learning_rate=2e-5,
    warmup_steps=500,  # Warmup for learning rate scheduling
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    num_train_epochs=5,  # Increased epochs for better convergence
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    load_best_model_at_end=True  # Load the best model at the end of training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator
)
# Train the model
print("\nStarting training...")
trainer.train()

# Save the model
model.save_pretrained("bert_gsm8k_qa")
tokenizer.save_pretrained("bert_gsm8k_qa")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,3.9307,3.867809
1000,3.4568,3.633137
1500,3.0662,3.199519
2000,2.8647,3.066055
2500,2.835,2.999392
3000,2.4876,3.077539
3500,2.6738,2.85541
4000,2.3886,2.830517
4500,2.3981,2.896764


('bert_gsm8k_qa/tokenizer_config.json',
 'bert_gsm8k_qa/special_tokens_map.json',
 'bert_gsm8k_qa/vocab.txt',
 'bert_gsm8k_qa/added_tokens.json',
 'bert_gsm8k_qa/tokenizer.json')

In [18]:
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="/content/bert_gsm8k_qa")

# Bài toán từ GSM8K
context = """Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. Natalia sold 48/2 = 24 clips in May.\nNatalia sold 48+24 = 72 clips altogether in April and May.
"""
question = "How many clips did Natalia sell altogether in April and May?"


# Thực hiện suy luận
result = qa_pipeline(question=question, context=context)

# In kết quả
print(f"Question: {question}")
print(f"Answer: {result['answer']}")
print(f"Score: {result['score']:.2f}")

Device set to use cuda:0


Question: How many clips did Natalia sell altogether in April and May?
Answer: sold
Score: 0.08
