In [1]:
from datasets import load_dataset

dataset = load_dataset("lucadiliello/newsqa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
train_ds = dataset["train"]
val_ds=dataset["validation"]

In [3]:
!pip install simpletransformers



In [4]:
import logging

from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs


In [5]:
model_type = "bert"  # general type: "bert", "roberta", etc.

if model_type == "bert":
    model_name = "bert-base-cased"
elif model_type == "roberta":
    model_name = "roberta-base"
elif model_type == "distilbert":
    model_name = "distilbert-base-cased"
elif model_type == "distilroberta":
    model_name = "distilroberta-base"
elif model_type == "electra":
    model_name = "google/electra-base-discriminator"
else:
    raise ValueError(f"Unknown model_type: {model_type}")


In [6]:
model_type = "roberta"
model_name = "deepset/roberta-base-squad2"


In [7]:
# Configure the model
model_args = QuestionAnsweringArgs()
model_args.train_batch_size = 16
model_args.evaluate_during_training = True
model_args.eval_best_size = 3
model_args.num_train_epochs = 5


In [8]:
!pip install wandb



In [18]:
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
    "output_dir": f"outputs/{model_type}/best_model",
    "best_model_dir": f"outputs/{model_type}/best_model",
    "save_model_every_epoch": True,
    "num_train_epochs": 15,
    "max_seq_length": 128,
    "train_batch_size": 128,  # final batch size
    "eval_batch_size": 64,
    "evaluate_during_training_steps": 1000,
    "wandb_project": "Question Answer Application",
    "wandb_kwargs": {"name": model_name},
    "use_early_stopping": True,
    "early_stopping_metric": "mcc",
    "early_stopping_patience": 3,
    "use_multiprocessing": False,
    # "config": {"output_hidden_states": True}
}


In [19]:
model = QuestionAnsweringModel(
    model_type, model_name, args=train_args
)


In [11]:
import json

def format_newsqa_for_simpletransformers(dataset):
    formatted_data = []
    for sample in dataset:
        context = sample["context"]
        question = sample["question"]
        qid = str(sample.get("key", None)) # Use 'key' if available, otherwise None
        answers_field = sample.get("answers", [])

        qas = []
        if answers_field:
            # Assuming answers_field is a list of lists, like [['text']]
            # or a list of strings, like ['text']
            # Or a dictionary with 'answer_start' and 'text'
            if isinstance(answers_field, dict) and 'answer_start' in answers_field and 'text' in answers_field:
                 # Handle the case where 'answers' is a dictionary
                 answer_texts = answers_field['text']
                 answer_starts = answers_field['answer_start']
                 if not isinstance(answer_texts, list):
                     answer_texts = [answer_texts]
                     answer_starts = [answer_starts]

                 answers_list = []
                 for text, start in zip(answer_texts, answer_starts):
                     # Calculate answer_end
                     end = start + len(text)
                     answers_list.append({"text": text, "answer_start": start, "answer_end": end})

                 qas.append({
                    "question": question,
                    "id": qid if qid else str(len(formatted_data)), # Generate an ID if 'key' is None
                    "is_impossible": False,
                    "answers": answers_list
                })
            elif isinstance(answers_field, list):
                # Handle the case where 'answers' is a list (of lists or strings)
                flat_answers = []
                for a in answers_field:
                    if isinstance(a, list):
                        flat_answers.extend(a)
                    else:
                        flat_answers.append(a)

                answers_list = []
                for text in flat_answers:
                     # Find the start index of the answer in the context
                    start = context.find(text)
                    if start != -1:
                        end = start + len(text)
                        answers_list.append({"text": text, "answer_start": start, "answer_end": end})
                    # else: Handle cases where the answer text is not found exactly in the context
                    # For now, we'll skip answers that can't be located

                if answers_list:
                    qas.append({
                        "question": question,
                        "id": qid if qid else str(len(formatted_data)), # Generate an ID if 'key' is None
                        "is_impossible": False,
                        "answers": answers_list
                    })
                else:
                     # If no answers could be located, mark as impossible
                     qas.append({
                        "question": question,
                        "id": qid if qid else str(len(formatted_data)),
                        "is_impossible": True,
                        "answers": []
                    })

        # If no answers_field exists or is empty
        if not qas:
             qas.append({
                "question": question,
                "id": qid if qid else str(len(formatted_data)),
                "is_impossible": True,
                "answers": []
            })


        formatted_data.append({
            "context": context.strip(),
            "qas": qas
        })

    return formatted_data

train_data_formatted_corrected = format_newsqa_for_simpletransformers(train_ds)
test_data_formatted_corrected = format_newsqa_for_simpletransformers(val_ds)

print("Correctly formatted training data sample:")
print(json.dumps(train_data_formatted_corrected[100], indent=2, ensure_ascii=False))

Correctly formatted training data sample:
{
  "qas": [
    {
      "question": "how many attacks have been done since July?",
      "id": "3c448809c2c94ece9036b9be5d013fe0",
      "is_impossible": false,
      "answers": [
        {
          "text": "12",
          "answer_start": 839,
          "answer_end": 841
        }
      ]
    }
  ]
}


In [13]:
# For HuggingFace Dataset objects
NUM_TRAIN_SAMPLES = 2000
NUM_VAL_SAMPLES = 500

train_ds_small = train_ds.select(range(NUM_TRAIN_SAMPLES))
val_ds_small = val_ds.select(range(NUM_VAL_SAMPLES))

train_data_formatted_small = format_newsqa_for_simpletransformers(train_ds_small)
test_data_formatted_small = format_newsqa_for_simpletransformers(val_ds_small)


In [21]:
# Now use the correctly formatted data for training
model.train_model(train_data_formatted_small, eval_data=test_data_formatted_small)

100%|██████████| 2000/2000 [00:21<00:00, 92.98it/s]
add example index and unique id: 100%|██████████| 2000/2000 [00:00<00:00, 723842.26it/s]


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

0,1
Training loss,█▁
correct,▁
eval_loss,▁
global_step,▁▇█
incorrect,▁
lr,█▁
similar,▁
train_loss,▁

0,1
Training loss,0.8251
correct,225.0
eval_loss,-7.26576
global_step,109.0
incorrect,96.0
lr,3e-05
similar,179.0
train_loss,1.38894


  scaler = amp.GradScaler()


Running Epoch 1 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 3 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 4 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 5 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 6 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 7 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 8 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 9 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 10 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 11 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 12 of 15:   0%|          | 0/35 [00:02<?, ?it/s]

Running Epoch 13 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 14 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

Running Epoch 15 of 15:   0%|          | 0/35 [00:00<?, ?it/s]

(525, 0.2118832277329195)

In [22]:
# Evaluate the model
result, text_predictions = model.eval_model(test_data_formatted_small)

print("Evaluation Results:")
print(result)

convert squad examples to features: 100%|██████████| 500/500 [00:05<00:00, 99.82it/s]
add example index and unique id: 100%|██████████| 500/500 [00:00<00:00, 352521.77it/s]


Running Evaluation:   0%|          | 0/18 [00:00<?, ?it/s]

  with amp.autocast():


Evaluation Results:
{'correct': 163, 'similar': 256, 'incorrect': 81, 'eval_loss': -9.581597222222221}
