`1. Install & Import Packages`

In [1]:
!pip install -q transformers datasets evaluate accelerate "huggingface-hub>=0.16.0" pandas scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import evaluate
import numpy as np
import collections
from tqdm.auto import tqdm

`2. Load and Explore Dataset`

In [None]:
squad_dataset = load_dataset("squad", "plain_text")

train_df = squad_dataset["train"].to_pandas()
val_df   = squad_dataset["validation"].to_pandas()

train_df.sample(3)[["context", "question", "answers"]]

In [4]:
train_df["context_len"]  = train_df["context"].str.len()
train_df["question_len"] = train_df["question"].str.len()
train_df["answer_len"]   = train_df["answers"].apply(lambda x: len(x["text"][0]))

train_df[["context_len", "question_len", "answer_len"]].describe()

Unnamed: 0,context_len,question_len,answer_len
count,87599.0,87599.0,87599.0
mean,754.364216,59.571137,20.147273
std,307.396264,89.027556,21.597341
min,151.0,1.0,1.0
25%,559.0,44.0,7.0
50%,693.0,56.0,14.0
75%,895.0,71.0,23.0
max,3706.0,25651.0,239.0


`3. Tokenization and Data Preprocessing`

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.save_pretrained("./squad_tokenizer")

max_length = 384
doc_stride = 128

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

`4. Prepare Training Features`

In [6]:
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char   = start_char + len(answers["text"][0])

            token_start_index = 0
            while tokenized_examples.sequence_ids(i)[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while tokenized_examples.sequence_ids(i)[token_end_index] != 1:
                token_end_index -= 1

            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"]   = end_positions
    return tokenized_examples

In [7]:
tokenized_squad = squad_dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=squad_dataset["train"].column_names
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

`5. Prepare Validation Features for Post-processing`

In [8]:
def prepare_validation_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples["offset_mapping"]

    tokenized_examples["example_id"] = []

    for i in range(len(offset_mapping)):
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        sequence_ids = tokenized_examples.sequence_ids(i)
        new_offset = []
        for k, o in enumerate(offset_mapping[i]):
            if sequence_ids[k] != 1:
                new_offset.append(None)
            else:
                new_offset.append(o)
        tokenized_examples["offset_mapping"][i] = new_offset

    return tokenized_examples

In [9]:
validation_features = squad_dataset["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=squad_dataset["validation"].column_names
)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

`6. Model Initialization & Training`

In [10]:
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
# Load squad metric
squad_metric = evaluate.load("squad")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [12]:
training_args = TrainingArguments(
    output_dir="./qa_model",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True
)


In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer
)


  trainer = Trainer(


In [14]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m202101657[0m ([33m202101657-pua[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.1238,1.123659
2,0.7974,1.144566


TrainOutput(global_step=22132, training_loss=1.1374917849911803, metrics={'train_runtime': 2646.4324, 'train_samples_per_second': 66.901, 'train_steps_per_second': 8.363, 'total_flos': 1.7348902540849152e+16, 'train_loss': 1.1374917849911803, 'epoch': 2.0})

`7. Post-processing & Evaluation`

In [20]:
def postprocess_qa_predictions(examples, features, all_start_logits, all_end_logits,
                               n_best_size=20, max_answer_length=30):
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, f in enumerate(features):
        features_per_example[f["example_id"]].append(i)

    predictions = collections.OrderedDict()

    for example_id, feature_indices in tqdm(features_per_example.items(), desc="Postprocessing"):
        example = examples[example_id_to_index[example_id]]
        context = example["context"]

        valid_answers = []
        min_null_score = None

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]

            try:
                cls_index = input_ids.index(tokenizer.cls_token_id)
            except ValueError:
                cls_index = 0

            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1: -n_best_size - 1: -1].tolist()
            end_indexes = np.argsort(end_logits)[-1: -n_best_size - 1: -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index >= len(offsets) or end_index >= len(offsets):
                        continue
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue

                    start_char = offsets[start_index][0]
                    end_char = offsets[end_index][1]
                    answer_text = context[start_char: end_char]
                    score = start_logits[start_index] + end_logits[end_index]
                    valid_answers.append({
                        "score": float(score),
                        "text": answer_text
                    })

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
            final_text = best_answer["text"]
        else:
            final_text = ""

        # Modify the output format to match the expected format for squad_metric.compute
        predictions[example_id] = [{"prediction_text": final_text}]

    return predictions

In [21]:
# Get raw predictions from model
raw_pred = trainer.predict(validation_features)
all_start_logits, all_end_logits = raw_pred.predictions

In [22]:
# Run post-processing
predictions = postprocess_qa_predictions(
    examples=squad_dataset["validation"],
    features=validation_features,
    all_start_logits=all_start_logits,
    all_end_logits=all_end_logits
)


Postprocessing:   0%|          | 0/10570 [00:00<?, ?it/s]

In [23]:
# Build references
references = []
for ex in squad_dataset["validation"]:
    references.append({
        "id": ex["id"],
        "answers": {
            "text": ex["answers"]["text"],
            "answer_start": ex["answers"]["answer_start"]
        }
    })

In [27]:
# Convert predictions dict {id: text} to list of dicts
predictions_list = [{"id": k, "prediction_text": v[0]["prediction_text"]} for k, v in predictions.items()]

# References لازم تفضل زي ما هي لكن تأكد إن الـ text و answer_start قايمة
references_list = [
    {"id": ex["id"], "answers": {"text": ex["answers"]["text"], "answer_start": ex["answers"]["answer_start"]}}
    for ex in squad_dataset["validation"]
]

# Compute metrics
results = squad_metric.compute(predictions=predictions_list, references=references_list)
print("Evaluation Results:", results)

Evaluation Results: {'exact_match': 77.77672658467361, 'f1': 85.78939628304471}


`8. Save Model and Tokenizer`

In [29]:
tokenizer.save_pretrained("./qa_model_tokenizer")
model.save_pretrained("./qa_model")

In [30]:
!zip -r qa_model.zip qa_model qa_model_tokenizer

  adding: qa_model/ (stored 0%)
  adding: qa_model/config.json (deflated 43%)
  adding: qa_model/checkpoint-22000/ (stored 0%)
  adding: qa_model/checkpoint-22000/config.json (deflated 43%)
  adding: qa_model/checkpoint-22000/trainer_state.json (deflated 73%)
  adding: qa_model/checkpoint-22000/tokenizer.json (deflated 71%)
  adding: qa_model/checkpoint-22000/model.safetensors (deflated 8%)
  adding: qa_model/checkpoint-22000/vocab.txt (deflated 53%)
  adding: qa_model/checkpoint-22000/training_args.bin (deflated 51%)
  adding: qa_model/checkpoint-22000/tokenizer_config.json (deflated 75%)
  adding: qa_model/checkpoint-22000/optimizer.pt (deflated 13%)
  adding: qa_model/checkpoint-22000/special_tokens_map.json (deflated 42%)
  adding: qa_model/checkpoint-22000/scaler.pt (deflated 60%)
  adding: qa_model/checkpoint-22000/scheduler.pt (deflated 56%)
  adding: qa_model/checkpoint-22000/rng_state.pth (deflated 25%)
  adding: qa_model/model.safetensors (deflated 8%)
  adding: qa_model/chec

In [None]:
import shutil

shutil.make_archive("qa_model", 'zip', "./qa_model")

shutil.make_archive("qa_model_tokenizer", 'zip', "./qa_model_tokenizer")

try:
    from google.colab import files
    files.download("qa_model.zip")
    files.download("qa_model_tokenizer.zip")
except ImportError:
    print("Not running on Colab, zip files saved locally.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>