<a href="https://colab.research.google.com/github/ZeyadAboeleneen/Deep_Learning/blob/main/Transformar_Ass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade pip
!pip install transformers datasets torch

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Do

In [None]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import os
import numpy as np

In [None]:
# Identifiers of the Hub resources used by this cell.
DATASET_ID = "squad_v2"
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"

# Must be set before the Trainer is created so no Weights & Biases logging
# integration is started for this run.
os.environ.update({"WANDB_DISABLED": "true"})

# `dataset` is indexed by split name ("train" / "validation") further down.
dataset = load_dataset(DATASET_ID)
tokenizer = DistilBertTokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
# Work on small, reproducibly shuffled subsets so the notebook runs quickly.
SUBSET_SEED = 42

shuffled_train = dataset["train"].shuffle(seed=SUBSET_SEED)
shuffled_val = dataset["validation"].shuffle(seed=SUBSET_SEED)

# Keep the first 500 training rows and the first 100 validation rows.
small_train_dataset = shuffled_train.select(range(500))
small_val_dataset = shuffled_val.select(range(100))

In [None]:
def preprocess_data(examples):
    """Tokenize a batch of question/context pairs and label the answer span.

    Contexts longer than 512 tokens are split into overlapping windows
    (stride 128), so one example can yield several features. For every
    feature the character span of the first listed answer is converted into
    start/end token indices.

    Args:
        examples: Batch dict with "question", "context" and "answers" columns;
            each answer is a dict with "answer_start" and "text" lists.

    Returns:
        The tokenizer output (minus "offset_mapping" and
        "overflow_to_sample_mapping") with "start_positions" and
        "end_positions" added. A feature is labelled (0, 0) when its example
        has no answer or when the answer is not fully inside that window.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",  # only ever truncate the context
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # sample_map[i] is the index of the example that feature i came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_map[i]]
        if len(answer["answer_start"]) == 0:
            # Example without an answer: point both labels at index 0.
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Locate the first and last token of the context (sequence id 1).
        sequence_ids = inputs.sequence_ids(i)
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = context_start
        while (
            context_end + 1 < len(sequence_ids)
            and sequence_ids[context_end + 1] == 1
        ):
            context_end += 1

        # The answer must lie *entirely* inside this window. A partially
        # overlapping answer would otherwise be labelled with the separator
        # token just before/after the context (context_start - 1 or
        # context_end + 1), which is not a valid span.
        if (
            offset[context_start][0] > start_char
            or offset[context_end][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that starts at or before the answer start.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token (scanning from the right) ending at/after the answer end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
def _tokenize_split(split):
    """Run `preprocess_data` over `split` in batches, dropping its original columns."""
    return split.map(
        preprocess_data,
        batched=True,
        remove_columns=split.column_names,
    )


train_dataset = _tokenize_split(small_train_dataset)
val_dataset = _tokenize_split(small_val_dataset)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Pretrained DistilBERT encoder; the question-answering head (qa_outputs) is
# newly initialised and is what this fine-tuning run trains.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

training_args = TrainingArguments(
    output_dir="./trained_qa_model",
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument, which newer releases may reject.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    learning_rate=3e-5,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,  # keep at most two checkpoints on disk
    # Supported way to switch off logging integrations (W&B etc.); the
    # WANDB_DISABLED environment variable is deprecated.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Last expression of the cell so the TrainOutput summary is displayed.
trainer.train()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,4.8983,3.167227


TrainOutput(global_step=63, training_loss=4.696515764508929, metrics={'train_runtime': 1735.9838, 'train_samples_per_second': 0.289, 'train_steps_per_second': 0.036, 'total_flos': 65587856216064.0, 'train_loss': 4.696515764508929, 'epoch': 1.0})

In [None]:
def debug_qa(question, context):
    """Answer `question` from `context`, printing the model's top candidates.

    Prints the tokenized input, the five most probable start and end tokens
    (fewer if the input is shorter), and the decoded span between the single
    most probable start and end token.

    Args:
        question: Question text.
        context: Passage to search for the answer.

    Returns:
        The decoded answer string. Start and end are chosen independently, so
        the string is empty when the best end precedes the best start.
    """
    # Inference mode: dropout off, so repeated calls give the same answer.
    model.eval()

    encoded = tokenizer(question, context, return_tensors="pt", truncation=True)
    # Trainer may have moved the model to an accelerator; inputs must be on
    # the same device as the model's weights.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    input_ids = inputs["input_ids"][0]

    # Convert once from a plain list of ints; passing a 0-d tensor to
    # convert_ids_to_tokens fails because it is neither an int nor iterable.
    tokens = tokenizer.convert_ids_to_tokens(input_ids.tolist())
    print("Tokenized input:", tokens)

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)[0]
    end_probs = torch.softmax(outputs.end_logits, dim=1)[0]

    # topk raises if k exceeds the sequence length, so clamp it.
    k = min(5, start_probs.numel())
    for label, probs in (("start", start_probs), ("end", end_probs)):
        top = torch.topk(probs, k)
        print(f"\nTop {label} positions:")
        ranked = zip(top.values.tolist(), top.indices.tolist())
        for rank, (prob, idx) in enumerate(ranked, start=1):
            print(f"{rank}. Prob: {prob:.2f}, Token: '{tokens[idx]}' (position {idx})")

    best_start = int(torch.argmax(start_probs))
    best_end = int(torch.argmax(end_probs))

    # Inclusive span; an inverted span slices to nothing and decodes to "".
    answer_tokens = input_ids[best_start:best_end + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    print("\nRaw model output:", answer)
    return answer

In [None]:
def ask_question(question, context):
    """Return the span of `context` the model picks as the answer to `question`.

    The answer is sliced out of the original `context` string using the
    tokenizer's character offsets, so its casing and whitespace are preserved.
    Returns an empty string when the predicted span is inverted or does not
    lie inside the context.
    """
    model.eval()  # inference mode: dropout off
    encoded = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        return_offsets_mapping=True,
    )
    # Offsets are only needed to map tokens back to text; the model does not accept them.
    offsets = encoded.pop("offset_mapping")[0].tolist()
    sequence_ids = encoded.sequence_ids(0)
    # Keep the inputs on whatever device the model's weights are on.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    start = int(torch.argmax(outputs.start_logits, dim=1)[0])
    end = int(torch.argmax(outputs.end_logits, dim=1)[0])

    # Only a forward span made of context tokens (sequence id 1) is a valid answer.
    if end < start or sequence_ids[start] != 1 or sequence_ids[end] != 1:
        return ""
    return context[offsets[start][0]:offsets[end][1]]


context_text = """Diabetes is a chronic disease that occurs when the pancreas is no longer able to make insulin,
or when the body cannot make good use of the insulin it produces. Common symptoms include increased thirst,
frequent urination, and extreme fatigue."""

question_text = "what are symptoms of diabetes"
answer = ask_question(question_text, context_text)
print(f"Question: {question_text}")
print(f"Answer: {answer}")

Question: what are symptoms of diabetes
Answer: increased thirst,
frequent urination, and extreme fatigue
