In [None]:
import torch
import torch.nn as nn
import math
import numpy as np

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed (non-learned) sinusoidal position table.

    Even feature columns hold ``sin(pos / 10000^(2i / d_model))`` and odd
    columns hold the matching ``cos`` values.

    Args:
        max_len: number of positions to precompute.
        d_model: width of each position vector (odd values are supported).
        device: device on which the table is initially created.
    """

    def __init__(self, max_len, d_model, device):
        super(PositionalEncoding, self).__init__()

        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)
        _2i = torch.arange(0, d_model, step=2, device=device)
        angles = pos / (10000 ** (_2i / d_model))  # (max_len, ceil(d_model / 2))

        encoding = torch.zeros(max_len, d_model, device=device)
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module through .to()/.cuda(); persistent=False
        # keeps it out of state_dict(), so the set of checkpoint keys is the
        # same as when the table was a plain attribute.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return the first ``seq_len`` rows of the table.

        Args:
            x: 2-D tensor of shape (batch_size, seq_len); only its shape is used.

        Returns:
            Tensor of shape (seq_len, d_model).

        Raises:
            ValueError: if ``seq_len`` exceeds the precomputed ``max_len``.
        """
        batch_size, seq_len = x.size()
        table_len = self.encoding.size(0)
        if seq_len > table_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {table_len}"
            )
        return self.encoding[:seq_len, :]

In [None]:
class TransformerEmbedding(nn.Module):
    """Sum of token, positional and segment embeddings, followed by dropout.

    Args:
        vocab_size: number of rows in the token embedding table.
        d_model: embedding width.
        max_len: number of positions handed to ``PositionalEncoding``.
        drop_prob: dropout probability applied to the summed embedding.
        device: device handed to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super(TransformerEmbedding, self).__init__()

        self.token_emb = nn.Embedding(vocab_size, d_model)
        # Two segment ids: 0 for context, 1 for question.
        self.segment_emb = nn.Embedding(2, d_model)
        self.pos_encoding = PositionalEncoding(max_len, d_model, device)
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x, token_type_ids=None):
        """Embed token ids ``x``; segment embeddings are zero when no ids are given."""
        token_vectors = self.token_emb(x)

        if token_type_ids is None:
            segment_vectors = torch.zeros_like(token_vectors, device=token_vectors.device)
        else:
            segment_vectors = self.segment_emb(token_type_ids)

        position_vectors = self.pos_encoding(x)
        combined = token_vectors + position_vectors + segment_vectors
        return self.dropout(combined)



In [None]:
class LayerNorm(nn.Module):
    """Normalise the last dimension, then apply a learnable scale and shift.

    The divisor is ``std + eps`` where ``std`` comes from ``Tensor.std`` with
    its default settings.
    """

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        # Learnable multiplicative gain, initialised to ones.
        self.alpha = nn.Parameter(torch.ones(features))
        # Learnable additive offset, initialised to zeros.
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Normalise ``x`` over its last dimension.

        Statistics are taken with ``keepdim=True`` so they broadcast against ``x``.
        """
        centered = x - x.mean(dim=-1, keepdim=True)
        # eps guards against division by a zero or near-zero spread.
        denom = x.std(dim=-1, keepdim=True) + self.eps
        scaled = self.alpha * centered / denom
        return scaled + self.bias

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two linear layers with ReLU and dropout in between.

    Note on argument names: the first constructor argument (``d_hidden``) is
    the input/output width of the block and the second (``d_model``) is the
    width of the inner layer, i.e. ``layer1`` maps ``d_hidden -> d_model`` and
    ``layer2`` maps ``d_model -> d_hidden``.

    Args:
        d_hidden: input and output feature size.
        d_model: inner feature size.
        p: dropout probability applied after the activation.
    """

    def __init__(self, d_hidden, d_model, p=0.1):
        super(PositionwiseFeedForward, self).__init__()

        outer_width, inner_width = d_hidden, d_model
        self.layer1 = nn.Linear(outer_width, inner_width)
        self.layer2 = nn.Linear(inner_width, outer_width)
        self.dropout = nn.Dropout(p=p)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply layer1 -> ReLU -> dropout -> layer2."""
        activated = self.relu(self.layer1(x))
        return self.layer2(self.dropout(activated))

In [None]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections.

    Args:
        h: number of attention heads.
        d_model: model width; must be divisible by ``h``.

    Raises:
        ValueError: if ``h`` is not positive or does not divide ``d_model``.

    Attributes:
        score: attention weights of the most recent forward pass,
            shape (batch, h, q_len, k_len); ``None`` before the first call.
    """

    def __init__(self, h, d_model):
        super(MultiHeadSelfAttention, self).__init__()

        # Fail at construction time instead of with a reshape error in forward().
        if h <= 0 or d_model % h != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by the number of heads h ({h})"
            )

        self.h = h
        self.d_model = d_model
        self.W_Q = nn.Linear(self.d_model, self.d_model, bias=False)
        self.W_K = nn.Linear(self.d_model, self.d_model, bias=False)
        self.W_V = nn.Linear(self.d_model, self.d_model, bias=False)
        self.W_T = nn.Linear(self.d_model, self.d_model, bias=False)
        self.softmax = nn.Softmax(dim=-1)
        self.score = None

    def forward(self, q, k, v, mask=None):
        """Project q/k/v, attend per head, merge heads and apply the output projection.

        Args:
            q, k, v: tensors of shape (batch, seq_len, d_model).
            mask: optional mask where 0 marks positions to ignore; see
                ``scaled_dot_product_attention`` for accepted shapes.

        Returns:
            Tensor of shape (batch, q_len, d_model).
        """
        # Generating Q, K, V for the input
        Q, K, V = self.W_Q(q), self.W_K(k), self.W_V(v)
        # Split the values for each head
        Q, K, V = self.split_head(Q), self.split_head(K), self.split_head(V)

        output, self.score = self.scaled_dot_product_attention(Q, K, V, mask=mask)
        # output --> (N, h, seq_len, dim)
        output = self.concat_multihead(output)
        output = self.W_T(output)

        return output

    def split_head(self, x):
        """Reshape (batch, seq_len, dim) into (batch, h, seq_len, dim // h)."""
        batch_size, seq_len, dim = x.size()
        head_dim = dim // self.h

        output = x.view(batch_size, seq_len, self.h, head_dim).transpose(1, 2)

        return output

    def concat_multihead(self, x):
        """Inverse of ``split_head``: (batch, h, seq_len, d_v) -> (batch, seq_len, h * d_v)."""
        batch_size, n_head, seq_len, d_v = x.size()
        dim = d_v * n_head
        output = x.transpose(1,2).contiguous().view(batch_size, seq_len, dim)

        return output

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V`` for every head.

        Args:
            Q, K, V: tensors of shape (batch, h, seq_len, d_k).
            mask: optional. A 2-D (batch, k_len) mask is broadcast over heads
                and query positions; a 3-D (batch, q_len, k_len) mask is
                broadcast over heads; any other rank is used as given and must
                broadcast against the score tensor.

        Returns:
            Tuple ``(output, score)`` with the attended values and the
            attention weights.
        """
        _, _, _, d_k = K.size()
        K_t = K.transpose(2,3)

        score = (Q @ K_t) / math.sqrt(d_k)

        if mask is not None:
            if mask.dim() == 2:
                mask = mask.unsqueeze(1).unsqueeze(2)
            elif mask.dim() == 3:
                mask = mask.unsqueeze(1)
            # Use the most negative finite value of the score dtype so the
            # fill value is representable in reduced precision as well.
            score = score.masked_fill(mask == 0, torch.finfo(score.dtype).min)

        score = self.softmax(score)

        output = score @ V

        return output, score

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input, and the sum is layer-normalised.

    Args:
        d_model: model width.
        d_hidden: inner width of the feed-forward sub-layer.
        h: number of attention heads.
        drop_prob: dropout probability used throughout the block.
    """

    def __init__(self, d_model, d_hidden, h, drop_prob):
        super(EncoderLayer, self).__init__()

        # Attention sub-layer
        self.attention = MultiHeadSelfAttention(h, d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.layer_norm1 = LayerNorm(d_model)

        # Feed-forward sub-layer
        self.ffn = PositionwiseFeedForward(d_model, d_hidden, p=drop_prob)
        self.dropout2 = nn.Dropout(p=drop_prob)
        self.layer_norm2 = LayerNorm(d_model)

    def forward(self, x, src_mask):
        """Run both sub-layers on ``x`` using ``src_mask`` as the attention mask."""
        attended = self.attention(q=x, k=x, v=x, mask=src_mask)
        x = self.layer_norm1(self.dropout1(attended) + x)

        transformed = self.ffn(x)
        x = self.layer_norm2(self.dropout2(transformed) + x)

        return x

In [None]:
class Encoder(nn.Module):
    """Embedding + stacked encoder layers + a 2-way span head for extractive QA.

    Args:
        enc_voc_size: vocabulary size for the token embedding.
        max_len: number of positions for the positional encoding.
        d_model: model width.
        d_hidden: inner width of each feed-forward sub-layer.
        h: number of attention heads per layer.
        n_layers: number of stacked ``EncoderLayer`` blocks.
        drop_prob: dropout probability.
        device: device handed to the embedding module.
    """

    def __init__(self, enc_voc_size, max_len, d_model, d_hidden, h, n_layers, drop_prob, device):
        super(Encoder, self).__init__()
        self.emb = TransformerEmbedding(enc_voc_size, d_model, max_len, drop_prob=drop_prob, device=device)

        blocks = []
        for _ in range(n_layers):
            blocks.append(EncoderLayer(d_model, d_hidden, h, drop_prob))
        self.layers = nn.ModuleList(blocks)

        # One logit for "answer starts here" and one for "answer ends here".
        self.qa_outputs = nn.Linear(d_model, 2)


    def forward(self, input_ids=None, attention_mask=None,token_type_ids=None,start_positions=None,
        end_positions=None):
        """Compute start/end logits and, when both labels are given, the loss.

        Returns:
            ``(start_logits, end_logits)``, or
            ``(total_loss, start_logits, end_logits)`` when both
            ``start_positions`` and ``end_positions`` are provided.
            ``total_loss`` is the mean of the two cross-entropy losses.
        """
        hidden = self.emb(input_ids,token_type_ids)

        if attention_mask is not None:
            attention_mask = attention_mask.float()

        for block in self.layers:
            hidden = block(hidden,attention_mask)

        span_logits = self.qa_outputs(hidden)
        start_logits, end_logits = (
            piece.squeeze(-1) for piece in span_logits.split(1, dim=-1)
        )

        if start_positions is None or end_positions is None:
            return (start_logits, end_logits)

        criterion = nn.CrossEntropyLoss()
        start_loss = criterion(start_logits, start_positions)
        end_loss = criterion(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2
        return (total_loss, start_logits, end_logits)


In [None]:
# Architecture
d_model = 768
d_hidden = 3072
n_layers = 12
n_heads = 12
# Regularisation and input limits
drop_prob = 0.1
max_len = 512
vocab_size = 30522
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Keyword arguments make the mapping onto Encoder's parameters explicit.
bert_encoder = Encoder(
    enc_voc_size=vocab_size,
    max_len=max_len,
    d_model=d_model,
    d_hidden=d_hidden,
    h=n_heads,
    n_layers=n_layers,
    drop_prob=drop_prob,
    device=device,
)

In [None]:
# `model` is an alias for the same Encoder instance as `bert_encoder`;
# the bare expression on the last line displays its module tree.
model=bert_encoder
model


Encoder(
  (emb): TransformerEmbedding(
    (token_emb): Embedding(30522, 768)
    (segment_emb): Embedding(2, 768)
    (pos_encoding): PositionalEncoding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layers): ModuleList(
    (0-11): 12 x EncoderLayer(
      (attention): MultiHeadSelfAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=False)
        (W_K): Linear(in_features=768, out_features=768, bias=False)
        (W_V): Linear(in_features=768, out_features=768, bias=False)
        (W_T): Linear(in_features=768, out_features=768, bias=False)
        (softmax): Softmax(dim=-1)
      )
      (dropout1): Dropout(p=0.1, inplace=False)
      (layer_norm1): LayerNorm()
      (ffn): PositionwiseFeedForward(
        (layer1): Linear(in_features=768, out_features=3072, bias=True)
        (layer2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (relu): ReLU()
      )
      (dropout2): Dropout(p=0.1, i

In [None]:
!pip install transformers



In [None]:
from transformers import BertModel
# Downloads the pretrained bert-base-uncased model.
# NOTE(review): in the cells visible here `pretrained_bert` is only referenced
# by commented-out weight-loading code, so this download may be unnecessary.
pretrained_bert = BertModel.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:

#from transformers import BertModel
#pretrained_bert = BertModel.from_pretrained("bert-base-uncased")
 #Load matching weights from pretrained BERT
#pretrained_dict = pretrained_bert.state_dict()
#model_dict = model.state_dict()

# Rename and filter matching keys
#new_state_dict = {}
#for k, v in pretrained_dict.items():
    #new_key = k.replace("embeddings.word_embeddings", "emb.token_emb") \
               #.replace("embeddings.position_embeddings", "emb.pos_encoding") \
               #.replace("encoder.layer", "layers")
    #if new_key in model_dict:
        #new_state_dict[new_key] = v

# Update model state dict
#model_dict.update(new_state_dict)
#model.load_state_dict(model_dict, strict=False)

In [None]:
# prompt: print model weights
#for name, param in model.named_parameters():
    #if param.requires_grad:
        #print(f"{name}: shape={param.shape}\n{param.data}\n")


In [None]:
!pip install datasets
from datasets import load_dataset



In [None]:

# Load the "squad" dataset; the resulting DatasetDict is indexed below by
# its "train" and "validation" splits.
squad=load_dataset("squad")

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Show the split/feature summary, then the first raw training record.
print(squad)
print(squad['train'][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})
{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome

In [None]:
# Fetch the first training record once and print its three QA fields.
first_example = squad["train"][0]
print("Context: ", first_example["context"])
print("Question: ", first_example["question"])
print("Answer: ", first_example["answers"])

Context:  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question:  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer:  {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


In [None]:
from transformers import AutoTokenizer
# Tokenizer for bert-base-uncased; the preprocessing functions below call it
# with return_offsets_mapping / return_overflowing_tokens.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Passed to the tokenizer as `max_length` by both preprocessing functions.
max_length = 384
# Passed to the tokenizer as `stride` by both preprocessing functions.
stride = 128


def preprocess_training_examples(examples):
    """Tokenize question/context pairs and attach start/end token labels.

    Contexts are truncated with overflow, so one example may yield several
    features. For each feature the first listed answer is mapped from
    character offsets to token indices; when the answer does not lie fully
    inside the feature's context span both labels are 0.

    Args:
        examples: batch with "question", "context" and "answers" columns.

    Returns:
        The tokenizer output with "start_positions" and "end_positions"
        added and "offset_mapping" / "overflow_to_sample_mapping" removed.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    all_offsets = inputs.pop("offset_mapping")
    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(all_offsets):
        answer = answers[feature_to_example[feature_idx]]
        answer_begin = answer["answer_start"][0]
        answer_finish = answer_begin + len(answer["text"][0])
        seq_ids = inputs.sequence_ids(feature_idx)

        # Locate the first and last token whose sequence id is 1 (the context).
        cursor = 0
        while seq_ids[cursor] != 1:
            cursor += 1
        context_start = cursor
        while seq_ids[cursor] == 1:
            cursor += 1
        context_end = cursor - 1

        answer_inside = (
            offsets[context_start][0] <= answer_begin
            and offsets[context_end][1] >= answer_finish
        )
        if not answer_inside:
            # Answer is not fully inside this feature's context: label (0, 0).
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token starting at or before the answer start.
        cursor = context_start
        while cursor <= context_end and offsets[cursor][0] <= answer_begin:
            cursor += 1
        start_positions.append(cursor - 1)

        # Walk backward to the first token ending at or after the answer end.
        cursor = context_end
        while cursor >= context_start and offsets[cursor][1] >= answer_finish:
            cursor -= 1
        end_positions.append(cursor + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:

# Train on the first 8800 examples only.
train_subset = squad["train"].select(range(8800))
# Batched map: the original columns are dropped because one example can
# expand into several tokenized features.
train_dataset = train_subset.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=squad["train"].column_names,
)
# Displays (size of the full train split, number of features produced).
len(squad["train"]), len(train_dataset)

Map:   0%|          | 0/8800 [00:00<?, ? examples/s]

(87599, 8937)

In [None]:
train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 8937
})

In [None]:
def preprocess_validation_examples(examples):
    """Tokenize validation examples and keep what is needed to decode answers.

    Each produced feature records the id of the example it came from, and its
    offset mapping is blanked (``None``) for every token that is not part of
    the context (sequence id other than 1).

    Args:
        examples: batch with "question", "context" and "id" columns.

    Returns:
        The tokenizer output with "example_id" added, "offset_mapping"
        filtered, and "overflow_to_sample_mapping" removed.
    """
    inputs = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_example = inputs.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]
    example_ids = []

    for feature_idx, offsets in enumerate(inputs["offset_mapping"]):
        example_ids.append(source_ids[feature_to_example[feature_idx]])

        # Keep offsets only for context tokens.
        seq_ids = inputs.sequence_ids(feature_idx)
        inputs["offset_mapping"][feature_idx] = [
            span if seq_ids[pos] == 1 else None
            for pos, span in enumerate(offsets)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [None]:

# Evaluate on the first 1000 validation examples only.
validation_subset = squad["validation"].select(range(1000))

# Features in `validation_dataset` exist only for examples in `validation_subset`.
validation_dataset = validation_subset.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["validation"].column_names,
)
# Displays (size of the full validation split, number of features produced).
len(squad["validation"]), len(validation_dataset)

(10570, 1020)

In [None]:
validation_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 1020
})

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Return torch tensors from both datasets. The evaluation copy drops the two
# bookkeeping columns before being handed to the collator.
train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

# Options shared by both loaders.
loader_options = dict(collate_fn=default_data_collator, batch_size=32)

train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
eval_dataloader = DataLoader(validation_set, **loader_options)

In [None]:
!pip install evaluate



In [None]:
import collections
import evaluate
from evaluate import load

# Load the SQuAD metric
metric = evaluate.load("squad")
def compute_metrics(predictions, references):
    """Coerce predictions/references to strings and score them with ``metric``.

    Args:
        predictions: iterable of dicts with "id" and "prediction_text".
        references: iterable of dicts with "id" and an "answers" dict holding
            "text" and "answer_start".

    Returns:
        Dict with the "exact_match" and "f1" entries of the metric result.
    """
    formatted_predictions = []
    for pred in predictions:
        formatted_predictions.append(
            {"id": str(pred["id"]), "prediction_text": str(pred["prediction_text"])}
        )

    formatted_references = []
    for ref in references:
        entry = {"id": str(ref["id"])}
        entry["answers"] = {
            "text": [str(ans) for ans in ref["answers"]["text"]],
            "answer_start": ref["answers"]["answer_start"],
        }
        formatted_references.append(entry)

    scores = metric.compute(
        predictions=formatted_predictions,
        references=formatted_references,
    )
    return {"exact_match": scores["exact_match"], "f1": scores["f1"]}
def evaluate(model, eval_dataloader, processed_dataset, original_dataset, n_best=20, max_answer_length=30):
    """Predict answer spans and score them with the SQuAD ``metric``.

    Only examples of ``original_dataset`` that have at least one feature in
    ``processed_dataset`` are scored. Examples without any feature are
    skipped in both predictions and references, so evaluating a subset does
    not count the remaining examples as wrong (empty) answers.

    NOTE: this function is named ``evaluate``, the same name as the module
    imported in this cell, so after the cell runs the name refers to the
    function. ``metric`` is loaded before this definition for that reason.

    Args:
        model: called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; must return ``(start_logits, end_logits)``.
        eval_dataloader: yields batches of the processed features.
            Assumed to iterate in the same row order as ``processed_dataset``
            (logits are matched to features by position) - TODO confirm when
            changing the loader.
        processed_dataset: features with "example_id" and "offset_mapping".
        original_dataset: raw examples with "id", "context" and "answers".
        n_best: number of top start and end indices considered per feature.
        max_answer_length: maximum answer length in tokens.

    Returns:
        The dict returned by ``metric.compute`` (has "exact_match" and "f1"),
        or zeros for both when no example is covered by ``processed_dataset``.
    """
    model.eval()

    # Run on the device the model lives on; fall back to the notebook-level
    # `device` only for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else device

    # Store all features
    features = [processed_dataset[i] for i in range(len(processed_dataset))]

    # Create mapping from example ID to feature indices
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    # Get predictions
    start_logits = []
    end_logits = []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            outputs = model(
                input_ids=batch["input_ids"].to(run_device),
                attention_mask=batch["attention_mask"].to(run_device),
                token_type_ids=batch["token_type_ids"].to(run_device)
            )
            start_logits.append(outputs[0].cpu().numpy())
            end_logits.append(outputs[1].cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)

    predicted_answers = []
    theoretical_answers = []
    for example in tqdm(original_dataset, desc="Processing predictions"):
        example_id = example["id"]
        # .get() avoids inserting empty entries into the defaultdict.
        feature_indices = example_to_features.get(example_id)
        if not feature_indices:
            # No processed feature for this example: it was not evaluated.
            continue

        context = example["context"]
        answers = []

        # Get all features for this example
        for feature_index in feature_indices:
            feature_start_logits = start_logits[feature_index]
            feature_end_logits = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Get the n-best predictions
            start_indexes = np.argsort(feature_start_logits)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(feature_end_logits)[-1 : -n_best - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Offsets are None for tokens outside the context.
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if (end_index < start_index or
                        end_index - start_index + 1 > max_answer_length):
                        continue

                    answer_text = context[offsets[start_index][0]:offsets[end_index][1]]
                    answers.append({
                        "text": answer_text,
                        "logit_score": (feature_start_logits[start_index] +
                                      feature_end_logits[end_index])
                    })

        # Select best answer (empty string when no valid span was found)
        if answers:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            prediction_text = str(best_answer["text"])
        else:
            prediction_text = ""
        predicted_answers.append({
            "id": str(example_id),
            "prediction_text": prediction_text
        })

        # Ground truth: accept a dict of lists or a list of per-answer dicts.
        if isinstance(example["answers"], dict):
            answer_texts = example["answers"]["text"]
            answer_starts = example["answers"]["answer_start"]
        else:
            answer_texts = [ans["text"] for ans in example["answers"]]
            answer_starts = [ans["answer_start"] for ans in example["answers"]]

        theoretical_answers.append({
            "id": str(example_id),
            "answers": {
                "text": [str(text) for text in answer_texts],
                "answer_start": answer_starts
            }
        })

    if not predicted_answers:
        print("No example in original_dataset has features in processed_dataset; nothing to score.")
        return {"exact_match": 0.0, "f1": 0.0}

    # Compute metrics
    results = metric.compute(
        predictions=predicted_answers,
        references=theoretical_answers
    )

    print(f"Exact Match: {results['exact_match']:.2f} | F1 Score: {results['f1']:.2f}")
    return results

In [None]:
import math
import os
from tqdm import tqdm
from sklearn.metrics import f1_score
from torch.optim import AdamW
import collections
import re

# Optimizer over all model parameters and the loss handed to train().
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# checkpoint_path is read by load_checkpoint(); checkpoint_save_path is
# written by save_checkpoint(). Both are absolute Kaggle paths and need
# changing when the notebook runs elsewhere.
checkpoint_path = "/kaggle/input/checkpoint/model_checkpoint.pth"
checkpoint_save_path = "/kaggle/working/model_checkpoint.pth"

# Function to save checkpoint
def save_checkpoint(model, optimizer, epoch, loss):
    """Write model/optimizer state plus epoch and loss to ``checkpoint_save_path``.

    Args:
        model: module whose ``state_dict()`` is stored.
        optimizer: optimizer whose ``state_dict()`` is stored.
        epoch: zero-based index of the epoch just finished.
        loss: loss value recorded alongside the weights.
    """
    state = {}
    state["epoch"] = epoch
    state["model_state_dict"] = model.state_dict()
    state["optimizer_state_dict"] = optimizer.state_dict()
    state["loss"] = loss

    torch.save(state, checkpoint_save_path)
    # The message is one-based while the stored epoch is zero-based.
    print(f"Checkpoint saved at epoch {epoch+1}")

# Function to load checkpoint
def load_checkpoint(model, optimizer, device):
    """Restore model and optimizer state from ``checkpoint_path`` if it exists.

    Args:
        model: module that receives the stored "model_state_dict".
        optimizer: optimizer that receives the stored "optimizer_state_dict".
        device: passed to ``torch.load`` as ``map_location``.

    Returns:
        ``(start_epoch, loss)`` where ``start_epoch`` is the stored epoch + 1,
        or ``(0, None)`` when no checkpoint file is found.
    """
    try:
        # NOTE(security): weights_only=False allows arbitrary pickled objects
        # to be loaded; only use this with checkpoint files you trust.
        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    except FileNotFoundError:
        print("No checkpoint found, starting from epoch 0.")
        return 0, None  # Start fresh

    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

    start_epoch = checkpoint["epoch"] + 1  # Resume from next epoch
    loss = checkpoint["loss"]
    # start_epoch is zero-based; the message shows the one-based epoch number.
    print(f"Checkpoint loaded. Resuming from epoch {start_epoch+1}.")
    return start_epoch, loss

# Training function
def train(model, dataloader, optimizer, loss_fn, device):
    """Run one training epoch and report mean loss and span-boundary accuracy.

    The loss is computed here with ``loss_fn`` as the mean of the start and
    end losses. Labels are therefore not passed to the model, so it does not
    compute a second loss that would only be discarded.

    Args:
        model: called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; the last two elements of its output are
            taken as ``start_logits`` and ``end_logits``.
        dataloader: iterable of batches with "input_ids", "attention_mask",
            "token_type_ids", "start_positions" and "end_positions".
        optimizer: optimizer stepped once per batch.
        loss_fn: loss applied to (logits, positions) for start and end.
        device: device the batch tensors are moved to.

    Returns:
        ``(avg_loss, avg_accuracy)``: loss averaged over batches, and the mean
        of start and end accuracy over samples. ``(0.0, 0.0)`` when the
        dataloader yields nothing.
    """
    model.train()
    total_loss = 0.0
    correct_start = 0
    correct_end = 0
    total_samples = 0
    num_batches = 0
    for batch in tqdm(dataloader, desc="Training"):
        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        # Ensure target tensors are 1D
        start_positions = batch["start_positions"].to(device).view(-1)
        end_positions = batch["end_positions"].to(device).view(-1)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass (no labels: the loss is computed below with loss_fn)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        start_logits, end_logits = outputs[-2], outputs[-1]

        # Compute loss
        start_loss = loss_fn(start_logits, start_positions)
        end_loss = loss_fn(end_logits, end_positions)
        loss = (start_loss + end_loss) / 2

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Compute accuracy
        start_preds = torch.argmax(start_logits, dim=-1)
        end_preds = torch.argmax(end_logits, dim=-1)
        correct_start += (start_preds == start_positions).sum().item()
        correct_end += (end_preds == end_positions).sum().item()
        total_samples += start_positions.size(0)

    # Nothing was processed: avoid dividing by zero.
    if num_batches == 0 or total_samples == 0:
        return 0.0, 0.0

    avg_loss = total_loss / num_batches
    start_accuracy = correct_start / total_samples
    end_accuracy = correct_end / total_samples
    avg_accuracy = (start_accuracy + end_accuracy) / 2
    return avg_loss, avg_accuracy



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Resume from the stored checkpoint when one exists, otherwise start at epoch 0.
start_epoch, _ = load_checkpoint(model, optimizer,device)

# Training loop
total_epochs = 47 # Change as needed
for epoch in range(start_epoch, total_epochs):
    print(f"Epoch {epoch + 1}/{total_epochs}")
    train_loss, train_accuracy = train(model, train_dataloader, optimizer, loss_fn, device)
    print(f"Training Loss: {train_loss}, Training Accuracy: {train_accuracy}")

    # Evaluate against the same subset the validation features were built
    # from. Passing the full validation split would add references for
    # examples that have no features at all.
    results=evaluate(model, eval_dataloader, validation_dataset, validation_subset)


    # Save checkpoint after every epoch
    save_checkpoint(model, optimizer, epoch, train_loss)

Checkpoint loaded. Resuming from epoch 47.
Epoch 47/47


Training: 100%|██████████| 280/280 [05:48<00:00,  1.24s/it]


Training Loss: 0.5448460206389427, Training Accuracy: 0.827011301331543


Evaluating: 100%|██████████| 18/18 [00:07<00:00,  2.52it/s]
Processing predictions: 100%|██████████| 10570/10570 [00:00<00:00, 121677.13it/s]


Exact Match: 0.68 | F1 Score: 0.93
Checkpoint saved at epoch 47
