In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily flatten the directory walk into full file paths, then echo each one.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install transformers datasets torch transformers[torch]

In [None]:
from datasets import load_dataset

# Directory that holds the JSON splits; adjust it to wherever the files live
# in your environment.
DATA_DIR = "/content/drive/MyDrive/Dataset"
DATA_FILES = {
    "train": f"{DATA_DIR}/train.json",
    # The held-out split must be a different file from the training split:
    # pointing both at train.json would make the evaluation score the model
    # on the very examples it was trained on.
    "test": f"{DATA_DIR}/test.json",
}

# Each record is expected to provide the "question", "context" and "answers"
# fields that the preprocessing step reads.
dataset = load_dataset("json", data_files=DATA_FILES)

In [None]:
from transformers import BertTokenizerFast

# Checkpoint name; `model_name` is reused later in the notebook when the
# question-answering model is loaded, so tokenizer and model stay in sync.
model_name = "bert-base-uncased"
# The fast tokenizer variant is used because preprocess_function requests
# character offset mappings from it (return_offsets_mapping=True).
tokenizer = BertTokenizerFast.from_pretrained(model_name)

def preprocess_function(examples, max_length=384, stride=128):
    """Tokenize question/context pairs and label each feature with its answer span.

    Contexts that do not fit into ``max_length`` tokens are split into
    overlapping windows, so a single example may produce several features.
    Every feature receives the token indices of the first answer of its
    source example, or ``(0, 0)`` when that answer is absent or not fully
    contained in the feature's context window.

    Args:
        examples: Batch (dict of lists) with "question", "context" and
            "answers" columns. Each "answers" entry holds parallel
            "answer_start" (character offsets into the context) and "text"
            lists; only the first answer is used.
        max_length: Total token length of every feature.
        stride: Number of context tokens shared by consecutive windows.

    Returns:
        The tokenizer output with "offset_mapping" and
        "overflow_to_sample_mapping" removed and "start_positions" /
        "end_positions" added (one entry per feature).
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two helper columns are only needed for labelling and must not be
    # passed on to the model.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    sample_map = tokenized_examples.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Several features can originate from the same example.
        answers = examples["answers"][sample_map[i]]
        start_token_idx, end_token_idx = 0, 0

        if len(answers["answer_start"]) > 0:
            answer_start_char = answers["answer_start"][0]
            answer_end_char = answer_start_char + len(answers["text"][0])

            # Question tokens carry offsets relative to the question string,
            # so only tokens of the second sequence (the context) may be
            # compared against the answer's character positions.
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_tokens = [
                idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1
            ]

            # Label the span only when the whole answer lies inside this
            # window; otherwise keep (0, 0) so start and end never disagree.
            if (
                context_tokens
                and offsets[context_tokens[0]][0] <= answer_start_char
                and offsets[context_tokens[-1]][1] >= answer_end_char
            ):
                # Last context token that begins at or before the answer start.
                start_token_idx = max(
                    idx for idx in context_tokens
                    if offsets[idx][0] <= answer_start_char
                )
                # First context token that ends at or after the (exclusive)
                # answer end; this avoids selecting the token that merely
                # starts where the answer stops.
                end_token_idx = min(
                    idx for idx in context_tokens
                    if offsets[idx][1] >= answer_end_char
                )

        start_positions.append(start_token_idx)
        end_positions.append(end_token_idx)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [None]:
# Run the preprocessing over every split in batches. The raw columns are
# dropped so only the features returned by preprocess_function remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

In [None]:
from transformers import BertForQuestionAnswering

# Build the BERT question-answering model from the same `model_name`
# checkpoint that was used for the tokenizer.
# NOTE(review): the base checkpoint is not QA-fine-tuned, so the span
# prediction head is expected to start untrained — confirm via the load warning.
model = BertForQuestionAnswering.from_pretrained(model_name)

In [None]:
import inspect

from transformers import Trainer, TrainingArguments

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old spelling. The install cell does not pin a version,
# so use whichever keyword the installed TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",       # checkpoints are written here
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,           # keep at most two checkpoints on disk
    logging_dir="./logs",
    logging_steps=500,
    **{_eval_strategy_kwarg: "epoch"},  # evaluate at the end of every epoch
)

In [None]:
# Wire the model, the training configuration and the tokenized splits together.
# NOTE(review): no compute_metrics callback is passed, so evaluation presumably
# reports only the loss and runtime statistics — confirm if EM/F1 is needed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
# Fine-tune the model. Left as a bare expression so the notebook displays the
# value returned by train() as the cell output.
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer and print the returned
# metrics.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")