<a href="https://colab.research.google.com/github/anandshaurya011/Automation-Without-Sensor/blob/master/self%20train%20bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets transformers
!pip install accelerate>=0.21.0



In [14]:
import json
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments, pipeline


In [23]:
import json
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments

# Load and preprocess the dataset
def load_and_preprocess_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Convert the data to Hugging Face Dataset
def create_dataset(data):
    contexts = []
    questions = []
    answers = []

    for item in data:
        context = item['context']
        for qa in item['qas']:
            question = qa['question']
            answer = qa['answers'][0]['text']
            answer_start = qa['answers'][0]['answer_start']

            contexts.append(context)
            questions.append(question)
            answers.append({'text': answer, 'answer_start': answer_start})

    dataset = Dataset.from_dict({
        'context': contexts,
        'question': questions,
        'answers': answers
    })

    return DatasetDict({'train': dataset})  # Return a DatasetDict with a single key 'train' and a Dataset as its value

# Tokenize the data
def tokenize_data(dataset, tokenizer):
    def preprocess_function(examples):
        questions = [q.strip() for q in examples['question']]
        contexts = examples['context']

        # Tokenize contexts and questions separately
        tokenized_batch = tokenizer(
            contexts,
            questions,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,
            return_special_tokens_mask=True,
            stride=256
        )

        # Update tokenized inputs with start and end positions for QA
        offset_mapping = tokenized_batch.pop("offset_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            # Handle case where answers might be shorter than expected
            if i >= len(answers):
                start_positions.append(0)
                end_positions.append(0)
                continue

            answer = answers[i]
            start_char = answer['answer_start']
            end_char = start_char + len(answer['text'])

            sequence_ids = tokenized_batch.sequence_ids(i)

            # Handle cases where answer is outside context or truncated
            if sequence_ids.count(1) <= 1:  # No real tokens or only [CLS], [SEP]
                start_positions.append(0)
                end_positions.append(0)
                continue

            context_start = sequence_ids.index(1)
            context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

            if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                idx_start = next(idx for idx, (s, e) in enumerate(offset) if s <= start_char <= e)
                idx_end = next(idx for idx, (s, e) in enumerate(offset) if s <= end_char <= e)

                start_positions.append(idx_start)
                end_positions.append(idx_end)

        tokenized_batch.update({
            'start_positions': start_positions,
            'end_positions': end_positions
        })

        return tokenized_batch

    # Apply preprocessing function to the dataset
    return dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# Fine-tune the model
def fine_tune_model(train_dataset, eval_dataset, tokenizer):
    model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=100,
        weight_decay=0.01,
        gradient_accumulation_steps=2,
        fp16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset["train"],
        eval_dataset=eval_dataset["train"],
        tokenizer=tokenizer
    )

    trainer.train()
    return model

# Main script
if __name__ == "__main__":
    file_path = "/content/simplifin_ai_dataset.json"

    # Load and preprocess data
    data = load_and_preprocess_data(file_path)

    # Create dataset
    dataset = create_dataset(data)

    # Split dataset for training and evaluation
    train_dataset = dataset  # You can adjust this for your actual split
    eval_dataset = dataset   # You can adjust this for your actual split

    # Initialize tokenizer
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

    # Tokenize datasets
    tokenized_train_dataset = tokenize_data(train_dataset, tokenizer)
    tokenized_eval_dataset = tokenize_data(eval_dataset, tokenizer)

    # Fine-tune model
    model = fine_tune_model(tokenized_train_dataset, tokenized_eval_dataset, tokenizer)

    # Save model and tokenizer
    model.save_pretrained("./fine_tuned_model")
    tokenizer.save_pretrained("./fine_tuned_model")


Map:   0%|          | 0/339 [00:00<?, ? examples/s]

Map:   0%|          | 0/339 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
0,No log,0.856066
2,No log,0.181127
4,No log,0.078577
6,No log,0.025688
8,No log,0.013963
10,No log,0.0064
12,No log,0.003092
14,No log,0.001802
16,No log,0.001334
18,No log,0.002006
