In [None]:
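# Optional: uncomment to send signal 9 (SIGKILL) to the current kernel process,
# which forces the runtime to restart. Leave commented out for normal runs.
# import os
# os.kill(os.getpid(), 9)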

In [None]:
import os
from getpass import getpass

import wandb
from huggingface_hub import login

# Hugging Face and W&B API Keys
# SECURITY: credentials must never be hardcoded in a notebook. They are read from
# the HF_TOKEN / WANDB_API_KEY environment variables and, when a variable is not
# set, requested interactively without echoing the input. Any key that was ever
# committed in this cell must be treated as leaked and revoked.
hugging_face_key = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass("W&B API key: ")

# Logging into Hugging Face using the API key
login(token=hugging_face_key)

# Set up W&B for logging
wandb.login(key=wandb_api_key)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Install necessary packages
!pip install transformers datasets peft accelerate bitsandbytes

import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

# Step 1: Load the SQuAD dataset
# Fetched by name through `load_dataset`; the recorded cell output shows a train
# split of 87,599 examples and a validation split of 10,570 examples.
dataset = load_dataset("squad")

# Step 2: Load the tokenizer and the BERT model (use smaller model for faster training)
# Tokenizer and model are both loaded from the same checkpoint id.
model_id = "bert-base-uncased"  # Use smaller model
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE: per the recorded output, `qa_outputs.weight` / `qa_outputs.bias` are not in
# the checkpoint and are newly initialized, so the model has to be trained before
# its predictions are usable.
model = AutoModelForQuestionAnswering.from_pretrained(model_id)

# Step 3: Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of SQuAD examples and attach token-level answer labels.

    The question-answering head is trained on *token* indices into the encoded
    (question, context) pair, not on character offsets into the raw context.
    The annotated character span of the first answer is therefore projected
    onto token indices using the tokenizer's offset mapping.

    Args:
        examples: A batch (dict of lists) with "question", "context" and
            "answers" columns, where each "answers" entry holds parallel
            "text" and "answer_start" lists.

    Returns:
        The tokenizer output for the batch, padded/truncated to 512 tokens,
        with two extra keys, "start_positions" and "end_positions": one
        inclusive token index per example. Examples with no answer, or whose
        answer was cut off by truncation, are labelled (0, 0), i.e. the first
        token of the encoded input.

    Note:
        Offset mappings and `sequence_ids` require a fast tokenizer.
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",  # truncate the context only, never the question
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True,
    )

    # The offsets are only needed to compute the labels; drop them so the
    # returned batch holds nothing but model inputs and labels.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answers = examples["answers"][i]

        # No annotated answer: point both labels at the first token.
        if not answers["text"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Use the annotated character offset rather than `context.find(...)`,
        # which returns the first occurrence even when a later one was labelled.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])  # exclusive

        # sequence_ids: None for special/padding tokens, 0 for the question,
        # 1 for the context.
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]

        if not context_tokens:
            start_positions.append(0)
            end_positions.append(0)
            continue

        context_start, context_end = context_tokens[0], context_tokens[-1]

        # Answer not fully inside the (possibly truncated) context window.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that starts at or before the answer start.
        token_start = context_start
        while token_start <= context_end and offsets[token_start][0] <= start_char:
            token_start += 1
        start_positions.append(token_start - 1)

        # First context token (scanning from the right) that ends at or after
        # the answer end.
        token_end = context_end
        while token_end >= context_start and offsets[token_end][1] >= end_char:
            token_end -= 1
        end_positions.append(token_end + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Apply preprocessing
# Both splits go through the same batched mapper: training split first, then validation.
train_dataset = dataset["train"].map(function=preprocess_function, batched=True)
validation_dataset = dataset["validation"].map(function=preprocess_function, batched=True)

# Step 4: Apply LoRA (PEFT) to the model
# No `target_modules` is passed, so the choice of adapted layers is left to PEFT.
# NOTE(review): confirm the library defaults cover the intended BERT layers.
lora_config = LoraConfig(
    r=8,  # Low-rank decomposition rank
    lora_alpha=32,  # Scaling factor for LoRA
    lora_dropout=0.1,  # Dropout rate for LoRA
    task_type="QUESTION_ANS",  # LoRA task type for Question Answering
)

# Get the LoRA model
# `model` is rebound to the wrapped model, so this statement is not idempotent:
# re-running it without reloading the base model passes the already-wrapped
# model back into `get_peft_model`.
model = get_peft_model(model, lora_config)

# Step 5: Define the training arguments
# Effective batch size per optimizer step is
# per_device_train_batch_size * gradient_accumulation_steps = 4 * 2 = 8 per device.
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for the model checkpoints
    evaluation_strategy="epoch",  # Evaluate after each epoch
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    num_train_epochs=2,  # Number of epochs (adjust as needed)
    weight_decay=0.01,  # Weight decay for regularization
    save_steps=10_000,  # Save checkpoint every 10k steps
    logging_dir="./logs",  # Directory for storing logs
    logging_steps=500,  # Log every 500 steps
    # Mixed precision only when a CUDA GPU is present: an unconditional fp16=True
    # makes TrainingArguments raise on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,  # Simulate larger batch size
)

# Step 6: Create the Trainer object
# Wires together the LoRA-wrapped `model`, `training_args`, and the two splits
# produced by mapping `preprocess_function` over the dataset.
trainer = Trainer(
    model=model,  # Model for fine-tuning
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=validation_dataset,  # Validation dataset
    tokenizer=tokenizer,  # Handed to the Trainer; the datasets are already tokenized above, so its role here is presumably batching/checkpointing (TODO confirm)
)

# Step 7: Start training
# Long-running step: trains for the `num_train_epochs` configured in `training_args`.
trainer.train()

# Step 8: Save the fine-tuned model
# Model and tokenizer are written to the same directory.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this presumably
# stores the LoRA adapter rather than a full BERT checkpoint; confirm before reloading.
model.save_pretrained("./fine_tuned_lora_bert_squad")
tokenizer.save_pretrained("./fine_tuned_lora_bert_squad")

# Step 9: Evaluate the model
# Runs on the `eval_dataset` given to the Trainer and prints the returned metrics.
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Step 10: Use the fine-tuned model for Question Answering
def answer_question(question, context):
    """Extract the answer to `question` from `context` with the fine-tuned model.

    Args:
        question: The question text.
        context: The passage the answer is extracted from. Contexts that do not
            fit into 512 tokens together with the question are truncated.

    Returns:
        The decoded answer span as a string. An empty string is returned when
        the predicted end token precedes the predicted start token.
    """
    # After training the model may live on the GPU; the inputs must be on the
    # same device as the model's parameters.
    device = next(model.parameters()).device

    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",  # keep the question, trim the context if needed
        max_length=512,
    )
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference: disable dropout and gradient tracking, then restore the mode
    # the model was in so calling this helper does not alter training state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    # The model returns an output object; the scores are read by attribute.
    # Tuple-unpacking that object would yield its keys, not the tensors.
    answer_start = int(outputs.start_logits.argmax())
    answer_end = int(outputs.end_logits.argmax())

    # Find the start and end of the answer span
    if answer_end < answer_start:
        return ""

    # Decode the answer
    answer_ids = inputs["input_ids"][0][answer_start:answer_end + 1].tolist()
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
    return answer

# Example usage: Answering a question using the fine-tuned LoRA model
# Demo inputs: a question and a short passage that contains its answer.
question = "What is Hugging Face creating?"
context = "Hugging Face is creating a tool that democratizes AI."

# Run extractive QA on the pair and show the predicted span.
answer = answer_question(question=question, context=context)
print(f"Answer: {answer}")


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

KeyboardInterrupt: 