In [1]:
!pip install bitsandbytes




In [10]:
# %%capture
# import torch
# major_version, minor_version = torch.cuda.get_device_capability()

# # Must install separately since Colab has torch 2.2.1, which breaks packages
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# if major_version >= 8:
#     # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
#     !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
# else:
#     # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
#     !pip install --no-deps xformers trl peft accelerate bitsandbytes


In [2]:
import pandas as pd


In [5]:
import torch

In [6]:
# Mount Google Drive so the training data is reachable from the Colab VM
# (if the drive is already mounted, Colab only prints a notice).
from google.colab import drive
drive.mount('/content/drive')

# Load the JSON file from Google Drive
import json
json_file_path = '/content/drive/MyDrive/qa_data.json'

# JSON text is UTF-8; pass the encoding explicitly so decoding does not
# depend on the locale of the runtime.
with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Check the structure of your data (optional)
# print(data[:1])  # Print the first item to check structure

# Convert the list of dictionaries to a Hugging Face Dataset
from datasets import Dataset

# 'data' is a list of per-example dictionaries; going through a pandas
# DataFrame turns each dictionary key ("question", "context", "answer" are
# the ones read later) into a dataset column.
dataset = Dataset.from_pandas(pd.DataFrame(data))

from unsloth import FastLanguageModel

# Loading options for the base model. max_seq_length is reused further down
# as the SFTTrainer's max_seq_length.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the pre-quantized 4-bit Llama-2 7B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-2-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Wrap the base model with LoRA adapters; `model` is rebound to the wrapped
# model, which is what gets trained and saved later.
# target_modules lists the projection layers that receive adapters
# (presumably the attention q/k/v/o and MLP gate/up/down projections of Llama).
# random_state uses the same value (3407) as the trainer seed below.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Prompt template with three positional `{}` slots. formatting_prompts_func
# fills them in order with the question, the context, and the answer, so the
# slot order here must match the argument order of its .format() call.
alpaca_prompt = """Below is a question paired with a context. Write an answer based on the context.

### Question:
{}

### Context:
{}

### Answer:
{}"""

# End-of-sequence token string taken from the tokenizer; it is appended to
# every formatted training example.
EOS_TOKEN = tokenizer.eos_token  # Set the EOS token

# Formatting function to convert data to the correct prompt format
def formatting_prompts_func(examples):
    """Render a batch of examples into complete training prompts.

    Args:
        examples: Batched mapping with parallel "question", "context" and
            "answer" sequences.

    Returns:
        A dict with a single "text" key holding one string per example: the
        prompt template filled with (question, context, answer), followed by
        the EOS token.
    """
    columns = (examples["question"], examples["context"], examples["answer"])
    return {
        "text": [alpaca_prompt.format(*row) + EOS_TOKEN for row in zip(*columns)]
    }

# Add a "text" column holding the fully formatted prompt for every example
dataset = dataset.map(formatting_prompts_func, batched=True)
from trl import SFTTrainer
from transformers import TrainingArguments

# Mixed-precision mode: bf16 when the GPU reports support for it, fp16 otherwise
bf16_supported = torch.cuda.is_bf16_supported()

# Optimisation, schedule and logging settings for the fine-tuning run
training_args = TrainingArguments(
    output_dir="outputs",
    run_name="run1",
    seed=3407,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    fp16=not bf16_supported,
    bf16=bf16_supported,
    logging_steps=1,
)

# Supervised fine-tuning trainer reading the prompts from the "text" column
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
)

# Run the fine-tuning loop and keep the returned training statistics
trainer_stats = trainer.train()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
==((====))==  Unsloth 2024.12.3: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/15 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 15 | Num Epochs = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 39,976,960
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,2.3252
2,2.4194
3,2.3233
4,2.3529
5,2.2729
6,2.14
7,2.0698
8,1.9355
9,1.8705
10,1.6754


In [7]:
# Destination directories on Google Drive for the fine-tuned model and the tokenizer
model_save_path = '/content/drive/MyDrive/llama_7b_model'
tokenizer_save_path = '/content/drive/MyDrive/llama_7b_tokenizer'

# Save the model and tokenizer
# NOTE(review): `model` is the LoRA-wrapped model returned by get_peft_model, so
# this presumably writes only the adapter weights rather than the full 7B model
# (the later upload from this directory is a 160M adapter_model.safetensors) — confirm.
model.save_pretrained(model_save_path)
# Last expression of the cell: its return value (the written tokenizer file
# paths) is what the cell displays.
tokenizer.save_pretrained(tokenizer_save_path)

('/content/drive/MyDrive/llama_7b_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/llama_7b_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/llama_7b_tokenizer/tokenizer.model',
 '/content/drive/MyDrive/llama_7b_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/llama_7b_tokenizer/tokenizer.json')

Pushing to the Hugging Face Hub

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login

# Login to Hugging Face Hub (if you haven't already)
# Interactive: renders a login widget in the cell output.
login()

# Define your paths
# These are the same Drive directories the save cell wrote to; they are repeated
# here so this cell does not depend on that cell's variables.
model_save_path = '/content/drive/MyDrive/llama_7b_model'
tokenizer_save_path = '/content/drive/MyDrive/llama_7b_tokenizer'

# Load your model and tokenizer
# NOTE(review): this rebinds `model` and `tokenizer` from the in-memory training
# objects to fresh copies loaded from Drive; the in-memory objects could
# presumably be pushed directly — confirm before changing.
model = AutoModelForCausalLM.from_pretrained(model_save_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

# Push the model and tokenizer to Hugging Face Hub
# The model and the tokenizer go to two separate Hub repositories.
# NOTE(review): consumers presumably have to load the tokenizer from the
# "llama_7b_tokenizer" repo explicitly, since it is not uploaded next to the
# model files — confirm this split is intended.
model.push_to_hub("Yashavika/llama_7b_model")
tokenizer.push_to_hub("Yashavika/llama_7b_tokenizer")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

`low_cpu_mem_usage` was None, now default to True since model is quantized.


adapter_model.safetensors:   0%|          | 0.00/160M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Yashavika/llama_7b_tokenizer/commit/46d71740fc06c8a092cc835c059ba3b4c08c37b9', commit_message='Upload tokenizer', commit_description='', oid='46d71740fc06c8a092cc835c059ba3b4c08c37b9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Yashavika/llama_7b_tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='Yashavika/llama_7b_tokenizer'), pr_revision=None, pr_num=None)