In [None]:
!pip install bitsandbytes


In [None]:
# %%capture
# import torch
# major_version, minor_version = torch.cuda.get_device_capability()

# # Must install separately since Colab has torch 2.2.1, which breaks packages
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# if major_version >= 8:
#     # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
#     !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
# else:
#     # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
#     !pip install --no-deps xformers trl peft accelerate bitsandbytes


In [None]:
import pandas as pd


In [None]:
import torch

In [None]:
!pip install datasets

In [None]:
!pip install unsloth

In [None]:
!pip install huggingface_hub

In [None]:
# Mount Google Drive (skip this if already mounted)
from google.colab import drive
drive.mount('/content/drive')

# Load the JSON file from Google Drive
import json
json_file_path = '/content/drive/MyDrive/qna_triplets.json'

# JSON text is UTF-8 by specification; name the encoding explicitly so the
# read does not depend on the runtime's locale default.
with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Check the structure of your data (optional)
# print(data[:1])  # Print the first item to check structure

# Convert the loaded records to a Hugging Face Dataset
from datasets import Dataset

# 'data' is expected to be a list of dictionaries (one per example; the
# formatting step later reads the "question", "context" and "answer" fields).
# It is turned into a DataFrame first and then wrapped with Dataset.from_pandas.
# `pd` comes from the pandas import cell earlier in the notebook.
dataset = Dataset.from_pandas(pd.DataFrame(data))

from unsloth import FastLanguageModel

# Loader knobs. They stay as module-level names because max_seq_length is
# reused later when the trainer is built.
max_seq_length = 2048  # Sequence length handed to the loader and the trainer
dtype = None  # None = let the loader auto-detect (float16 on T4/V100, bfloat16 on Ampere+)
load_in_4bit = True  # 4-bit quantization to cut memory use; may be set to False

# Base checkpoint: the pre-quantized 4-bit Llama-2 7B published by unsloth.
base_model_settings = dict(
    model_name="unsloth/llama-2-7b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_settings)

# LoRA adapter configuration applied on top of the base model.
lora_settings = dict(
    r=16,  # Any value > 0; suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)

# Define your prompt template
# The template has three positional `{}` slots which formatting_prompts_func
# fills in this order: question, context, answer. Line breaks and blank lines
# inside the literal are part of the prompt text, so edit them deliberately.
alpaca_prompt = """Below is a question paired with a context. Write an answer based on the context.

### Question:
{}

### Context:
{}

### Answer:
{}"""

# End-of-sequence string taken from the loaded tokenizer; it is appended to
# every formatted training example.
EOS_TOKEN = tokenizer.eos_token  # Set the EOS token

# Formatting function to convert data to the correct prompt format
def formatting_prompts_func(examples):
    """Render a batch of examples into prompt strings.

    Args:
        examples: mapping with parallel sequences under the keys
            "question", "context" and "answer" (a batched `Dataset.map` input).

    Returns:
        A dict with a single key "text" holding one string per example:
        `alpaca_prompt` filled with (question, context, answer) followed by
        `EOS_TOKEN`.
    """
    triplets = zip(examples["question"], examples["context"], examples["answer"])
    return {
        "text": [
            alpaca_prompt.format(q, ctx, ans) + EOS_TOKEN
            for q, ctx, ans in triplets
        ]
    }

from trl import SFTTrainer
from transformers import TrainingArguments

# Add a "text" column to the dataset by running every batch through the
# prompt formatter.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Mixed precision: bf16 when the GPU reports support for it, fp16 otherwise.
bf16_available = torch.cuda.is_bf16_supported()

# Optimisation hyper-parameters for the short (20-step) fine-tuning run.
training_args = TrainingArguments(
    output_dir="outputs",
    run_name="run1",
    seed=3407,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=20,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    fp16=not bf16_available,
    bf16=bf16_available,
    logging_steps=1,
)

# Supervised fine-tuning trainer reading the prompts from the "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

# Run training and keep the returned statistics object.
trainer_stats = trainer.train()


In [None]:
# Interactive Hugging Face login: the CLI prompts for an access token
# (the typed input is not echoed).
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 

In [None]:
# Write a GGUF export of the fine-tuned model (f16 method) under "model".
model.save_pretrained_gguf(
    "model",
    tokenizer,
    quantization_method="f16",
)


In [None]:
import os
from getpass import getpass

# SECURITY: a Hugging Face access token used to be hardcoded on this line.
# Anything saved in a notebook must be treated as leaked, so that token should
# be revoked at https://huggingface.co/settings/tokens and replaced.
# The token is now read from the HF_TOKEN environment variable, with a hidden
# interactive prompt as fallback, so it never ends up in the notebook file.
access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

# Upload the f16 GGUF export to the "debika/model" repository on the Hub.
model.push_to_hub_gguf(
    "debika/model",
    tokenizer=tokenizer,
    quantization_method="f16",
    token=access_token
)


In [None]:
# model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
# access_token = "<HF_TOKEN>"  # redacted: never commit a real token; load it from an environment variable or getpass
# model.push_to_hub_gguf(
#     "debika/model_final",
#     tokenizer=tokenizer,
#     quantization_method="q4_k_m",
#     token=access_token
# )

In [None]:
# # Define paths
# model_save_path = '/content/drive/MyDrive/llama_7b_model_version2'
# tokenizer_save_path = '/content/drive/MyDrive/llama_7b_tokenizer_version2'
# hf_model_repo = "debika/model_ig"

# # Save merged model and tokenizer
# model.save_pretrained_merged(model_save_path, tokenizer, save_method="merged_16bit")
# tokenizer.save_pretrained(tokenizer_save_path)


# # Save to 16bit GGUF


# print("Model and tokenizer saved locally in merged format.")