In [None]:
# All imports live in this first cell so that the notebook survives
# "Restart Kernel -> Run All": this cell and the ones that follow use os, wandb,
# torch, datasets and transformers names before any other cell imports them.
import os
from datetime import datetime

import torch
import transformers
import wandb
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments

# Authenticate with Weights & Biases before any run is created.
wandb.login()

# Runs are grouped under this W&B project; an empty string leaves the
# WANDB_PROJECT environment variable untouched.
wandb_project = "pm-classify-finetune"
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project


In [None]:
# CUDA / allocator settings, exported before the model is loaded.
# - CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous, which gives accurate
#   stack traces for CUDA errors. NOTE(review): it also slows training; confirm it
#   is still wanted outside of debugging.
# - PYTORCH_CUDA_ALLOC_CONF caps the allocator's split size at 128 MB;
#   'expandable_segments:True' is the alternative value that was tried here.
# To pin the run to one GPU, additionally set CUDA_VISIBLE_DEVICES (e.g. "0").
os.environ.update({
    "CUDA_LAUNCH_BLOCKING": "1",
    "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128",
})

In [None]:
# Both JSONL files are read with the generic JSON loader. Each call is given a
# single data file and asks for its 'train' split, so the validation set also
# comes back via split='train'.
train_dataset = load_dataset(
    'json',
    data_files='./train_data.jsonl',
    split='train',
)
val_dataset = load_dataset(
    'json',
    data_files='./test_data.jsonl',
    split='train',
)

In [None]:
# Base checkpoint that gets fine-tuned.
model_id = "Equall/Saul-Instruct-v1"

# Quantization settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# NOTE(review): trust_remote_code=True lets the hub repository run its own Python
# code at load time — confirm this checkpoint actually needs it.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Training-time model settings: the generation cache is switched off,
# pretraining_tp is forced to 1 and gradient checkpointing is enabled.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()


In [None]:
# Tokenizer matching the base checkpoint; sequences are padded on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.padding_side = 'right'

# Prefer a pad token that is distinct from EOS. A collator that masks every label
# equal to pad_token_id (as DataCollatorForLanguageModeling does with mlm=False)
# would otherwise also mask each real EOS, so the model would never be trained to
# emit the end-of-sequence token. Fall back to EOS only when the tokenizer has no
# unknown token to borrow.
if tokenizer.unk_token is not None:
    tokenizer.pad_token = tokenizer.unk_token
else:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.add_eos_token = True

import matplotlib.pyplot as plt

def plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset):
    """Plot a histogram of `input_ids` lengths over both tokenized datasets.

    Args:
        tokenized_train_dataset: iterable of rows, each with an 'input_ids' entry.
        tokenized_val_dataset: iterable of rows, each with an 'input_ids' entry.

    Prints the total number of rows measured, then shows the figure.
    """
    token_counts = []
    # Training rows first, validation rows second.
    for dataset in (tokenized_train_dataset, tokenized_val_dataset):
        token_counts.extend(len(row['input_ids']) for row in dataset)
    print(len(token_counts))

    # One 20-bin histogram covering both datasets.
    plt.figure(figsize=(10, 6))
    plt.hist(token_counts, bins=20, alpha=0.7, color='blue')
    plt.title('Distribution of Lengths of input_ids')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
def clean_text(text):
    """Normalize a raw context string into single-spaced text.

    Literal escape sequences left in the data (a backslash followed by ``t``, and
    a backslash followed by a double quote) are unescaped first; afterwards every
    run of whitespace (spaces, real tabs, real newlines) is collapsed into one
    space and leading/trailing whitespace is dropped.

    Args:
        text: the string to clean, or None.

    Returns:
        The cleaned string; an empty string when `text` is None.
    """
    if text is None:
        return ''

    # Unescape before collapsing whitespace: the space that replaces a literal
    # "\t" must itself take part in the whitespace normalization below.
    text = text.replace('\\t', ' ')
    text = text.replace('\\"', '"')

    # split() with no arguments splits on any whitespace run, so real newlines
    # and tabs are handled here without a separate replace.
    return ' '.join(text.split())

In [None]:
def truncate_and_tokenize(example, max_len=2000):
    """Turn one dataset row into a fixed-length causal-LM training example.

    The sequence layout is: BOS, question tokens, context tokens, answer tokens,
    EOS, then padding up to `max_len`. When the row is too long the context is
    shortened first; if question and answer alone do not fit, the question is
    shortened next and the answer last, so the result is always exactly
    `max_len` tokens long.

    Args:
        example: mapping with 'Prompt', 'Response' and 'Context' entries.
        max_len: total length of every returned sequence (default 2000).

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list of
        `max_len` ints. Padding positions have attention 0 and label -100.

    Raises:
        ValueError: if `max_len` is too small to hold BOS and EOS.

    Relies on the notebook-level `tokenizer` and `clean_text`.
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold BOS and EOS")

    question = f"### Question: {example['Prompt']}\n"
    answer = f"\n### Answer: {example['Response']}"

    q_tokens = tokenizer(question, add_special_tokens=False)['input_ids']
    a_tokens = tokenizer(answer, add_special_tokens=False)['input_ids']
    c_tokens = tokenizer(clean_text(example['Context']), add_special_tokens=False)['input_ids']

    # Two slots are reserved for BOS and EOS. Every slice bound below is >= 0 by
    # construction; a negative bound would silently mean "drop the last k tokens"
    # rather than "keep none" and would let the sequence grow past max_len.
    budget = max_len - 2
    a_tokens = a_tokens[:budget]
    q_tokens = q_tokens[:budget - len(a_tokens)]
    c_tokens = c_tokens[:budget - len(a_tokens) - len(q_tokens)]

    input_ids = (
        [tokenizer.bos_token_id] +
        q_tokens +
        c_tokens +
        a_tokens +
        [tokenizer.eos_token_id]
    )

    # Right-pad to max_len. Padding is hidden from attention and labelled -100,
    # the value the loss ignores (a collator that rebuilds labels from input_ids
    # may replace these labels — TODO confirm against the collator in use).
    padding_length = max_len - len(input_ids)
    attention_mask = [1] * len(input_ids) + [0] * padding_length
    labels = input_ids + [-100] * padding_length
    input_ids = input_ids + [tokenizer.pad_token_id] * padding_length

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

In [None]:
# Tokenize every row of both datasets. The original columns are dropped so that
# only the fields returned by truncate_and_tokenize remain.
tok_train_dataset = train_dataset.map(
    truncate_and_tokenize,
    remove_columns=train_dataset.column_names,
)
tok_val_dataset = val_dataset.map(
    truncate_and_tokenize,
    remove_columns=val_dataset.column_names,
)

In [None]:
# Import statements
import transformers
import os
import wandb
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
import torch
from datetime import datetime
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

def print_trainable_parameters(model):
    """Print how many of `model`'s parameters are trainable.

    Args:
        model: object whose `named_parameters()` yields (name, parameter) pairs;
            each parameter must provide `numel()` and `requires_grad`.

    Prints the trainable count, the total count and the trainable percentage.
    A model without any parameters reports 0.0 percent instead of raising
    ZeroDivisionError.
    """
    trainable_parameters = 0
    all_parameters = 0
    for _name, param in model.named_parameters():
        count = param.numel()
        all_parameters += count
        if param.requires_grad:
            trainable_parameters += count

    # Guard the division so an empty model does not crash the cell.
    percentage = 100 * trainable_parameters / all_parameters if all_parameters else 0.0
    print(f"trainable_parameters: {trainable_parameters} || all_parameters: {all_parameters} || trainable: {percentage}" )

In [None]:
# Prepare the quantized model for k-bit training, then wrap it with LoRA adapters.
model = prepare_model_for_kbit_training(model)

# LoRA hyper-parameters: rank 32, alpha 64, 5% dropout, no bias terms trained.
# Adapters are attached to every module whose name is listed in target_modules.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
)

model = get_peft_model(model, config)
# Call print_trainable_parameters(model) here to inspect the adapter size.

In [None]:
# Base name of the run; a minute-resolution timestamp is appended for the tracker.
run_name = "saul-classification-ft"

training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir="./saul-classification-ft",
    logging_dir="./logs",
    # Batch shape: 2 sequences per device x 4 accumulation steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Optimisation schedule.
    max_steps=200,
    warmup_steps=2,
    learning_rate=2.5e-5,
    max_grad_norm=0.3,
    optim="paged_adamw_8bit",
    bf16=True,
    # Log, evaluate and checkpoint every 25 steps; keep at most 3 checkpoints.
    logging_steps=25,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=3,
    # Experiment tracking.
    report_to="wandb",
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
)

# Causal-LM collation (mlm=False).
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_train_dataset,
    eval_dataset=tok_val_dataset,
    data_collator=data_collator,
)

# Release cached GPU memory held by the allocator before training starts.
torch.cuda.empty_cache()

trainer.train()

In [None]:
# Write the model held by the trainer to the local "saul-ft-200/" directory.
trainer.model.save_pretrained("saul-ft-200/")
# use_cache was switched off when the model was loaded for training; it is turned
# back on here (presumably for later generation — confirm).
model.config.use_cache = True

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub repository below.
# `push_to_hub` is a method of the model object, so nothing has to be imported
# from huggingface_hub for this call (that package exports no top-level
# `push_to_hub`, and importing it by that name fails before the upload starts).
model.push_to_hub("prx2sam/saul-ft-200")

print("Model uploaded to huggingface.")