In [1]:
# Import statements
import transformers
import os
import wandb
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
import torch
from datetime import datetime
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer

In [2]:
# Authenticate this session with Weights & Biases.
wandb.login()

# Runs are grouped under this project; an empty name leaves WANDB_PROJECT unset.
wandb_project = "pm-classify-finetune"
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33map4489[0m ([33map4489-columbia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# CUDA / PyTorch allocator settings, applied through environment variables.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is usually a debugging aid — confirm it
# is wanted for full training runs.
os.environ.update(
    {
        "CUDA_LAUNCH_BLOCKING": "1",
        "CUDA_VISIBLE_DEVICES": "0,1,2,3",
        # Alternative allocator setting tried previously: 'expandable_segments:True'
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128",
    }
)

In [None]:
# Load the raw JSONL files as datasets (every record is requested under the
# 'train' split name, including the held-out file).
# NOTE(review): both names are reassigned by the later cell that builds the
# `formatted_chat` datasets, so the raw columns are only available until then.
train_dataset = load_dataset(
    'json',
    data_files='./train_data.jsonl',
    split='train',
)
val_dataset = load_dataset(
    'json',
    data_files='./test_data.jsonl',
    split='train',
)

In [4]:
import pandas as pd

# Read the same JSONL files into DataFrames (one JSON object per line).
train_df = pd.read_json(
    './train_data.jsonl',
    lines=True,
)
val_df = pd.read_json(
    './test_data.jsonl',
    lines=True,
)

In [5]:
def _build_chat_dataset(frame, max_context_chars=10000):
    """Turn a Prompt/Context/Response frame into a single-column chat dataset.

    Each row becomes a two-turn conversation: the user turn is the prompt
    immediately followed by the first ``max_context_chars`` characters of the
    context (no separator is inserted), and the assistant turn is the response.

    Args:
        frame: DataFrame exposing ``Prompt``, ``Context`` and ``Response``
            columns of strings.
        max_context_chars: Character cap applied to each context before it is
            appended to the prompt.

    Returns:
        A ``Dataset`` with one column, ``formatted_chat``, holding the
        conversations in row order.
    """
    conversations = [
        [
            {"role": "user", "content": prompt + context[:max_context_chars]},
            {"role": "assistant", "content": response},
        ]
        for prompt, context, response in zip(
            frame.Prompt.values, frame.Context.values, frame.Response.values
        )
    ]
    return Dataset.from_dict({"formatted_chat": conversations})


# Same construction for both splits, so the logic lives in one helper instead
# of two copy-pasted loops.
train_dataset = _build_chat_dataset(train_df)
val_dataset = _build_chat_dataset(val_df)

In [None]:
# Checkpoint that is fine-tuned in this notebook.
model_id = "Equall/Saul-Instruct-v1"

# 4-bit NF4 quantisation with double quantisation and a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Training-time settings: no KV cache, and gradient checkpointing switched on.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

In [8]:
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

# The EOS token doubles as the padding token; padding goes on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

tokenizer.add_eos_token = True
# NOTE(review): `set_default_template` is not a tokenizer option I can confirm;
# this assignment may have no effect — verify before relying on it.
tokenizer.set_default_template = False

In [9]:
def clean_text(text):
    """Normalise a raw text field before tokenisation.

    Literal two-character escape sequences (a backslash followed by ``t`` or
    by a double quote) are unescaped first; whitespace is collapsed last so the
    replacements cannot leave runs of spaces or trailing spaces behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string: every run of whitespace (spaces, tabs, newlines)
        is reduced to a single space and leading/trailing whitespace is gone.
    """
    # These match a backslash plus a character, not real control characters.
    # NOTE(review): a literal backslash-n sequence is left untouched, as in the
    # original — confirm whether the data can contain it.
    text = text.replace('\\t', ' ')
    text = text.replace('\\"', '"')

    # str.split() with no arguments splits on any whitespace, so real newlines
    # and tabs are covered here without a separate replace.
    return ' '.join(text.split())

In [8]:
def truncate_and_tokenize(example):
    """Convert one raw example into fixed-length, chat-formatted model inputs.

    The prompt, context and response are cleaned with ``clean_text``. The
    context is then cut (in tokens) to the room left in ``max_len`` once the
    prompt, the response and the chat-template scaffolding are accounted for,
    so that truncation falls on the context rather than on the response.
    The conversation is rendered with the tokenizer's chat template and
    tokenised to exactly ``max_len`` tokens (padded or truncated).

    Args:
        example: Mapping with string fields ``"Prompt"``, ``"Context"`` and
            ``"Response"``.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        ``labels`` mirrors ``input_ids`` except that padded positions are set
        to -100 (the conventional ignore index for the LM loss).
    """
    max_len = 2000
    # Slack for tokens that merge or split differently once the context is
    # spliced back into the rendered conversation.
    safety_margin = 2

    prompt = clean_text(example["Prompt"])
    context = clean_text(example["Context"])
    response = clean_text(example["Response"])

    def render_chat(context_text):
        # Render the two-turn conversation for a given context string.
        messages = [
            {"role": "user", "content": f"{prompt}\n{context_text}"},
            {"role": "assistant", "content": response},
        ]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

    # Length of everything that is NOT context: prompt, response and whatever
    # tokens the chat template itself contributes.
    scaffold_len = len(tokenizer(render_chat(""), add_special_tokens=False)["input_ids"])

    # Clamp at zero: a negative bound would turn the slice below into
    # "drop the last N tokens" instead of "keep nothing".
    available_tokens = max(0, max_len - scaffold_len - safety_margin)

    context_tokens = tokenizer(context, add_special_tokens=False)["input_ids"][:available_tokens]
    truncated_context = tokenizer.decode(context_tokens, skip_special_tokens=True)

    # The rendered template is expected to carry its own special tokens, so
    # they are not added a second time here.
    # NOTE(review): assumes this model's chat template emits BOS/EOS itself —
    # confirm against the checkpoint's template.
    tokens = tokenizer(
        render_chat(truncated_context),
        add_special_tokens=False,
        padding="max_length",
        truncation=True,
        max_length=max_len,
    )

    # pad_token may equal eos_token, so padding is identified through the
    # attention mask rather than by token id.
    labels = [
        token_id if is_real else -100
        for token_id, is_real in zip(tokens["input_ids"], tokens["attention_mask"])
    ]

    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"],
        "labels": labels,
    }


In [9]:
# Tokenise from the raw Prompt/Context/Response rows. `train_dataset` and
# `val_dataset` are rebound earlier in the notebook to datasets that only carry
# a `formatted_chat` column, which `truncate_and_tokenize` cannot read, so the
# raw DataFrames are wrapped here instead of relying on cell execution order.
raw_train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
raw_val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

tok_train_dataset = raw_train_dataset.map(
    truncate_and_tokenize,
    remove_columns=raw_train_dataset.column_names,
)
tok_val_dataset = raw_val_dataset.map(
    truncate_and_tokenize,
    remove_columns=raw_val_dataset.column_names,
)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 648/648 [00:19<00:00, 33.77 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 163/163 [00:05<00:00, 31.83 examples/s]


In [10]:
# Prepare the quantised model for k-bit training before attaching adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapters (rank 32, alpha 64) on the listed projection modules and the LM head.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "lm_head",
    ],
)

# NOTE(review): `model` is rebound in place, so re-running this cell wraps an
# already-wrapped model — restart from the model-loading cell instead.
model = get_peft_model(model, config)
# Optional: report trainable parameter counts, e.g. model.print_trainable_parameters()

In [None]:
run_name = "saul-classification-ft"

training_args = SFTConfig(
    # --- bookkeeping ---
    output_dir="./saul-classification",
    logging_dir="./logs",
    report_to="wandb",
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    # --- optimisation schedule ---
    max_steps=50,
    warmup_steps=2,
    learning_rate=2.5e-5,
    max_grad_norm=0.3,
    optim="paged_adamw_8bit",
    # --- batching / memory (2 per device x 4 accumulation steps) ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    bf16=True,
    # NOTE(review): user turns can carry up to 10,000 context characters (see
    # the cell that builds `formatted_chat`); with a 512-token cap the
    # assistant turn may be truncated away — confirm this is intended.
    max_seq_length=512,
    # --- log, evaluate and checkpoint every 25 steps ---
    logging_steps=25,
    do_eval=True,
    # NOTE(review): newer transformers releases name this `eval_strategy` —
    # check against the installed version.
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=3,
)

# mlm=False selects the causal (next-token) language-modelling collation.
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

def formatting_prompts_func(example):
    """Render an example's ``formatted_chat`` conversation as template text.

    Args:
        example: Mapping whose ``'formatted_chat'`` entry holds the chat
            messages to render.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` produces for that entry
        with ``tokenize=False`` and no generation prompt appended.
    """
    conversation = example['formatted_chat']
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return rendered

# Supervised fine-tuning on the chat-formatted datasets; each example is
# rendered to text through `formatting_prompts_func`.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    formatting_func=formatting_prompts_func,
    data_collator=data_collator,
)

# Release cached allocator memory before the training loop starts.
torch.cuda.empty_cache()

trainer.train()

In [None]:
run_name = "saul-classification-ft"

training_args = TrainingArguments(
    # --- bookkeeping ---
    output_dir="./saul-classification-ft",
    logging_dir="./logs",
    report_to="wandb",
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    # --- optimisation schedule ---
    max_steps=200,
    warmup_steps=2,
    learning_rate=2.5e-5,
    max_grad_norm=0.3,
    optim="paged_adamw_8bit",
    # --- batching / memory (2 per device x 4 accumulation steps) ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    bf16=True,
    # --- log, evaluate and checkpoint every 25 steps ---
    logging_steps=25,
    do_eval=True,
    # NOTE(review): newer transformers releases name this `eval_strategy` —
    # check against the installed version.
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    save_total_limit=3,
)

# mlm=False selects the causal (next-token) language-modelling collation.
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Plain Trainer run over the pre-tokenised datasets.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_train_dataset,
    eval_dataset=tok_val_dataset,
    data_collator=data_collator,
)

# Release cached allocator memory before the training loop starts
# (a torch.cuda.synchronize() call was tried here previously and left disabled).
torch.cuda.empty_cache()

trainer.train()

In [None]:
from huggingface_hub import HfApi

# One path for both saving and uploading: the original saved to "saul-ft-200/"
# but uploaded from "../saul-ft-200/", i.e. a different directory.
adapter_dir = "saul-ft-200/"

trainer.model.save_pretrained(adapter_dir)
# Re-enable the KV cache that was switched off for training.
model.config.use_cache = True

# Read the token from the environment rather than pasting it into the
# notebook; when HF_TOKEN is unset, None lets huggingface_hub fall back to
# locally stored credentials.
hf_token = os.environ.get("HF_TOKEN") or None
api = HfApi(token=hf_token)
api.upload_folder(
    repo_id="prx2sam/saul-ft-200",
    folder_path=adapter_dir,
)