In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import load_dataset, Dataset,DatasetDict
import os 


In [2]:
os.environ["WANDB_API_KEY"] = "e2ab1b2b4244272268524960c98f9a9e6a5decd6"
os.environ["WANDB_PROJECT"]="ft"
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# import wandb

# sweep_configuration = {
#     "method": "random",
#     "name": "sweep",
#     "metric": {"goal": "minimize", "name": "score"},
#     "parameters": {
#         # "batch_size": {"values": [16, 32, 64]},
#         "epochs": {"values": [2, 4, 6]},
#         "lr": {"max": 5e-4, "min": 1e-5},
#     },
# }

# sweep_id = wandb.sweep(sweep=sweep_configuration, project="ft")

In [4]:
modelpath=r"/tsukimi/llm/Meta-Llama-3-8B/"
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)   
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
train_dataset = load_dataset("Amirkid/MedQuad-dataset")
all_train_data = []
for i in range(0,len(train_dataset["train"]),2):
    all_train_data.append(f'Question:\n{train_dataset["train"][i]["text"]} \n\nAnswer:\n{train_dataset["train"][i+1]["text"]}')

In [6]:
dataset = Dataset.from_dict({"text": all_train_data}).train_test_split(test_size=0.1)
# dataset['validation'] = dataset['test']
# del dataset['test']

In [7]:
# test_data_path = '/workdir/MedQA/data/test_set.csv'
# test_dataset = load_dataset("csv", data_files=test_data_path)['train']

# all_test_data = []
# for test_data in test_dataset:
#     all_test_data.append(f"Question:\n{test_data['question']} \n\nAnswer:\n{test_data['answer']}")
# dataset['test']= Dataset.from_dict({"text": all_test_data})


In [8]:
dateset_tokenized = dataset.map(
    lambda examples: tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512),
    batched=True, 
    num_proc=4,   
    remove_columns=["text"])  

dataset = dateset_tokenized

Map (num_proc=4):   0%|          | 0/14760 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1640 [00:00<?, ? examples/s]

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    modelpath,    
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    ),
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
config = LoraConfig(
    r=64, 
    lora_alpha=16, 
    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.1, 
    bias="none", 
    modules_to_save = ["lm_head", "embed_tokens"],		# needed because we added new tokens to tokenizer/model
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

In [11]:
from transformers.integrations import WandbCallback
import pandas as pd


def decode_predictions(tokenizer, predictions):
    labels = tokenizer.batch_decode(predictions.label_ids)
    logits = predictions.predictions.argmax(axis=-1)
    prediction_text = tokenizer.batch_decode(logits)
    return {"labels": labels, "predictions": prediction_text}


class WandbPredictionProgressCallback(WandbCallback):
    """Custom WandbCallback to log model predictions during training.

    This callback logs model predictions and labels to a wandb.Table at each 
    logging step during training. It allows to visualize the 
    model predictions as the training progresses.

    Attributes:
        trainer (Trainer): The Hugging Face Trainer instance.
        tokenizer (AutoTokenizer): The tokenizer associated with the model.
        sample_dataset (Dataset): A subset of the validation dataset 
          for generating predictions.
        num_samples (int, optional): Number of samples to select from 
          the validation dataset for generating predictions. Defaults to 100.
        freq (int, optional): Frequency of logging. Defaults to 2.
    """

    def __init__(self, trainer, tokenizer, val_dataset,
                 num_samples=100, freq=2):
        """Initializes the WandbPredictionProgressCallback instance.

        Args:
            trainer (Trainer): The Hugging Face Trainer instance.
            tokenizer (AutoTokenizer): The tokenizer associated 
              with the model.
            val_dataset (Dataset): The validation dataset.
            num_samples (int, optional): Number of samples to select from 
              the validation dataset for generating predictions.
              Defaults to 100.
            freq (int, optional): Frequency of logging. Defaults to 2.
        """
        super().__init__()
        self.trainer = trainer
        self.tokenizer = tokenizer
        self.sample_dataset = val_dataset.select(range(num_samples))
        self.freq = freq

    def on_evaluate(self, args, state, control, **kwargs):
        super().on_evaluate(args, state, control, **kwargs)
        # control the frequency of logging by logging the predictions
        # every `freq` epochs
        if state.epoch % self.freq == 0:
            # generate predictions
            predictions = self.trainer.predict(self.sample_dataset)
            # decode predictions and labels
            predictions = decode_predictions(self.tokenizer, predictions)
            # add predictions to a wandb.Table
            predictions_df = pd.DataFrame(predictions)
            predictions_df["epoch"] = state.epoch
            records_table = self._wandb.Table(dataframe=predictions_df)
            # log the table to wandb
            self._wandb.log({"sample_predictions": records_table})


In [12]:
def collate(elements):
    tokenlist=[e["input_ids"] for e in elements]
    tokens_maxlen=max([len(t) for t in tokenlist])

    input_ids,labels,attention_masks = [],[],[]
    for tokens in tokenlist:
        pad_len=tokens_maxlen-len(tokens)

        # pad input_ids with pad_token, labels with ignore_index (-100) and set attention_mask 1 where content otherwise 0
        input_ids.append( tokens + [tokenizer.pad_token_id]*pad_len )   
        labels.append( tokens + [-100]*pad_len )    
        attention_masks.append( [1]*len(tokens) + [0]*pad_len ) 

    batch={
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_masks)
    }
    return batch


In [13]:

bs=8      # batch size
ga_steps=1  # gradient acc. steps
epochs=5
steps_per_epoch=len(dataset["train"])//(bs*ga_steps)


args = TrainingArguments(
    output_dir="/tsukimi/llm/ft",
    report_to='wandb',  
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=steps_per_epoch,		# eval and save once per epoch  	
    save_steps=steps_per_epoch,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=epochs,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    learning_rate=0.0005,
    group_by_length=True,
    fp16=True,
    ddp_find_unused_parameters=False,
)

trainer = Trainer(
model=model,
tokenizer=tokenizer,
data_collator=collate,
train_dataset=dataset["train"],
eval_dataset=dataset["test"],
args=args,
)

progress_callback = WandbPredictionProgressCallback(
    trainer=trainer,
    tokenizer=tokenizer,
    val_dataset=dataset["test"],
    num_samples=10,
    freq=2,
)
trainer.add_callback(progress_callback)




Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:


trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mcalvinchai[0m ([33mcalvin-chai[0m). Use [1m`wandb login --relogin`[0m to force relogin




AssertionError: No inf checks were recorded for this optimizer.

In [None]:
model = model.merge_and_unload()

In [None]:
# def train(config=None):
#   with wandb.init(config=config):
#     # set sweep configuration
#     config = wandb.config

#     bs=16

#     # set training arguments
#     training_args = TrainingArguments(
#         output_dir="/tsukimi/llm/ft",
# 	report_to='wandb',  # Turn on Weights & Biases logging
#         num_train_epochs=config.epochs,
#         learning_rate=config.learning_rate,
#         # weight_decay=config.weight_decay,
#         per_device_train_batch_size=bs,
#         per_device_eval_batch_size=bs,
#         save_strategy='epoch',
#         evaluation_strategy='epoch',
#         logging_strategy='epoch',
#         load_best_model_at_end=True,
#         remove_unused_columns=False,
#         fp16=True
#     )
#     trainer = Trainer(
#     model=model,
#     tokenizer=tokenizer,
#     data_collator=collate,
#     train_dataset=dataset["train"],
#     eval_dataset=dataset["validation"],
#     test_dataset=dataset["test"],
#     args=training_args,
# )

#     progress_callback = WandbPredictionProgressCallback(
#         trainer=trainer,
#         tokenizer=tokenizer,
#         val_dataset=dataset["validation"],
#         num_samples=10,
#         freq=2,
#     )
#     trainer.add_callback(progress_callback)


#     # start training loop
#     trainer.train()


In [None]:
# wandb.agent(sweep_id, train, count=20)

[34m[1mwandb[0m: Agent Starting Run: 5h5njsmy with config:
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	lr: 0.0004154122001904217
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mcalvinchai[0m ([33mcalvin-chai[0m). Use [1m`wandb login --relogin`[0m to force relogin


Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_config.py", line 162, in __getattr__
    return self.__getitem__(key)
  File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_config.py", line 130, in __getitem__
    return self._items[key]
KeyError: 'learning_rate'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/tmp/ipykernel_17249/4270920747.py", line 13, in train
    learning_rate=config.learning_rate,
  File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_config.py", line 164, in __getattr__
    raise AttributeError(
AttributeError: <class 'wandb.sdk.wandb_config.Config'> object has no attribute 'learning_rate'
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


[34m[1mwandb[0m: [32m[41mERROR[0m Problem finishing run
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_config.py", line 162, in __getattr__
    return self.__getitem__(key)
  File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_config.py", line 130, in __getitem__
    return self._items[key]
KeyError: 'learning_rate'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/tmp/ipykernel_17249/4270920747.py", line 13, in train
    learning_rate=config.learning_rate,
  File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_config.py", line 164, in __getattr__
    raise AttributeError(
AttributeError: <class 'wandb.sdk.wandb_config.Config'> object has no attribute 'learning_rate'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/wandb/sdk/wandb_run.py", line 231

In [None]:

# bs=8      # batch size
# ga_steps=1  # gradient acc. steps
# epochs=5
# steps_per_epoch=len(dataset["train"])//(bs*ga_steps)

# args = TrainingArguments(
    
    
#     evaluation_strategy="steps",
#     logging_steps=1,
#     eval_steps=steps_per_epoch,		# eval and save once per epoch  	
#     save_steps=steps_per_epoch,
#     gradient_accumulation_steps=ga_steps,
#     num_train_epochs=epochs,
#     lr_scheduler_type="constant",
#     optim="paged_adamw_32bit",
#     learning_rate=0.0002,
#     group_by_length=True,
# )


# trainer.train()
# model = model.merge_and_unload()