In [1]:
import accelerate
import datasets
import evaluate
import math
import numpy as np
import peft
import pickle
import pytest, ipytest
ipytest.autoconfig()
import pandas as pd
import transformers

from datasets import(
    load_dataset, 
    load_dataset_builder,
    get_dataset_split_names,
    get_dataset_config_names,
)


from peft import(
    LoftQConfig,
    LoraConfig,
    get_peft_model,
)

from transformers import(
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from trl import SFTTrainer, SFTConfig
# allows fast processing of datasets

  from .autonotebook import tqdm as notebook_tqdm
2025-04-04 16:58:09.896000: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743785889.910222  694204 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743785889.914445  694204 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743785889.926804  694204 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743785889.926819  694204 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743785889.926822  694204

In [2]:
def model_from_pkl(model):
    """Load a pickled model record and return its name and tokenizer.

    Reads ``pkl_files/<model>.pkl`` (relative to the working directory). The
    unpickled object is indexed with the keys ``"model_name"`` and
    ``"tokenizer"``; the tokenizer's ``pad_token`` is overwritten with its
    ``eos_token`` before it is handed back.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so only
    point this at files you created yourself.

    Args:
        model: base name (without directory or ``.pkl`` suffix) of the file.

    Returns:
        A ``(model_name, tokenizer)`` tuple.
    """
    pkl_path = "".join(("pkl_files/", model, ".pkl"))
    with open(pkl_path, "rb") as handle:
        record = pickle.load(handle)
    name = record["model_name"]
    tok = record["tokenizer"]
    # reuse the end-of-sequence token for padding
    tok.pad_token = tok.eos_token
    return name, tok

In [11]:
# Load small slices of the "LongSafari/open-genome" dataset for both configurations
# ("stage1" and "stage2"): the split expressions "train[:500]" and "test[:50]" request
# only the first 500 training rows and the first 50 test rows of each configuration.
ds_gst1_train=load_dataset("LongSafari/open-genome", "stage1", split="train[:500]")
# leftover debugging line; NOTE(review): `ds_gst1` is not defined in the visible cells
#print(ds_gst1[50])
ds_gst1_test=load_dataset("LongSafari/open-genome", "stage1", split="test[:50]")
# show which splits the "stage1" configuration provides
print(get_dataset_split_names("LongSafari/open-genome", "stage1"))
ds_gst2_train=load_dataset("LongSafari/open-genome", "stage2", split="train[:500]")
ds_gst2_test=load_dataset("LongSafari/open-genome", "stage2", split="test[:50]")
# show which splits the "stage2" configuration provides
print(get_dataset_split_names("LongSafari/open-genome", "stage2"))

Using the latest cached version of the dataset since LongSafari/open-genome couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'stage1' at /home/ac.cdavies/.cache/huggingface/datasets/LongSafari___open-genome/stage1/0.0.0/84369c058d192dcb607086d71679b877421e3250 (last modified on Fri Apr  4 15:23:06 2025).


['train', 'validation', 'test']
['train', 'validation', 'test']


In [3]:
# perform preprocessing on the genomic data
def map_data(data, model, tokenizer):
    """Tokenize the ``"text"`` field of ``data`` with padding and truncation.

    Every batch is tokenized with ``padding="max_length"`` and
    ``truncation=True``. For Llama causal-LM models no explicit ``max_length``
    is passed (the tokenizer's own limit applies); for every other model the
    sequences are capped at 1024 tokens.

    The Llama check uses ``isinstance`` so that subclasses of
    ``LlamaForCausalLM`` are treated like the base class (an exact ``type(...)``
    comparison would silently send them down the non-Llama path).
    NOTE(review): a wrapped model (for example a PEFT wrapper object) is not an
    instance of ``LlamaForCausalLM`` and therefore still gets the 1024 cap;
    confirm this is intended.

    Args:
        data: object exposing ``map(function, batched=True)``; presumably a
            ``datasets.Dataset`` with a ``"text"`` column.
        model: the model the data is being prepared for; only its type is used.
        tokenizer: callable applied to each batch of texts.

    Returns:
        Whatever ``data.map`` returns (the tokenized dataset).
    """
    llama_cls = transformers.models.llama.modeling_llama.LlamaForCausalLM

    tokenizer_kwargs = {"padding": "max_length", "truncation": True}
    if not isinstance(model, llama_cls):
        # non-Llama models get an explicit sequence-length cap
        tokenizer_kwargs["max_length"] = 1024

    def tokenize_batch(examples):
        return tokenizer(examples["text"], **tokenizer_kwargs)

    return data.map(tokenize_batch, batched=True)

In [4]:
# Helper to run before/after wrapping the base model with LoRA:
# it reports how many parameters the model has in total and how many of them are trainable.
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()``, summing ``numel()`` for all
    parameters and, separately, for those with ``requires_grad`` set. Prints
    both counts and the trainable share as a percentage with two decimals.

    A model without any parameters is reported as ``0.00`` percent instead of
    raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # guard the percentage against an empty model (all_param == 0)
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [5]:
# Load the "accuracy" metric from the Evaluate library. The trainer is given a
# compute_metrics function, and that function calls metric.compute(...) to score predictions.
metric=evaluate.load("accuracy")

In [6]:
# compute_metrics converts the trainer's raw evaluation output into token-level prediction accuracy
comp_metrics_output=[]  # every metric dict returned by compute_metrics, in call order
def compute_metrics(eval_pred):
    """Compute next-token prediction accuracy for one evaluation pass.

    Assumes ``eval_pred.predictions`` holds logits shaped
    ``(batch, sequence, vocab)`` and ``eval_pred.label_ids`` holds token ids
    shaped ``(batch, sequence)``, as produced when evaluating a causal
    language model -- TODO confirm for other model types.

    Steps:
      * ``np.argmax`` over the last axis turns the logits into predicted token
        ids. The logits are used as-is: casting them to integers first would
        truncate the scores and change which index is the maximum.
      * The logit at position ``t`` scores the token at position ``t + 1``, so
        the last prediction and the first label are dropped to align them.
      * Positions labelled ``-100`` (the ignore index used for padding/masked
        tokens) are excluded; they can never equal a predicted token id.

    The resulting dict from ``metric.compute`` is appended to the module-level
    ``comp_metrics_output`` list and returned.
    """
    ignore_index = -100

    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # some models return extra outputs; the logits come first
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(eval_pred.label_ids)

    # np.argmax returns the index of the largest value along the last (vocab) axis
    token_preds = np.argmax(logits, axis=-1)

    # align "prediction made at t" with "token observed at t + 1", then flatten
    shifted_preds = token_preds[..., :-1].reshape(-1)
    shifted_labels = labels[..., 1:].reshape(-1)

    scored = shifted_labels != ignore_index
    met = metric.compute(
        predictions=shifted_preds[scored],
        references=shifted_labels[scored].tolist(),
    )
    comp_metrics_output.append(met)
    return met

In [7]:
# The trainer bundles the model, the training arguments, the train/test datasets and the evaluation function
def make_trainer(m_model, train_data, test_data, config, args):
    """Build an ``SFTTrainer`` from the given pieces.

    Args:
        m_model: forwarded as ``model``.
        train_data: forwarded as ``train_dataset``.
        test_data: forwarded as ``eval_dataset``.
        config: forwarded as ``peft_config``.
        args: forwarded as ``args``.

    The notebook's ``compute_metrics`` function is always passed as the
    ``compute_metrics`` callback.

    Returns:
        The constructed ``SFTTrainer``.
    """
    trainer_kwargs = {
        "model": m_model,
        "train_dataset": train_data,
        "eval_dataset": test_data,
        "peft_config": config,
        "args": args,
        "compute_metrics": compute_metrics,
    }
    return SFTTrainer(**trainer_kwargs)
    
# SFTTrainer is best used for training with a pre-trained model and a smaller dataset
# It can be better suited for fine-tuning than regular Trainer

## Training Loss
* Measures how well the models are performing on the training data, used to update parameters, decreases as model learns
* A training loss that is much lower than the validation loss suggests the model may have overfit the training data
## Validation Loss
* Measures how well the model is performing on the separate validation set (the test set in this case)
* Shows how well model generalizes to data it hasn't seen, if this increases while training loss decreases, there is overfitting
## Accuracy
* The fraction of predictions that match the reference labels

In [8]:
def get_dataframe(training_output: list, strategy):
    """Convert per-evaluation records into a DataFrame and plot its metric columns.

    Args:
        training_output: list of dictionaries, one per evaluation point. Each
            is expected to contain a ``strategy`` entry plus the metric values.
        strategy: name of the evaluation strategy; the column with this name
            becomes the index and the x-axis of every plot.

    Returns:
        The DataFrame indexed by ``strategy`` (without a separate ``strategy``
        column). An empty ``training_output`` yields an empty DataFrame and no
        plots instead of a ``KeyError``.

    Plots are assigned by column position: column 0 is labelled as training
    loss, column 1 as validation loss and column 2 as accuracy. Only the
    columns that actually exist are plotted, so records with fewer than three
    metrics no longer raise ``IndexError``.
    NOTE(review): the labels depend on the order of the dictionary keys in the
    records -- confirm callers supply them as loss / validation loss / accuracy.
    """
    df = pd.DataFrame(training_output)  # convert the list of dictionaries to a DataFrame
    if df.empty:
        # nothing was collected, so there is nothing to index or plot
        return df

    # use the step/epoch column as the index and drop it from the columns
    df = df.set_index(strategy)

    plot_specs = (
        ("Training Loss", "Fine-Tuning Training Evaluation"),
        ("Validation Loss", "Fine-Tuning Validation Evaluation"),
        ("Accuracy", "Fine-Tuning Accuracy Evaluation"),
    )
    available = len(df.columns)
    for position, (ylabel, title) in enumerate(plot_specs):
        if position >= available:
            break
        # the evaluation strategy (the index) is the x-axis of every plot
        df.plot(y=position, xlabel=strategy, ylabel=ylabel, title=title)
    return df

In [9]:
def get_training_output(trainer, keys: list):
    """Collect selected metrics from a trainer's log history into a DataFrame.

    Walks ``trainer.state.log_history`` and keeps only entries that fall on an
    evaluation point for ``trainer.args.eval_strategy``:

      * ``'epoch'``: entries whose ``'epoch'`` value is a whole number.
      * ``'steps'``: entries whose floored ``'step'`` is a multiple of
        ``trainer.args.logging_steps``.
      * any other strategy: no entry qualifies.

    Values for the requested ``keys`` are accumulated across consecutive
    qualifying entries (training loss and evaluation metrics are typically
    logged as separate entries). A record is finished, tagged with the
    step/epoch number under the strategy name, and started afresh as soon as
    the LAST name in ``keys`` has been seen, so put the metric that is logged
    last at the end of ``keys``.

    Args:
        trainer: object exposing ``state.log_history``,
            ``args.eval_strategy.value`` and ``args.logging_steps``.
        keys: non-empty list of metric names to extract.

    Returns:
        Whatever ``get_dataframe`` returns for the collected records.

    Raises:
        ValueError: if ``keys`` is empty.
    """
    if not keys:
        raise ValueError("keys must contain at least one metric name")

    logs = trainer.state.log_history  # training logs: training loss, accuracy, etc.
    strat = trainer.args.eval_strategy.value  # evaluated at steps or at epochs
    # the name whose arrival completes a record (previously an implicit leaked loop variable)
    closing_key = keys[-1]

    def check_eval(strat, log):
        """Return (is_evaluation_point, step_or_epoch_number) for one log entry."""
        if strat == 'epoch':
            epoch = log['epoch']
            # float() keeps this working when the epoch is stored as an int
            return float(epoch).is_integer(), epoch
        if strat == 'steps':
            step = math.floor(log['step'])  # floor converts a floating point step to an integer
            return step % trainer.args.logging_steps == 0, step
        return False, 0

    trainer_info = []
    temp_dict = {}
    for log in logs:
        condition, state = check_eval(strat, log)
        if not condition:
            continue  # not an evaluation point, move on to the next entry
        for key in keys:
            if key in log:
                temp_dict[key] = log[key]  # add the value for this metric to the pending record
        if closing_key in temp_dict:
            temp_dict[strat] = state  # tag the record with its step/epoch number
            trainer_info.append(temp_dict)
            temp_dict = {}  # start a fresh record for the next evaluation point
    # convert the list of records to a DataFrame (and plot it)
    return get_dataframe(trainer_info, strat)

In [10]:
# Debug aid: print every name in the notebook namespace that does not start with "_",
# each followed by ", ".
# The listing is built in a single expression so this cell neither creates nor deletes
# module-level variables; loop temporaries named `keys`/`key` would overwrite and then
# delete user variables with those names (e.g. a `keys` list prepared for get_training_output).
print("".join(f"{name}, " for name in list(globals()) if name[0] != "_"), end="")

In, Out, get_ipython, exit, quit, open, accelerate, datasets, evaluate, math, np, peft, pickle, pytest, ipytest, pd, transformers, load_dataset, load_dataset_builder, get_dataset_split_names, get_dataset_config_names, LoftQConfig, LoraConfig, get_peft_model, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, SFTTrainer, SFTConfig, @py_builtins, @pytest_ar, model_from_pkl, map_data, print_trainable_parameters, metric, comp_metrics_output, compute_metrics, make_trainer, get_dataframe, get_training_output, 