In [9]:
import accelerate
import datasets
import evaluate
import numpy as np
import peft
import pickle
import pytest, ipytest
ipytest.autoconfig()
import transformers

from datasets import(
    load_dataset, 
    load_dataset_builder,
    get_dataset_split_names,
    get_dataset_config_names,
)


from peft import(
    LoftQConfig,
    LoraConfig,
    get_peft_model,
)

from transformers import(
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments,
Trainer)
# allows fast processing of datasets

In [4]:
# Print the feature schema reported by the dataset builder of each
# "LongSafari/open-genome" configuration (stage1, stage2, sample).
ds_builder1=load_dataset_builder("LongSafari/open-genome", "stage1")
print(ds_builder1.info.features)
ds_builder2=load_dataset_builder("LongSafari/open-genome", "stage2")
print(ds_builder2.info.features)
ds_builder3=load_dataset_builder("LongSafari/open-genome", "sample")
print(ds_builder3.info.features)

{'record': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}
{'text': Value(dtype='string', id=None)}
{'text': Value(dtype='string', id=None)}


In [6]:
# Load the "sample" configuration (no split is selected, so every split it
# provides is loaded) and display the names of its splits.
ds_gs=load_dataset("LongSafari/open-genome", "sample")
get_dataset_split_names("LongSafari/open-genome", "sample")

['validation']

In [18]:
# Small slices of stage1 for quick experiments: the first 500 training rows
# and the first 50 test rows.
ds_gst1_train=load_dataset("LongSafari/open-genome", "stage1", split="train[:500]")
ds_gst1_test=load_dataset("LongSafari/open-genome", "stage1", split="test[:50]")
print(get_dataset_split_names("LongSafari/open-genome", "stage1"))
# Same slicing for stage2.
ds_gst2_train=load_dataset("LongSafari/open-genome", "stage2", split="train[:500]")
ds_gst2_test=load_dataset("LongSafari/open-genome", "stage2", split="test[:50]")
print(get_dataset_split_names("LongSafari/open-genome", "stage2"))

['train', 'validation', 'test']
['train', 'validation', 'test']


In [19]:
def model_from_pkl(model, pkl_dir="pkl_files"):
    """Load a pickled model/tokenizer pair stored as ``<pkl_dir>/<model>.pkl``.

    Args:
        model: Base name of the pickle file without the ``.pkl`` suffix,
            e.g. ``"Llama-3.2-1B"``.
        pkl_dir: Directory that holds the pickle files. Defaults to
            ``"pkl_files"`` relative to the current working directory.

    Returns:
        A ``(model, tokenizer)`` tuple. The first element is the object stored
        under the ``"model_name"`` key of the pickled dict; despite the key's
        name, the rest of this notebook uses it as the loaded model object.
        The tokenizer's ``pad_token`` is set to its ``eos_token``.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
        KeyError: If the pickled dict lacks ``"model_name"`` or ``"tokenizer"``.
    """
    from pathlib import Path  # local import: keeps this cell self-contained

    pkl_path = Path(pkl_dir) / f"{model}.pkl"
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only load pickle files that were produced by a trusted source.
    with open(pkl_path, "rb") as f:
        payload = pickle.load(f)
    loaded_model = payload["model_name"]
    tokenizer = payload["tokenizer"]
    # Padding is requested later (padding="max_length"), which needs a pad
    # token; EOS is reused for that purpose.
    tokenizer.pad_token = tokenizer.eos_token
    return loaded_model, tokenizer

In [17]:
%%ipytest -k imp
# Test that the model and tokenizer are properly imported from the pkl files.
def test_model_imp():
    """model_from_pkl must return a Llama causal-LM model and a fast tokenizer."""
    lla_321, lla_321_tokenizer = model_from_pkl("Llama-3.2-1B")
    # Separate asserts so a failure reports which of the two objects is wrong;
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    assert isinstance(lla_321, transformers.LlamaForCausalLM)
    assert isinstance(lla_321_tokenizer, transformers.PreTrainedTokenizerFast)
          

[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 8.75s[0m[0m


In [10]:
# Llama 3.2 1B: model and tokenizer unpickled from pkl_files/Llama-3.2-1B.pkl.
lla_321, lla_321_tokenizer=model_from_pkl("Llama-3.2-1B")

In [11]:
# Llama 3.2 3B Instruct: model and tokenizer unpickled from pkl_files/Llama-3.2-3B-Instruct.pkl.
lla_323, lla_323_tokenizer=model_from_pkl("Llama-3.2-3B-Instruct")

In [12]:
# Llama 3.1 8B: model and tokenizer unpickled from pkl_files/Llama-3.1-8B.pkl.
lla_31, lla_31_tokenizer=model_from_pkl("Llama-3.1-8B")

In [13]:
# Llama 3.2 3B (base): model and tokenizer unpickled from pkl_files/Llama-3.2-3B.pkl.
lla_323_b, lla_323_b_tokenizer=model_from_pkl("Llama-3.2-3B")

In [14]:
# Mistral 7B Instruct v0.1: model and tokenizer unpickled from pkl_files/Mistral-7B-Instruct-v0.1.pkl.
mistral, mistral_tokenizer=model_from_pkl("Mistral-7B-Instruct-v0.1")

In [15]:
# perform preprocessing on the genomic data
def map_data(data, model, tokenizer, max_length=None):
    """Tokenize the ``"text"`` column of ``data`` for causal-LM fine-tuning.

    Every example is padded/truncated to a fixed length and receives a
    ``labels`` column so that a ``Trainer`` can compute a language-modelling
    loss from the tokenized dataset.

    Args:
        data: Object exposing ``map(fn, batched=True)`` whose batches contain a
            ``"text"`` field (a ``datasets`` Dataset or DatasetDict here).
        model: Model the data is prepared for; only its type is inspected.
        tokenizer: Callable tokenizer returning ``input_ids`` (and normally
            ``attention_mask``) for a list of strings.
        max_length: Optional explicit sequence length. When omitted, Llama
            models use the tokenizer's own default length and every other
            model uses 8192.
            NOTE(review): the tokenizer default may be very large for
            long-context Llama tokenizers; pass ``max_length`` to bound it.

    Returns:
        The result of ``data.map`` with the tokenizer outputs plus ``labels``.
    """
    default_non_llama_max_length = 8192

    # isinstance (not an exact type comparison) so subclasses are recognised too.
    is_llama = isinstance(model, transformers.LlamaForCausalLM)

    tokenizer_kwargs = {"padding": "max_length", "truncation": True}
    if max_length is not None:
        tokenizer_kwargs["max_length"] = max_length
    elif not is_llama:
        tokenizer_kwargs["max_length"] = default_non_llama_max_length

    def tokenize_batch(examples):
        encoded = tokenizer(examples["text"], **tokenizer_kwargs)
        input_ids = encoded["input_ids"]
        masks = encoded.get("attention_mask")
        if masks is None:
            labels = [list(ids) for ids in input_ids]
        else:
            # Mask padded positions with -100 so they are ignored by the loss.
            # The attention mask is used instead of the pad token id because
            # the pad token may be the same as the EOS token.
            labels = [
                [token if keep else -100 for token, keep in zip(ids, mask)]
                for ids, mask in zip(input_ids, masks)
            ]
        encoded["labels"] = labels
        return encoded

    return data.map(tokenize_batch, batched=True)

In [16]:
# Tokenize the "sample" configuration twice: once with the Llama 3.2 1B
# tokenizer and once with the Mistral tokenizer. map_data chooses the
# tokenization settings from the type of the model it is given.
l_tokenized_genome_samples=map_data(ds_gs, lla_321, lla_321_tokenizer)
m_tokenized_genome_samples=map_data(ds_gs, mistral, mistral_tokenizer)

Map: 100%|███████████████████| 50/50 [00:04<00:00, 10.82 examples/s]
Map: 100%|██████████████████| 50/50 [00:00<00:00, 129.72 examples/s]


In [17]:
# Stage1 train/test slices tokenized with the Llama 3.2 1B tokenizer.
l_tokenized_stage1_train=map_data(ds_gst1_train, lla_321, lla_321_tokenizer)
l_tokenized_stage1_test=map_data(ds_gst1_test, lla_321, lla_321_tokenizer)

Map: 100%|█████████████████| 100/100 [00:09<00:00, 10.89 examples/s]
Map: 100%|█████████████████| 100/100 [00:06<00:00, 14.68 examples/s]


In [18]:
# Stage2 train/test slices tokenized with the Llama 3.2 1B tokenizer.
l_tokenized_stage2_train=map_data(ds_gst2_train, lla_321, lla_321_tokenizer)
l_tokenized_stage2_test=map_data(ds_gst2_test, lla_321, lla_321_tokenizer)

Map: 100%|█████████████████| 100/100 [00:06<00:00, 15.35 examples/s]
Map: 100%|███████████████████| 50/50 [00:03<00:00, 16.57 examples/s]


In [19]:
# Stage1 train/test slices tokenized with the Mistral tokenizer.
m_tokenized_stage1_train=map_data(ds_gst1_train, mistral, mistral_tokenizer)
m_tokenized_stage1_test=map_data(ds_gst1_test, mistral, mistral_tokenizer)

Map: 100%|█████████████████| 100/100 [00:01<00:00, 57.07 examples/s]
Map: 100%|████████████████| 100/100 [00:00<00:00, 207.16 examples/s]


In [20]:
# Stage2 train/test slices tokenized with the Mistral tokenizer.
m_tokenized_stage2_train=map_data(ds_gst2_train, mistral, mistral_tokenizer)
m_tokenized_stage2_test=map_data(ds_gst2_test, mistral, mistral_tokenizer)

Map: 100%|████████████████| 100/100 [00:00<00:00, 230.01 examples/s]
Map: 100%|██████████████████| 50/50 [00:00<00:00, 148.92 examples/s]


In [21]:
# before loading in the base model with LoRA, might be good to define a helper function
# this looks at the total parameters a model has, and how many are trainable
def print_trainable_parameters(model):
    """Print trainable vs. total parameter counts of ``model``.

    Walks ``model.named_parameters()`` once, totals the element count of every
    parameter and, separately, of those whose ``requires_grad`` is truthy, then
    prints both totals together with the trainable percentage (two decimals).
    """
    # (element count, trainable?) for each parameter, gathered in a single pass.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [22]:
# Training configuration shared by every Trainer built in this notebook:
# outputs go to "test_trainer", and eval_strategy="epoch" reports the
# evaluation metrics at the end of each epoch so that fine-tuning can be
# monitored.
training_args=TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")

In [23]:
# The Trainer is given a metric function to compute and report evaluation
# metrics; the "accuracy" metric from the Evaluate library is used for that.
metric=evaluate.load("accuracy")

In [24]:
# the compute_metrics method will calculate prediction accuracy
def compute_metrics(eval_pred):
    """Compute prediction accuracy from a ``(logits, labels)`` evaluation pair.

    ``np.argmax`` over the last axis turns the logits returned by the model
    into predicted ids. When the labels have more than one dimension the pair
    is treated as causal-LM output (this notebook only fine-tunes causal LMs):
    the logits at position ``i`` predict the token at position ``i + 1``, so
    predictions and labels are shifted against each other before comparison.
    Positions labelled ``-100`` (ignored by the loss, e.g. padding) are
    excluded, and the remaining values are handed to the accuracy metric as
    flat 1-D arrays.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        The dict produced by ``metric.compute`` (gathers the predictions and
        references and computes the metric score).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)
    if labels.ndim > 1:
        # Align "prediction made at position i" with "token at position i + 1".
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]
    # Boolean indexing both drops ignored positions and flattens the arrays.
    keep = labels != -100
    return metric.compute(predictions=predictions[keep], references=labels[keep])

In [25]:
# to fine-tune with LoRA, instantiate a base model (as above)
# create LoraConfig where LoRA-specific parameters are defined
config=LoraConfig(
    inference_mode=False, # adapters are being trained, not only used for inference
    r=10, # rank of the update matrices; a lower value gives smaller matrices with fewer parameters
    lora_alpha=10, # LoRA scaling factor
    task_type="CAUSAL_LM",
    lora_dropout=0, # dropout probability of the LoRA layers
    bias="lora_only", # specifies which bias parameters should be trained
    # No modules_to_save: the causal LMs fine-tuned here have no extra head
    # (such as a "decode_head") that must stay trainable besides the LoRA layers.
)

In [30]:
# Wrap the Mistral model with the LoRA configuration defined above and report
# how many of its parameters are trainable.
mis_lora_model=get_peft_model(mistral, config)
print_trainable_parameters(mis_lora_model)

trainable params: 4259840 || all params: 7245991936 || trainable%: 0.06


In [27]:
# Wrap the Llama 3.2 1B model with the same LoRA configuration and report how
# many of its parameters are trainable.
lla_lora_model=get_peft_model(lla_321, config)
print_trainable_parameters(lla_lora_model)

trainable params: 1064960 || all params: 1236879360 || trainable%: 0.09




In [1]:
# the trainer object specifies the model, training arguments, training and test datasets, and evaluation function
def make_trainer(m_model, train_data, test_data):
    """Build a ``Trainer`` for ``m_model`` using the notebook-wide settings.

    Args:
        m_model: Model to train.
        train_data: Dataset passed as ``train_dataset``.
        test_data: Dataset passed as ``eval_dataset``.

    Returns:
        A ``Trainer`` configured with the shared ``training_args`` and the
        ``compute_metrics`` evaluation function.
    """
    trainer_kwargs = {
        "model": m_model,
        "args": training_args,
        "train_dataset": train_data,
        "eval_dataset": test_data,
        "compute_metrics": compute_metrics,
    }
    return Trainer(**trainer_kwargs)

In [None]:
# Fine-tune the LoRA-wrapped Mistral model (not the bare base-model handle) on
# the tokenized stage1 slices, so the Trainer works with the PEFT wrapper.
mistral_train=make_trainer(mis_lora_model, m_tokenized_stage1_train, m_tokenized_stage1_test)
mistral_train.train()

In [None]:
# the trainer object specifies the model, training arguments, training and test datasets, and evaluation function
# trainer_m = Trainer(
#     model=mis_lora_model,
#     args=training_args,
#     train_dataset=m_tokenized_stage1_train,
#     eval_dataset=m_tokenized_stage1_test,
#     compute_metrics=compute_metrics,
# )
# trainer_m.train()