In [47]:
import accelerate
import datasets
import evaluate
import numpy as np
import peft
import pickle
import transformers

from datasets import(
    load_dataset, 
    load_dataset_builder,
    get_dataset_split_names,
    get_dataset_config_names,
)


from peft import(
    LoftQConfig,
    LoraConfig,
    get_peft_model,
)

from transformers import(
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer)
# allows fast processing of datasets

In [4]:
ds_builder1=load_dataset_builder("LongSafari/open-genome", "stage1")
print(ds_builder1.info.features)
ds_builder2=load_dataset_builder("LongSafari/open-genome", "stage2")
print(ds_builder2.info.features)
ds_builder3=load_dataset_builder("LongSafari/open-genome", "sample")
print(ds_builder3.info.features)

{'record': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}
{'text': Value(dtype='string', id=None)}
{'text': Value(dtype='string', id=None)}


In [5]:
ds_gs=load_dataset("LongSafari/open-genome", "sample")
get_dataset_split_names("LongSafari/open-genome", "sample")

['validation']

In [71]:
ds_gst1=load_dataset("LongSafari/open-genome", "stage1")
#print(ds_gst1[50])
print(get_dataset_split_names("LongSafari/open-genome", "stage1"))
ds_gst2=load_dataset("LongSafari/open-genome", "stage2")
print(get_dataset_split_names("LongSafari/open-genome", "stage2"))

['train', 'validation', 'test']
['train', 'validation', 'test']


In [8]:
def model_from_pkl(model):
    with open("pkl_files/" + model + ".pkl", "rb") as f:
        pkl_model=pickle.load(f)
    model_name=pkl_model["model_name"]
    tokenizer=pkl_model["tokenizer"]
    tokenizer.pad_token=tokenizer.eos_token
    return model_name, tokenizer
    

In [10]:
lla_321, lla_321_tokenizer=model_from_pkl("Llama-3.2-1B")

In [11]:
mistral, mistral_tokenizer=model_from_pkl("Mistral-7B-Instruct-v0.1")

In [12]:
lla_323, lla_323_tokenizer=model_from_pkl("Llama-3.2-3B-Instruct")

In [15]:
lla_31, lla_31_tokenizer=model_from_pkl("Llama-3.1-8B")

In [None]:
lla_323_b, lla_323_b_tokenizer=model_from_pkl("Llama-3.2-3B")

In [37]:
def tokenize_l_function(examples):
    return lla_321_tokenizer(examples["text"], padding="max_length", truncation=True)
# map applies preprocessing across a dataset
#llama_tokenized_datasets=ds.map(tokenize_l_function, batched=True)

In [31]:
def tokenize_m_function(examples):
    return mistral_tokenizer(examples["text"], padding="max_length", truncation=True, max_length=8192)


In [42]:
# perform preprocessing on the genomic data
def map_data(data, model, tokenizer):
    def tokenize_l_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)
    def tokenize_m_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=8192)
        
    if type(model)==transformers.models.llama.modeling_llama.LlamaForCausalLM:
        tokenized_dataset=data.map(tokenize_l_function, batched=True)
    else:
        tokenized_dataset=data.map(tokenize_m_function, batched=True)
    return tokenized_dataset

In [43]:
# l_tokenized_genome_samples=ds_gs.map(tokenize_l_function, batched=True)
# m_tokenized_genome_samples=ds_gs.map(tokenize_m_function, batched=True)
l_tokenized_genome_samples=map_data(ds_gs, lla_321, lla_321_tokenizer)
m_tokenized_genome_samples=map_data(ds_gs, mistral, mistral_tokenizer)

Map: 100%|█████████████████████████████████████████████████████████████████| 50/50 [00:03<00:00, 15.75 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 164.16 examples/s]


In [45]:
l_tokenized_genome_stage1=map_data(ds_gst1, lla_321, lla_321_tokenizer)
l_tokenized_genome_stage2=map_data(ds_gst2, lla_321, lla_321_tokenizer)

Map: 100%|███████████████████████████████████████████████████████████████| 100/100 [00:09<00:00, 10.91 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 17.82 examples/s]


In [69]:
m_tokenized_stage1_train=map_data(ds_gst1_train, mistral, mistral_tokenizer)
m_tokenized_stage1_test=map_data(ds_gst1_test, mistral, mistral_tokenizer)
m_tokenized_genome_stage2=map_data(ds_gst2, mistral, mistral_tokenizer)
#print(m_tokenized_genome_stage2["text"][0])

AttributeError: 'dict' object has no attribute 'map'

In [48]:
# before loading in the base model with LoRA, might be good to define a helper function
# this looks at the total parameters a model has, and how many are trainable
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [49]:
training_args=TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")
# eval_strategy=epoch ensures that the evaluation metric will be reported at the end of each epoch
# this helps us monitor evaluation metrics during fine-tuning

In [50]:
# A trainer needs to be passed a function from the Evaluate library (specifically the accuracy function) to compute and report metrics
metric=evaluate.load("accuracy")

In [51]:
# the compute_metrics method will calculate prediction accuracy
def compute_metrics(eval_pred):
    logits, labels=eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
    # this lets us convert logits (returned by models) into predictions
    # np.argmax returns the indices of the maximum values along the axis of an array
    # axis=-1 means it looks at the last axis in the array
    # metric.compute gathers all cached predictions and references to compute the metric scor

In [52]:
# to fine-tune with LoRA, instantiate a base model (as above)
# create LoraConfig where LoRA-specific parameters are defined
config=LoraConfig(
    inference_mode=False,
    r=10, #rank of update matrices, lower value results in smaller matrices with fewer parameters
    lora_alpha=10, #LoRA scaling factor
    task_type="CAUSAL_LM",
    lora_dropout=0, # dropout probability of LoRA layers
    bias="lora_only", # specifies if bias parameters should be trained
    modules_to_save=["decode_head"] #models apart from LoRA layers that are trainable
)

In [58]:
mis_lora_model=get_peft_model(mistral, config)
print_trainable_parameters(mis_lora_model)

trainable params: 4259840 || all params: 7245991936 || trainable%: 0.06


In [56]:
lla_lora_model=get_peft_model(lla_321, config)
print_trainable_parameters(lla_lora_model)

trainable params: 1064960 || all params: 1236879360 || trainable%: 0.09




In [63]:
# the trainer object specifies the model, training arguments, training and test datasets, and evaluation function
trainer_m = Trainer(
    model=mis_lora_model,
    args=training_args,
    train_dataset=m_tokenized_genome_stage2["train"],
    compute_metrics=compute_metrics,
)
trainer_m.train()

KeyError: "Column train not in the dataset. Current columns in the dataset: ['text', 'input_ids', 'attention_mask']"

In [202]:
def make_trainer(m_model, train_data, test_data):
    trainer=Trainer(
        model=m_model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        compute_metrics=compute_metrics,
    )
    return trainer

In [None]:
# small_train_dataset_l=llama_tokenized_datasets["input_ids"].shuffle(seed=42).select(range(500))
# small_test_dataset_l=llama_tokenized_datasets["attention_mask"].shuffle(seed=42).select(range(500))
# shuffle() randomly rearranges the column values and creates an indices mapping
# select() returns rows according to indices
# shuffling can make your program run significantly slower
# maybe in a second iteration switch to an IterableDataset, as IterableDataset.shuffle() is faster