In [214]:
import pickle

import accelerate
import datasets
import evaluate
import numpy as np
import peft
import transformers
from datasets import (
    get_dataset_config_names,
    get_dataset_split_names,
    load_dataset,
    load_dataset_builder,
)
from peft import LoftQConfig, LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
# allows fast processing of datasets

In [33]:
# Build one dataset builder per open-genome configuration, then show the
# feature schema that each configuration declares.
ds_builder1, ds_builder2, ds_builder3 = [
    load_dataset_builder("LongSafari/open-genome", config_name)
    for config_name in ("stage1", "stage2", "sample")
]
print(
    ds_builder1.info.features,
    ds_builder2.info.features,
    ds_builder3.info.features,
    sep="\n",
)

{'record': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}
{'text': Value(dtype='string', id=None)}
{'text': Value(dtype='string', id=None)}


In [66]:
# Load a dataset split straight from the Hugging Face Hub.
# The Rotten Tomatoes movie-review data is only used here to test the pipeline.
ds = load_dataset(
    path="cornell-movie-review-data/rotten_tomatoes",
    split="train",
)

In [82]:
# Load the "sample" configuration of open-genome (no split selected),
# then display the split names that configuration provides.
ds_gs = load_dataset(path="LongSafari/open-genome", name="sample")
get_dataset_split_names(path="LongSafari/open-genome", config_name="sample")

['validation']

In [39]:
# Take only the first 50 training rows of each stage, and print the
# split names available for that stage.
ds_gst1 = load_dataset(
    path="LongSafari/open-genome", name="stage1", split="train[:50]"
)
print(get_dataset_split_names(path="LongSafari/open-genome", config_name="stage1"))
ds_gst2 = load_dataset(
    path="LongSafari/open-genome", name="stage2", split="train[:50]"
)
print(get_dataset_split_names(path="LongSafari/open-genome", config_name="stage2"))

['train', 'validation', 'test']
['train', 'validation', 'test']


In [63]:
# Display the split names of the Rotten Tomatoes dataset.
get_dataset_split_names(path="cornell-movie-review-data/rotten_tomatoes")

['train', 'validation', 'test']

In [29]:
# List every configuration available for the Rotten Tomatoes dataset.
configs = get_dataset_config_names(
    path="cornell-movie-review-data/rotten_tomatoes"
)
print(configs)

['default']


In [42]:
# A tokenizer converts text to token ids and can pad/truncate inputs whose
# lengths differ. This one belongs to Llama-3.3-70B-Instruct.
tokenizer_l = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="meta-llama/Llama-3.3-70B-Instruct"
)

In [45]:
# Tokenizer belonging to Mistral-7B-Instruct-v0.1.
tokenizer_m = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1"
)

In [68]:
def tokenize_l_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the Llama tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``map(..., batched=True)``).

    Returns:
        The output of ``tokenizer_l`` called with ``padding="max_length"``
        and ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer_l(texts, truncation=True, padding="max_length")
    return encoded
# Run the Llama tokenization function over the whole movie-review dataset.
llama_tokenized_datasets = ds.map(
    function=tokenize_l_function,
    batched=True,
)

In [53]:
def tokenize_m_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the Mistral tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``map(..., batched=True)``).

    Returns:
        The output of ``tokenizer_m`` called with ``padding="max_length"``
        and ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer_m(texts, truncation=True, padding="max_length")
    return encoded

# Run the Mistral tokenization function over the whole movie-review dataset.
mistral_tokenized_datasets = ds.map(
    function=tokenize_m_function,
    batched=True,
)

Map: 100%|████████████| 8530/8530 [00:00<00:00, 22049.31 examples/s]


In [75]:
# perform preprocessing on the genomic data
def map_data(data, model):
    """Tokenize ``data`` with the tokenizer that matches ``model``.

    Args:
        data: Object exposing ``map(function, batched=True)``, e.g. a
            dataset loaded with ``load_dataset``.
        model: ``"llama"`` to use ``tokenize_l_function`` or ``"mistral"``
            to use ``tokenize_m_function``.

    Returns:
        Whatever ``data.map`` returns for the selected tokenization function.

    Raises:
        ValueError: If ``model`` is neither ``"llama"`` nor ``"mistral"``.
    """
    if model == "llama":
        tokenize_fn = tokenize_l_function
    elif model == "mistral":
        tokenize_fn = tokenize_m_function
    else:
        # Previously an unknown name fell through both branches and failed
        # at ``return`` with an UnboundLocalError; fail early and clearly.
        raise ValueError(
            f"Unknown model {model!r}; expected 'llama' or 'mistral'."
        )
    return data.map(tokenize_fn, batched=True)

In [116]:
# Tokenize the open-genome "sample" configuration once per tokenizer.
l_tokenized_genome_samples = map_data(data=ds_gs, model="llama")
m_tokenized_genome_samples = map_data(data=ds_gs, model="mistral")

In [84]:
# Llama-tokenized versions of the two 50-row genome subsets.
l_tokenized_genome_stage1 = map_data(data=ds_gst1, model="llama")
l_tokenized_genome_stage2 = map_data(data=ds_gst2, model="llama")

In [85]:
# Mistral-tokenized versions of the two 50-row genome subsets.
m_tokenized_genome_stage1 = map_data(data=ds_gst1, model="mistral")
m_tokenized_genome_stage2 = map_data(data=ds_gst2, model="mistral")

Map: 100%|███████████████████| 50/50 [00:09<00:00,  5.45 examples/s]
Map: 100%|██████████████████| 50/50 [00:00<00:00, 237.50 examples/s]


In [109]:
# let's create smaller subsets of training datasets to fine-tune
# use train_test_split
def tt_split_data(dataset, test_size=0.15, seed=None):
    """Split ``dataset`` into a training part and a test part.

    Args:
        dataset: Object exposing ``train_test_split(test_size=..., seed=...)``
            that returns a mapping with ``"train"`` and ``"test"`` entries.
        test_size: Size of the test part, forwarded unchanged. Defaults to
            ``0.15`` (the value that used to be hard-coded).
        seed: Seed forwarded to ``train_test_split``. ``None`` (default)
            leaves seeding to ``train_test_split``, as before; pass an int to
            make the split reproducible across kernel restarts.

    Returns:
        A ``(train_data, test_data)`` tuple.
    """
    splits = dataset.train_test_split(test_size=test_size, seed=seed)
    return splits["train"], splits["test"]

In [112]:
# Train/test parts of the tokenized Rotten Tomatoes sample data.
(l_train_sample, l_test_sample) = tt_split_data(dataset=llama_tokenized_datasets)
(m_train_sample, m_test_sample) = tt_split_data(dataset=mistral_tokenized_datasets)

In [113]:
# Train/test parts of the tokenized stage1 genome subset.
(l_train_genome1, l_test_genome1) = tt_split_data(dataset=l_tokenized_genome_stage1)
(m_train_genome1, m_test_genome1) = tt_split_data(dataset=m_tokenized_genome_stage1)

In [114]:
# Train/test parts of the tokenized stage2 genome subset.
(l_train_genome2, l_test_genome2) = tt_split_data(dataset=l_tokenized_genome_stage2)
(m_train_genome2, m_test_genome2) = tt_split_data(dataset=m_tokenized_genome_stage2)

In [118]:
# l_train_genome_sam, l_test_genome_sam=tt_split_data(l_tokenized_genome_samples)
# m_train_genome_sam, m_test_genome_sam=tt_split_data(m_tokenized_genome_samples)

In [125]:
# before loading in the base model with LoRA, might be good to define a helper function
# this looks at the total parameters a model has, and how many are trainable
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` has ``numel()``
            and a ``requires_grad`` flag.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports ``0.00`` percent
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model has nothing to divide by.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent:.2f}"
    )

In [168]:
# Parameter counts for lla_model (meta-llama/Llama-3.3-70B-Instruct, assigned in cell In [159]).
print_trainable_parameters(lla_model)

trainable params: 70553706496 || all params: 70553706496 || trainable%: 100.00


In [169]:
# Parameter counts for lla_model321 (meta-llama/Llama-3.2-1B, assigned in cell In [160]).
print_trainable_parameters(lla_model321)

trainable params: 1235814400 || all params: 1235814400 || trainable%: 100.00


In [170]:
# Parameter counts for lla_model323 (meta-llama/Llama-3.2-3B, assigned in cell In [161]).
print_trainable_parameters(lla_model323)

trainable params: 3212749824 || all params: 3212749824 || trainable%: 100.00


In [171]:
# Parameter counts for lla_model323_I (meta-llama/Llama-3.2-3B-Instruct, assigned in cell In [162]).
print_trainable_parameters(lla_model323_I)

trainable params: 3212749824 || all params: 3212749824 || trainable%: 100.00


In [172]:
# Parameter counts for lla_model31 (meta-llama/Llama-3.1-8B, assigned in cell In [163]).
print_trainable_parameters(lla_model31)

trainable params: 8030261248 || all params: 8030261248 || trainable%: 100.00


In [159]:
# Load Llama-3.3-70B-Instruct with a causal-language-modelling head.
# The classification-only `num_labels` argument was dropped: it does not apply
# to a causal LM.
lla_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.3-70B-Instruct",
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|████| 30/30 [00:18<00:00,  1.59it/s]


In [160]:
# Llama-3.2-1B as a causal LM (classification-only `num_labels` dropped).
lla_model321 = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    torch_dtype="auto",
)

In [161]:
# Llama-3.2-3B as a causal LM (classification-only `num_labels` dropped).
lla_model323 = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|██████| 2/2 [00:01<00:00,  1.41it/s]


In [162]:
# Llama-3.2-3B-Instruct as a causal LM (classification-only `num_labels` dropped).
lla_model323_I = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|██████| 2/2 [00:01<00:00,  1.57it/s]


In [163]:
# Llama-3.1-8B as a causal LM (classification-only `num_labels` dropped).
lla_model31 = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|██████| 4/4 [00:02<00:00,  1.64it/s]


In [164]:
# Mistral-7B-Instruct-v0.1 as a causal LM (classification-only `num_labels` dropped).
mis_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|██████| 2/2 [00:01<00:00,  1.58it/s]


In [165]:
# Mistral-7B-v0.1 (base) as a causal LM (classification-only `num_labels` dropped).
mis_model_base = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|██████| 2/2 [00:01<00:00,  1.44it/s]


In [166]:
# Mistral-7B-v0.3 (base) as a causal LM (classification-only `num_labels` dropped).
mis_model3 = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.3",
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|██████| 3/3 [00:01<00:00,  1.51it/s]


In [167]:
# Mistral-7B-Instruct-v0.3 as a causal LM (classification-only `num_labels` dropped).
mis_model3_I = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|██████| 3/3 [00:01<00:00,  1.52it/s]


In [173]:
# Parameter counts for mis_model (mistralai/Mistral-7B-Instruct-v0.1, assigned in cell In [164]).
print_trainable_parameters(mis_model)

trainable params: 7241732096 || all params: 7241732096 || trainable%: 100.00


In [174]:
# Parameter counts for mis_model_base (mistralai/Mistral-7B-v0.1, assigned in cell In [165]).
print_trainable_parameters(mis_model_base)

trainable params: 7241732096 || all params: 7241732096 || trainable%: 100.00


In [175]:
# Parameter counts for mis_model3 (mistralai/Mistral-7B-v0.3, assigned in cell In [166]).
print_trainable_parameters(mis_model3)

trainable params: 7248023552 || all params: 7248023552 || trainable%: 100.00


In [176]:
# Parameter counts for mis_model3_I (mistralai/Mistral-7B-Instruct-v0.3, assigned in cell In [167]).
print_trainable_parameters(mis_model3_I)

trainable params: 7248023552 || all params: 7248023552 || trainable%: 100.00


In [121]:
# eval_strategy="epoch" reports the evaluation metric at the end of every
# epoch, which lets us monitor it while fine-tuning.
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
)

In [122]:
# The Trainer needs a metric to compute and report during evaluation;
# here that is the accuracy metric from the Evaluate library.
metric = evaluate.load(path="accuracy")

In [123]:
# the compute_metrics method will calculate prediction accuracy
def compute_metrics(eval_pred):
    """Turn model logits into predictions and score them with ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The result of ``metric.compute`` (the accuracy metric loaded with
        ``evaluate.load("accuracy")``).

    One label per example (1-D predictions after the argmax) is scored
    exactly as before. Token-level output, as produced by the causal language
    models in this notebook, is additionally shifted, masked and flattened,
    because the accuracy metric expects flat sequences of ids.
    """
    logits, labels = eval_pred
    # argmax over the last axis picks the highest-scoring class/token id.
    predictions = np.argmax(logits, axis=-1)
    if predictions.ndim > 1:
        labels = np.asarray(labels)
        # A causal LM's logits at position t predict the token at t + 1.
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]
        # Label id -100 marks positions to ignore (e.g. padding).
        keep = labels != -100
        predictions = predictions[keep]
        labels = labels[keep]
    return metric.compute(predictions=predictions, references=labels)

In [210]:
# To fine-tune with LoRA, instantiate a base model (see the loading cells)
# and create a LoraConfig holding the LoRA-specific parameters.
# `modules_to_save=["decode_head"]` was removed: "decode_head" is the head of
# an image-segmentation model and is not a module of the Llama/Mistral
# causal LMs used here.
config = LoraConfig(
    inference_mode=False,
    r=10,  # rank of the update matrices; lower rank means fewer trainable parameters
    lora_alpha=10,  # LoRA scaling factor
    task_type="CAUSAL_LM",
    lora_dropout=0,  # dropout probability of the LoRA layers
    bias="lora_only",  # which bias parameters, if any, are trained
)

In [211]:
# Wrap the Mistral instruct model with the LoRA config, then report how many
# parameters remain trainable.
mis_lora_model = get_peft_model(model=mis_model, peft_config=config)
print_trainable_parameters(mis_lora_model)

trainable params: 26214400 || all params: 7267946496 || trainable%: 0.36


In [212]:
# Wrap the Llama-3.1-8B model with the LoRA config, then report how many
# parameters remain trainable.
lla_lora_model = get_peft_model(model=lla_model31, peft_config=config)
print_trainable_parameters(lla_lora_model)

trainable params: 26214400 || all params: 8056475648 || trainable%: 0.33


In [216]:
# Restore a previously pickled model object from the working directory.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load "llama3_2.pkl" if it was produced by a trusted source.
# NOTE(review): presumably this file holds a Llama-3.2 model saved by an
# earlier session; the cell that wrote it is not in this notebook. Confirm.
with open("llama3_2.pkl", "rb") as f:
    llama321=pickle.load(f)
# Attach the LoRA config and report how many parameters remain trainable.
lla32_lora=get_peft_model(llama321, config)
print_trainable_parameters(lla32_lora)

trainable params: 1064960 || all params: 1236879360 || trainable%: 0.09


In [None]:
# The Trainer ties together the model, training arguments, train/test
# datasets and the evaluation function.
# The tokenized datasets contain no "labels" column, so a causal-LM data
# collator (mlm=False) is supplied to build the labels from the input ids;
# without labels the model cannot return a loss to train on.
l_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer_l, mlm=False)
trainer_l = Trainer(
    model=lla_lora_model,
    args=training_args,
    train_dataset=l_train_genome1,
    eval_dataset=l_test_genome1,
    data_collator=l_data_collator,
    compute_metrics=compute_metrics,
)
trainer_l.train()

In [202]:
def make_trainer(m_model, train_data, test_data, data_collator=None):
    """Build a ``Trainer`` for ``m_model`` using the notebook-wide settings.

    Args:
        m_model: Model to train.
        train_data: Dataset used for training.
        test_data: Dataset used for evaluation.
        data_collator: Optional collator forwarded to ``Trainer``. ``None``
            (default) keeps the Trainer's own default, as before. Causal-LM
            data without a ``labels`` column needs a collator that creates
            the labels.

    Returns:
        A ``Trainer`` configured with the module-level ``training_args`` and
        ``compute_metrics``.
    """
    return Trainer(
        model=m_model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

In [None]:
# small_train_dataset_l=llama_tokenized_datasets["input_ids"].shuffle(seed=42).select(range(500))
# small_test_dataset_l=llama_tokenized_datasets["attention_mask"].shuffle(seed=42).select(range(500))
# shuffle() randomly rearranges the column values and creates an indices mapping
# select() returns rows according to indices
# shuffling can make your program run significantly slower
# maybe in a second iteration switch to an IterableDataset, as IterableDataset.shuffle() is faster