In [88]:
import accelerate
import datasets
from datasets import(
load_dataset, 
load_dataset_builder,
get_dataset_split_names,
get_dataset_config_names)
import evaluate
import peft
import transformers
from transformers import(
AutoModelForSequenceClassification,
AutoTokenizer)
# allows fast processing of datasets

In [33]:
# Create one dataset builder per open-genome configuration, then print the
# feature schema each builder reports.
ds_builder1, ds_builder2, ds_builder3 = (
    load_dataset_builder("LongSafari/open-genome", config_name)
    for config_name in ("stage1", "stage2", "sample")
)
print(
    *(builder.info.features for builder in (ds_builder1, ds_builder2, ds_builder3)),
    sep="\n",
)

{'record': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}
{'text': Value(dtype='string', id=None)}
{'text': Value(dtype='string', id=None)}


In [66]:
# Load a single split of a dataset from the Hugging Face Hub.
# The movie-review data is only used here for testing purposes.
ds = load_dataset(
    path="cornell-movie-review-data/rotten_tomatoes",
    split="train",
)

In [82]:
# Load the "sample" configuration (no specific split requested) and show
# which splits that configuration provides.
ds_gs = load_dataset(path="LongSafari/open-genome", name="sample")
get_dataset_split_names(path="LongSafari/open-genome", config_name="sample")

['validation']

In [39]:
# For each training stage: load only the first 50 rows of the train split,
# then print the split names available for that configuration.
ds_gst1 = load_dataset(
    path="LongSafari/open-genome", name="stage1", split="train[:50]"
)
print(get_dataset_split_names(path="LongSafari/open-genome", config_name="stage1"))
ds_gst2 = load_dataset(
    path="LongSafari/open-genome", name="stage2", split="train[:50]"
)
print(get_dataset_split_names(path="LongSafari/open-genome", config_name="stage2"))

['train', 'validation', 'test']
['train', 'validation', 'test']


In [63]:
# Query the Hub for the split names of the movie-review dataset.
get_dataset_split_names(path="cornell-movie-review-data/rotten_tomatoes")

['train', 'validation', 'test']

In [29]:
# Collect and print all configurations available for this dataset.
configs = get_dataset_config_names(
    path="cornell-movie-review-data/rotten_tomatoes"
)
print(configs)

['default']


In [42]:
# Tokenizers convert raw text into model inputs and can pad or truncate
# sequences so that inputs of inconsistent length line up.
tokenizer_l = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.3-70B-Instruct",
)

In [45]:
# Tokenizer matching the Mistral checkpoint used later in the notebook.
tokenizer_m = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
)

In [68]:
def tokenize_l_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with ``tokenizer_l``.

    Args:
        examples: Mapping with a ``"text"`` key holding the text to tokenize
            (a batch of strings when used with ``map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer_l`` returns when called with max-length padding
        and truncation enabled.
    """
    texts = examples["text"]
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return tokenizer_l(texts, **tokenizer_options)
# Dataset.map runs the preprocessing function over the whole dataset;
# batched=True hands it batches of examples instead of single rows.
llama_tokenized_datasets = ds.map(
    function=tokenize_l_function,
    batched=True,
)

In [53]:
def tokenize_m_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with ``tokenizer_m``.

    Args:
        examples: Mapping with a ``"text"`` key holding the text to tokenize
            (a batch of strings when used with ``map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer_m`` returns when called with max-length padding
        and truncation enabled.
    """
    texts = examples["text"]
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return tokenizer_m(texts, **tokenizer_options)

# Apply the Mistral tokenization to the movie-review data in batches.
mistral_tokenized_datasets = ds.map(
    function=tokenize_m_function,
    batched=True,
)

Map: 100%|████████████| 8530/8530 [00:00<00:00, 22049.31 examples/s]


In [75]:
# perform preprocessing on the genomic data
def map_data(data, model):
    """Tokenize ``data`` with the tokenize function that matches ``model``.

    Args:
        data: Object exposing ``map(function, batched=True)``, e.g. a dataset
            loaded with ``load_dataset``.
        model: ``"llama"`` to use ``tokenize_l_function`` or ``"mistral"`` to
            use ``tokenize_m_function``.

    Returns:
        The result of ``data.map`` called with the selected tokenize function
        and ``batched=True``.

    Raises:
        ValueError: If ``model`` is neither ``"llama"`` nor ``"mistral"``.
    """
    if model == "llama":
        tokenize_function = tokenize_l_function
    elif model == "mistral":
        tokenize_function = tokenize_m_function
    else:
        # Previously this fell through and failed with an UnboundLocalError on
        # the return statement; fail early with an actionable message instead.
        raise ValueError(
            f"Unsupported model {model!r}; expected 'llama' or 'mistral'"
        )
    return data.map(tokenize_function, batched=True)

In [79]:
# Tokenize the genome "sample" data once per tokenizer via the shared helper.
l_tokenized_genome_samples = map_data(ds_gs, "llama")
m_tokenized_genome_samples = map_data(ds_gs, "mistral")

In [84]:
# Llama tokenization of the stage1 and stage2 genome subsets, in that order.
l_tokenized_genome_stage1, l_tokenized_genome_stage2 = (
    map_data(stage_subset, "llama") for stage_subset in (ds_gst1, ds_gst2)
)

In [85]:
# Mistral tokenization of the stage1 and stage2 genome subsets, in that order.
m_tokenized_genome_stage1, m_tokenized_genome_stage2 = (
    map_data(stage_subset, "mistral") for stage_subset in (ds_gst1, ds_gst2)
)

Map: 100%|███████████████████| 50/50 [00:09<00:00,  5.45 examples/s]
Map: 100%|██████████████████| 50/50 [00:00<00:00, 237.50 examples/s]


In [71]:
# let's create smaller subsets of training datasets to fine-tune
# train_test_split returns a new DatasetDict rather than modifying the dataset,
# so keep the result; the fixed seed makes the 90/10 split reproducible.
llama_split_datasets = llama_tokenized_datasets.train_test_split(
    test_size=0.1,
    seed=42,
)
llama_split_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 7677
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 853
    })
})

In [73]:
# Keep the returned train/test DatasetDict (it was previously discarded) and
# seed the shuffle so the 90/10 split is reproducible.
mistral_split_datasets = mistral_tokenized_datasets.train_test_split(
    test_size=0.1,
    seed=42,
)
mistral_split_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 7677
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 853
    })
})

In [86]:
# Keep both stage1 train/test splits (the results were previously discarded,
# and only the last expression of a cell is displayed). The fixed seed makes
# the 90/10 splits reproducible.
l_genome_stage1_splits = l_tokenized_genome_stage1.train_test_split(
    test_size=0.1,
    seed=42,
)
m_genome_stage1_splits = m_tokenized_genome_stage1.train_test_split(
    test_size=0.1,
    seed=42,
)
m_genome_stage1_splits

DatasetDict({
    train: Dataset({
        features: ['record', 'text', 'input_ids', 'attention_mask'],
        num_rows: 45
    })
    test: Dataset({
        features: ['record', 'text', 'input_ids', 'attention_mask'],
        num_rows: 5
    })
})

In [87]:
# Keep both stage2 train/test splits (the results were previously discarded,
# and only the last expression of a cell is displayed). The fixed seed makes
# the 90/10 splits reproducible.
l_genome_stage2_splits = l_tokenized_genome_stage2.train_test_split(
    test_size=0.1,
    seed=42,
)
m_genome_stage2_splits = m_tokenized_genome_stage2.train_test_split(
    test_size=0.1,
    seed=42,
)
m_genome_stage2_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 45
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 5
    })
})

In [89]:
# Load the Llama checkpoint with a sequence-classification head from transformers.
# NOTE(review): num_labels=3 -- confirm this matches the label set of the
# data the model will be fine-tuned on.
lla_model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-3.3-70B-Instruct",
    num_labels=3,
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|████| 30/30 [00:22<00:00,  1.33it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.3-70B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [90]:
# Load the Mistral checkpoint with a sequence-classification head.
# NOTE(review): num_labels=3 -- confirm this matches the label set of the
# data the model will be fine-tuned on.
mis_model = AutoModelForSequenceClassification.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    num_labels=3,
    torch_dtype="auto",
)

Loading checkpoint shards: 100%|██████| 2/2 [00:03<00:00,  1.79s/it]
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# small_train_dataset_l=llama_tokenized_datasets["input_ids"].shuffle(seed=42).select(range(500))
# small_test_dataset_l=llama_tokenized_datasets["attention_mask"].shuffle(seed=42).select(range(500))
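# NOTE(review): the two commented-out drafts above index a single column
# ("input_ids" / "attention_mask") before calling shuffle()/select(); those look
# like dataset-level methods, so the drafts probably need to call them on the
# dataset itself -- confirm before enabling.
# shuffle() randomly rearranges the rows and creates an indices mapping
# select() returns the rows at the given indices
# shuffling can make the program run significantly slower
# possible second iteration: switch to an IterableDataset, as IterableDataset.shuffle() is faster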