In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install bitsandbytes

In [4]:
from datasets import load_dataset

In [5]:
ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")

In [6]:
ds["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

In [7]:
# Tokenization
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "ibm-granite/granite-7b-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = "PAD"




In [8]:
tokenizer

LlamaTokenizerFast(name_or_path='ibm-granite/granite-7b-base', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'pad_token': 'PAD'}, clean_up_tokenization_spaces=False)

In [9]:
def tokenize_function(example):
  return tokenizer(example["text"],truncation=True, max_length=512)

tokenized_dataset = ds.map(tokenize_function, batched=True)

In [10]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [11]:
# Data Collator

data_collator = DataCollatorWithPadding(tokenizer)

In [12]:
type(tokenized_dataset["train"])

datasets.arrow_dataset.Dataset

In [13]:
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")
tokenized_dataset["train"].column_names

['labels', 'input_ids', 'attention_mask']

In [14]:
tokenized_dataset["train"]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 8530
})

In [15]:
from torch.utils.data import DataLoader

# Very high level, DataLoader is an iterable over our Dataset
# We use DataLoader here because we need to create batches and we need something to iterate over these batches and shuffle the dataset.
train_dataloader = DataLoader(tokenized_dataset["train"], batch_size = 1, collate_fn = data_collator)
eval_dataloader = DataLoader(tokenized_dataset["validation"], batch_size = 1, collate_fn = data_collator)

In [16]:
# Import configuration
from transformers import LlamaConfig
config = LlamaConfig()

In [17]:

# Import model for sequence classification based on checkpoint in this case model is IBM Granite based on Llama architecture
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForSequenceClassification.from_config(config).from_pretrained(checkpoint, num_labels = 2, quantization_config=quantization_config)
# https://stackoverflow.com/questions/68084302/assertionerror-cannot-handle-batch-sizes-1-if-no-padding-token-is-defined
model.config.pad_token_id = model.config.eos_token_id



Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ibm-granite/granite-7b-base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
for batch in train_dataloader:
  break

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [19]:
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([1]),
 'input_ids': torch.Size([1, 58]),
 'attention_mask': torch.Size([1, 58])}

In [None]:
# Try running a single batch through forward pass of model and see results, loss, logists etc.
outputs = model(**batch)
print(outputs)

In [21]:
# Set optimizer
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



In [22]:
# Set learning rate scheduler

from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

25590


In [24]:
import torch

#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#model.to(device)
#device

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/25590 [00:00<?, ?it/s]