## Setup Environment

In [1]:
import logging
import os
import random
import sys
import warnings
from dataclasses import dataclass, field
from typing import List, Literal, Optional, Union

import datasets
import evaluate
import numpy as np
from datasets import Value, load_dataset

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    PhiForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

from peft import (
    TaskType,
    LoraConfig,
    get_peft_model,
    PeftModel,
    PeftConfig,
)

from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

HF_TOKEN = os.getenv("HF_TOKEN")
WANDB_API_KEY = os.getenv("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "TEST_SEQ_CLASSIFICATION_RUNS"

## Load Dataset

In [2]:
raw_datasets = load_dataset("MAdAiLab/twitter_disaster")
# raw_datasets = df.rename_column("label", "labels")

In [3]:
logger = logging.getLogger(__name__)

In [4]:
def get_label_list(raw_dataset, split="train") -> List[str]:
    """Get the list of labels from a multi-label dataset"""

    if isinstance(raw_dataset[split]["label"][0], list):
        label_list = [label for sample in raw_dataset[split]["label"] for label in sample]
        label_list = list(set(label_list))
    else:
        label_list = raw_dataset[split].unique("label")
    # we will treat the label list as a list of string instead of int, consistent with model.config.label2id
    label_list = [str(label) for label in label_list]
    return label_list

In [5]:
label_list = get_label_list(raw_datasets, split="train")
for split in ["validation", "test"]:
    if split in raw_datasets:
        val_or_test_labels = get_label_list(raw_datasets, split=split)
        diff = set(val_or_test_labels).difference(set(label_list))
        if len(diff) > 0:
            # add the labels that appear in val/test but not in train, throw a warning
            logger.warning(
                f"Labels {diff} in {split} set but not in training set, adding them to the label list"
            )
            label_list += list(diff)
# if label is -1, we throw a warning and remove it from the label list
for label in label_list:
    if label == -1:
        logger.warning("Label -1 found in label list, removing it.")
        label_list.remove(label)

label_list.sort()
num_labels = len(label_list)
if num_labels <= 1:
    raise ValueError("You need more than one label to do classification.")

## Load pretrained model and tokenizer

In [6]:
checkpoint = "mistralai/Mistral-7B-v0.1"

config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=checkpoint,
        num_labels=num_labels,
        finetuning_task="text-classification",
        trust_remote_code=True,
)
config.problem_type = "single_label_classification"

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
    trust_remote_code=True,
)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=checkpoint,
        config=config,
        trust_remote_code=True,
)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

peft_config = LoraConfig(
            r=2,
            target_modules=["q_proj", "v_proj"],
            lora_alpha=4,
            task_type=TaskType.SEQ_CLS,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

label_to_id = {v: i for i, v in enumerate(label_list)}

model.config.label2id = label_to_id
model.config.id2label = {id: label for label, id in label_to_id.items()}

max_seq_length =  4096 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 860,160 || all params: 7,111,528,448 || trainable%: 0.012095290151611583


## Preprocess Dataset

In [7]:
def preprocess_function(examples):
    # Tokenize the texts
    result = tokenizer(examples["text"], padding="max_length", max_length=max_seq_length, truncation=True)

    return result

raw_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            desc="Running tokenizer on dataset",
)

train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["validation"]
predict_dataset = raw_datasets["test"]

for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.")

data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

Sample 1583 of the training set: {'text': 'Remember #Hiroshima destroyed by #Nuclear bomb..an occurrence should never ever happen again yet highly likely recur http://t.co/mB3MJevBb0', 'label': 0, 'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

## Compute metric

In [8]:
def compute_metrics(p: EvalPrediction):
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    result = {
        "accuracy": accuracy.compute(predictions=preds, references=p.label_ids)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=p.label_ids, average="macro")["f1"],
        "f1_micro": f1.compute(predictions=preds, references=p.label_ids, average="micro")["f1"],
    }
    return result

## Training Args

In [9]:
training_args = TrainingArguments(
    do_train=True,
    do_eval=True,
    do_predict=True,
    # bf16=True,
    fp16=True,
    gradient_checkpointing=True,
    max_steps=200,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=50,
    max_grad_norm=1,
    weight_decay=0.1,
    optim="adafactor",
    learning_rate=5e-6,
    lr_scheduler_type="linear",
    num_train_epochs=3,
    report_to="wandb",
    logging_strategy="steps",
    logging_steps=10,
    save_total_limit=2,
    save_safetensors=False,
    overwrite_output_dir=True,
    log_level="warning",
    output_dir="./test_runs/mistralai/Mistral-7B-v0.1",
)


## Initialize Trainer

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Training, Evaluation and Prediction

In [11]:
def main(training_args):
    # Training
    if training_args.do_train:
        logger.info("*** Train ***")
        train_result = trainer.train()
        metrics = train_result.metrics
        max_train_samples = len(train_dataset)
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
        trainer.save_model()  # Saves the tokenizer too for easy upload
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)
        max_eval_samples = len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions = trainer.predict(predict_dataset)
        metrics["test_samples"] = len(predict_dataset)
        trainer.log_metrics("test", predictions.metrics)
        trainer.save_metrics("test", predictions.metrics)

In [12]:
main(training_args)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33makshat_patil[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 47.53 GiB of which 28.81 MiB is free. Including non-PyTorch memory, this process has 47.34 GiB memory in use. Of the allocated memory 46.84 GiB is allocated by PyTorch, and 13.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [20]:
del model
del trainer

NameError: name 'model' is not defined

In [22]:
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

31