## Setup Environment

In [1]:
import logging
import os
import random
import sys
import warnings
from dataclasses import dataclass, field
from typing import List, Literal, Optional, Union

import datasets
import evaluate
import numpy as np
from datasets import Value, load_dataset

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    PhiForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

from peft import (
    TaskType,
    LoraConfig,
    get_peft_model,
    PeftModel,
    PeftConfig,
)

from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

HF_TOKEN = os.getenv("HF_TOKEN")
WANDB_API_KEY = os.getenv("WANDB_API_KEY")
os.environ["WANDB_PROJECT"] = "TEST_SEQ_CLASSIFICATION_RUNS"

## Load Dataset

In [9]:
raw_datasets = load_dataset("MAdAiLab/twitter_disaster","default")
# raw_datasets = load_dataset("MAdAiLab/amazon-attrprompt","default")
# raw_datasets = load_dataset("coastalcph/lex_glue", "scotus")
# raw_datasets = load_dataset("ccdv/patent-classification","abstract")

# raw_datasets = df.rename_column("label", "labels")

In [None]:
logger = logging.getLogger(__name__)

In [10]:
def get_label_list(raw_dataset, split="train") -> List[str]:
    """Get the list of labels from a multi-label dataset"""

    if isinstance(raw_dataset[split]["label"][0], list):
        label_list = [label for sample in raw_dataset[split]["label"] for label in sample]
        label_list = list(set(label_list))
    else:
        label_list = raw_dataset[split].unique("label")
    # we will treat the label list as a list of string instead of int, consistent with model.config.label2id
    label_list = [str(label) for label in label_list]
    return label_list

In [11]:
label_list = get_label_list(raw_datasets, split="train")
for split in ["validation", "test"]:
    if split in raw_datasets:
        val_or_test_labels = get_label_list(raw_datasets, split=split)
        diff = set(val_or_test_labels).difference(set(label_list))
        if len(diff) > 0:
            # add the labels that appear in val/test but not in train, throw a warning
            logger.warning(
                f"Labels {diff} in {split} set but not in training set, adding them to the label list"
            )
            label_list += list(diff)
# if label is -1, we throw a warning and remove it from the label list
for label in label_list:
    if label == -1:
        logger.warning("Label -1 found in label list, removing it.")
        label_list.remove(label)

label_list.sort()
num_labels = len(label_list)
if num_labels <= 1:
    raise ValueError("You need more than one label to do classification.")

## Load pretrained model and tokenizer

In [5]:
checkpoint = "mistralai/Mistral-7B-v0.1"

config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=checkpoint,
        num_labels=num_labels,
        finetuning_task="text-classification",
        trust_remote_code=True,
)
config.problem_type = "single_label_classification"

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
    trust_remote_code=True,
)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=checkpoint,
        config=config,
        trust_remote_code=True,
)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

peft_config = LoraConfig(
            r=2,
            target_modules=["q_proj", "v_proj"],
            lora_alpha=4,
            task_type=TaskType.SEQ_CLS,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

label_to_id = {v: i for i, v in enumerate(label_list)}

model.config.label2id = label_to_id
model.config.id2label = {id: label for label, id in label_to_id.items()}

max_seq_length =  512 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 860,160 || all params: 7,111,528,448 || trainable%: 0.012095290151611583


## Preprocess Dataset

In [6]:
def preprocess_function(examples):
    # Tokenize the texts
    result = tokenizer(examples["text"], padding="max_length", max_length=max_seq_length, truncation=True)

    return result

raw_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            desc="Running tokenizer on dataset",
)

train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["validation"]
predict_dataset = raw_datasets["test"]

for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.")

data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

Running tokenizer on dataset:   0%|          | 0/1088 [00:00<?, ? examples/s]

Sample 6813 of the training set: {'text': 'Versions of KS where if a character was /every/ character world would explode.\n\nRin\nShizune\nMisha\nEmi\nKenji\nYuuko\nNomiya\nHisao', 'label': 0, 'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

## Compute metric

In [7]:
def compute_metrics(p: EvalPrediction):
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    result = {
        "accuracy": accuracy.compute(predictions=preds, references=p.label_ids)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=p.label_ids, average="macro")["f1"],
        "f1_micro": f1.compute(predictions=preds, references=p.label_ids, average="micro")["f1"],
    }
    return result

## Training Args

In [9]:
training_args = TrainingArguments(
    do_train=False,
    do_eval=False,
    do_predict=True,
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_accumulation_steps=100,
    max_grad_norm=1,
    weight_decay=0.1,
    optim="adamw_torch",
    learning_rate=5e-6,
    lr_scheduler_type="linear",
    num_train_epochs=3,
    report_to="wandb",
    logging_strategy="steps",
    logging_steps=10,
    save_total_limit=2,
    save_safetensors=False,
    overwrite_output_dir=True,
    log_level="warning",
    output_dir="./test_runs/mistralai/Mistral-7B-v0.1/twitter_disaster",
)


## Initialize Trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Training, Evaluation and Prediction

In [None]:
def main(training_args):
    # Training
    if training_args.do_train:
        logger.info("*** Train ***")
        train_result = trainer.train()
        metrics = train_result.metrics
        max_train_samples = len(train_dataset)
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
        trainer.save_model()  # Saves the tokenizer too for easy upload
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)
        max_eval_samples = len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions = trainer.predict(predict_dataset)
        metrics["test_samples"] = len(predict_dataset)
        trainer.log_metrics("test", predictions.metrics)
        trainer.save_metrics("test", predictions.metrics)

In [None]:
main(training_args)

In [10]:
del model
# del trainer

In [12]:
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

0

## Lora Model Inference

In [13]:
label_list = get_label_list(raw_datasets, split="train")
num_labels = len(label_list)
num_labels

2

In [12]:
# Load the model
label_list = get_label_list(raw_datasets, split="train")
num_labels = len(label_list)


peft_model_id = "../../experiments_checkpoints/twitter_disaster"
adapter_config = PeftConfig.from_pretrained(peft_model_id)
config = AutoConfig.from_pretrained(
        adapter_config.base_model_name_or_path,
        num_labels=num_labels,
        finetuning_task="text-classification",
        trust_remote_code=True,
        problem_type="single_label_classification",
    )

inference_model = AutoModelForSequenceClassification.from_pretrained(
    adapter_config.base_model_name_or_path,
    config=config,
    )

label_to_id = {v: i for i, v in enumerate(label_list)}

inference_model.config.label2id = label_to_id
inference_model.config.id2label = {id: label for label, id in label_to_id.items()}
tokenizer = AutoTokenizer.from_pretrained(adapter_config.base_model_name_or_path)
tokenizer.padding_side = "left"  
tokenizer.pad_token = tokenizer.eos_token

# Load the lora model
inference_model = PeftModel.from_pretrained(inference_model, peft_model_id)
inference_model.resize_token_embeddings(len(tokenizer))
inference_model.config.pad_token_id = tokenizer.pad_token_id

# merged_model = inference_model.merge_and_unload() 

# trainer = Trainer(
#             model=inference_model,
#             args=training_args,
#             train_dataset=train_dataset if training_args.do_train else None,
#             eval_dataset=eval_dataset if training_args.do_eval else None,
#             compute_metrics=compute_metrics,
#             tokenizer=tokenizer,
#             data_collator=data_collator,
#             # callbacks=[WandbLoggingCallback()],
#         )

# predictions = trainer.predict(predict_dataset)
# predictions.metrics['test_samples'] = len(predict_dataset)
# trainer.log_metrics("test", predictions.metrics)
# trainer.save_metrics("test", predictions.metrics)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
inference_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): MistralForSequenceClassification(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
           

In [15]:
adapter =  AutoModelForSequenceClassification.from_pretrained(
         pretrained_model_name_or_path=peft_model_id,
    )
adapter

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MistralForSequenceClassification(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=128, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=128, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inpl

In [14]:
model =  AutoModelForSequenceClassification.from_pretrained(
    adapter_config.base_model_name_or_path,
    config=config,
    )
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MistralForSequenceClassification(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): M

In [2]:
peft_model_id = "../../experiments_checkpoints/LoRA/meta_llama/patent_classification_abstract"
adapter =  AutoModelForSequenceClassification.from_pretrained(
         pretrained_model_name_or_path=peft_model_id,
         num_labels=9,
    )
adapter

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Identity()
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=128, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=128, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Identity()
            )
            (lora_A): M

In [1]:
adapter

NameError: name 'adapter' is not defined

In [None]:
del peft_model_id
del inference_model
del merged_model
del trainer

In [None]:
import torch, gc
torch.cuda.empty_cache()
gc.collect()

236

In [None]:
cd ../../test/

/home/harpreet_guest2/akshat/Sequence_Classification/test


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [None]:
pwd

'/home/harpreet_guest2/akshat/Sequence_Classification/notebook/lora'