In [3]:
# install necessary libraries for the below import statements
!pip install bitsandbytes
!pip install datasets
!pip install peft
!pip install trl
!pip install accelerate

In [1]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    Seq2SeqTrainer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    AutoModelForCausalLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    BitsAndBytesConfig,
    AutoConfig,
    DataCollatorWithPadding
)
import json
import os
from datasets import Dataset
import argparse
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, TaskType
from trl import SFTTrainer
import pyarrow as pa
import pyarrow.dataset as ds
# Read the raw prompt table from disk
prompts = pd.read_csv("mlhc_training_data.csv")

# Keep only the columns the rest of the notebook uses
df = prompts[["prompt", "label", "type", "label_int"]]

# Hold out 40% of the rows as the test set, then carve 20% of the remaining
# rows off as the validation set (fixed seed so the split is repeatable).
train_val_df, test_df = train_test_split(df, test_size=0.4, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

# Plain Python lists of the training prompts and their labels
train_inputs, train_labels = (
    train_df[column].tolist() for column in ("prompt", "label")
)

In [2]:
# Build the tokenizer for the "mistral-7b" checkpoint with explicit BOS/EOS tokens.
# NOTE(review): `padding=True` is normally a call-time argument rather than a
# constructor option -- confirm it has any effect here.
tokenizer = AutoTokenizer.from_pretrained(
    "mistral-7b",
    bos_token='<s>',
    eos_token='</s>',
    padding=True,
    add_prefix_space=True,
)

# Use the EOS token as the padding token and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'

# Collator that pads each batch using the tokenizer configured above
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# 4-bit NF4 quantization settings; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Model config: align the special-token ids with the tokenizer and declare the
# number of output classes BEFORE the model is instantiated, so the
# classification head is built with the right shape.
config = AutoConfig.from_pretrained('mistral-7b')
config.pad_token_id = tokenizer.pad_token_id
config.eos_token_id = tokenizer.eos_token_id
config.num_labels = 2  # Number of classes in the sequence classification task

# Load the quantized backbone with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "mistral-7b",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    config=config,
    device_map="auto",
)

model.config.use_cache = False  # KV cache is not needed while training
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# No extra `model.classifier` layer is attached here: the loading log shows the
# head of this architecture is `score` (the newly initialized `score.weight`),
# so a separate `classifier` Linear would be a dead module that receives no
# gradient signal and only adds parameters to the saved adapter.

# Convert the model to LoRA using PEFT
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    # LoRA adapters are injected into the attention query/value projections
    target_modules=[
        "q_proj",
        "v_proj",
    ],
)

model = get_peft_model(model, peft_config)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistral-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Wrap the training prompts and their integer labels in a `Dataset`
train_dataset = Dataset.from_dict(
    dict(
        text=train_inputs,
        labels=train_df["label_int"].tolist(),
    )
)

def preprocess_function(examples):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: mapping with a "text" entry (the prompts to tokenize) and a
            "labels" entry (copied through unchanged).

    Returns:
        The object returned by the module-level `tokenizer`, with an added
        "labels" entry.
    """
    # Every prompt is truncated / padded to exactly 512 tokens.
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    encoded["labels"] = examples["labels"]
    return encoded



# Tokenize the training set in batches, then expose its columns as torch tensors
train_dataset = train_dataset.map(function=preprocess_function, batched=True)
train_dataset.set_format(type="torch")


Map:   0%|          | 0/1498 [00:00<?, ? examples/s]

In [4]:

# Build the validation dataset from the held-out validation rows and tokenize
# it with the same preprocessing as the training set.
val_dataset = Dataset.from_dict(
    {
        "text": val_df["prompt"].tolist(),
        "labels": val_df["label_int"].tolist(),
    }
).map(preprocess_function, batched=True)

# Expose the columns as torch tensors
val_dataset.set_format("torch")


Map:   0%|          | 0/375 [00:00<?, ? examples/s]

In [5]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (reference labels) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys 'accuracy' and 'f1'.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(pred.label_ids, predicted_classes),
        'f1': f1_score(pred.label_ids, predicted_classes, average='weighted'),
    }

In [6]:
# Hyperparameters for the fine-tuning run
lr = 2e-5
batch_size = 16
num_epochs = 1

# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded when
# training finishes.
# NOTE(review): with lr_scheduler_type="constant" the warmup_ratio below is
# presumably ignored; use "constant_with_warmup" if warmup is actually wanted.
training_args = TrainingArguments(
    output_dir="mistral-lora-token-classification",
    learning_rate=lr,
    lr_scheduler_type="constant",
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.001,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_checkpointing=True,
)

# Trainer wiring: LoRA model, tokenized train/validation sets, padding collator
# and the accuracy / F1 metric callback.
mistral_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# The former `dataloader_config = DataLoaderConfiguration(...)` line was removed:
# `DataLoaderConfiguration` is never imported in this notebook (NameError on a
# fresh kernel) and its result was never passed to the Trainer.


In [7]:
mistral_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.334341,0.925333,0.924963


Checkpoint destination directory mistral-lora-token-classification/checkpoint-94 already exists and is non-empty. Saving will proceed but saved results may be invalid.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


TrainOutput(global_step=94, training_loss=0.6143440084254488, metrics={'train_runtime': 3906.8214, 'train_samples_per_second': 0.383, 'train_steps_per_second': 0.024, 'total_flos': 3.215057524155187e+16, 'train_loss': 0.6143440084254488, 'epoch': 1.0})

In [9]:
# Create the output tree mlhc_final/{data,models} and make mlhc_final the
# working directory; the cells below write to the relative paths 'data/...' and
# 'models/...'.
# The guard keeps the cell idempotent: re-running it neither raises
# FileExistsError nor creates a nested mlhc_final/mlhc_final directory.
if os.path.basename(os.getcwd()) != 'mlhc_final':
    os.makedirs('mlhc_final', exist_ok=True)
    os.chdir(os.path.join(os.getcwd(), 'mlhc_final'))
os.makedirs('data', exist_ok=True)
os.makedirs('models', exist_ok=True)

'/content/mlhc_final'

In [13]:
# Persist the fine-tuned model and the tokenizer.
# The two target directories were previously swapped (the tokenizer was written
# to '..._ft_model' and the model to '..._ft_tokenizer'); each artifact now goes
# to the directory that is named after it.
# NOTE(review): any loader that relied on the old, swapped locations must be
# updated accordingly.
mistral_trainer.model.save_pretrained('models/mlhc_mistral7b_ft_model')
tokenizer.save_pretrained('models/mlhc_mistral7b_ft_tokenizer')

# Save the exact train / validation / test splits used for this run.
train_df.to_csv("data/mlhc_train_df.csv")
val_df.to_csv("data/mlhc_val_df.csv")
test_df.to_csv("data/mlhc_test_df.csv")

In [20]:
# DataLoader is used to batch the test set for manual inference
from torch.utils.data import DataLoader

# Reuse the in-memory model held by the trainer
model_ft = mistral_trainer.model

# Build the test dataset, tokenize it and drop the raw "text" column so that
# only the tokenizer outputs and the labels remain.
test_dataset = Dataset.from_dict(
    {
        "text": test_df["prompt"].tolist(),
        "labels": test_df["label_int"].tolist(),
    }
).map(preprocess_function, batched=True, remove_columns='text')

# Expose the columns as torch tensors
test_dataset.set_format("torch")

# From here on `test_dataset` is a DataLoader yielding fixed-order batches of 16
test_dataset = DataLoader(test_dataset, batch_size=16, shuffle=False)

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

In [21]:
# Run the fine-tuned model over the test batches and score the predictions.
model_ft.eval()  # Set model to evaluation mode
predictions = []  # predicted class index for every test example
true_labels = []  # reference label for every test example
pred_logits = []  # raw logits, one tensor per batch

with torch.no_grad():
    for batch in test_dataset:
        logits = model_ft(**batch).logits
        pred_logits.append(logits)
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        true_labels.extend(batch["labels"].cpu().numpy())

# `true_labels` is always a list here, so the former `is not None` checks were
# always true and the label-free `else` branch could never run; both are removed.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average="weighted")
print(f"Accuracy on new dataset: {accuracy:.4f}")
print(f"F1-score (weighted) on new dataset: {f1:.4f}")


Accuracy on new dataset: 0.9152
F1-score (weighted) on new dataset: 0.9151


In [23]:
# Attach the predicted class to each test row. `assign` builds a new frame
# instead of writing into `test_df` in place, which can trigger pandas'
# SettingWithCopyWarning because `test_df` was derived from another frame.
test_df = test_df.assign(pred=predictions)

In [25]:
# Write the test rows together with their predictions to disk
test_df.to_csv(path_or_buf="data/mlhc_test_df_pred.csv")