In [1]:
!pip install bitsandbytes
!pip install datasets
!pip install peft
!pip install trl
!pip install accelerate

In [1]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    Seq2SeqTrainer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    AutoModelForCausalLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    BitsAndBytesConfig,
    AutoConfig,
    DataCollatorWithPadding
)
import json
import os
from datasets import Dataset
import argparse
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, TaskType
from trl import SFTTrainer
import pyarrow as pa
import pyarrow.dataset as ds
# Read the full prompt table from disk.
prompts = pd.read_csv("mlhc_training_data.csv")

# Keep only the four columns selected for the rest of this notebook.
selected_columns = ["prompt", "label", "type", "label_int"]
df = prompts[selected_columns]

# Two-stage split with a fixed seed: 40% of the rows are held out as the test
# set, then 20% of what remains is carved off as a validation set.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.4)
train_df, val_df = train_test_split(train_df, random_state=42, test_size=0.2)

# Plain Python lists of the training prompts and their labels.
train_inputs = train_df["prompt"].to_list()
train_labels = train_df["label"].to_list()

In [29]:
# Locations of the base model and of the fine-tuned checkpoint.
base_model_name = "mistral-7b"
checkpoint_dir = "/content/mistral-lora-token-classification/checkpoint-94"

# Tokenizer of the base model, with the BOS/EOS strings given explicitly.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    bos_token='<s>',
    eos_token='</s>',
    padding=True,
    add_prefix_space=True,
)

# Pad on the right and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Padding collator built on the tokenizer (not referenced again in the cells
# visible here).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 4-bit NF4 quantization, bfloat16 compute, double quantization switched off.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

# Base-model config, with the pad/EOS ids copied over from the tokenizer so the
# model and the tokenizer agree on them.
config = AutoConfig.from_pretrained(base_model_name)
config.pad_token_id = tokenizer.pad_token_id
config.eos_token_id = tokenizer.eos_token_id

# Load the checkpoint as a sequence-classification model, quantized as above.
# NOTE(review): the output recorded for this cell reports `score.weight` as
# newly initialized and `classifier.*` adapter keys as unexpected - confirm
# that the checkpoint's classification head is really the one being loaded.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_dir,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    config=config,
    device_map="auto",
)





Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistral-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading adapter weights from /content/mistral-lora-token-classification/checkpoint-94 led to unexpected keys not found in the model:  ['classifier.bias', 'classifier.weight']. 


In [26]:
from torch.utils.data import DataLoader

# Wrap the held-out prompts and their `label_int` values in a `Dataset`
# with the columns "text" and "labels".
test_columns = {
    "text": test_df["prompt"].to_list(),
    "labels": test_df["label_int"].to_list(),
}
test_dataset = Dataset.from_dict(test_columns)



def preprocess_function(examples, max_length=512):
    """Tokenize a batch of texts and attach their labels.

    Args:
        examples: Mapping with a "text" entry (the texts to tokenize) and a
            "labels" entry (copied through unchanged).
        max_length: Length every sequence is truncated and padded to.
            Defaults to 512, the value this notebook was originally run with.

    Returns:
        The tokenizer's output (requested as PyTorch tensors) with an extra
        "labels" entry holding ``examples["labels"]``.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # pad every sequence to exactly `max_length`
        max_length=max_length,
        return_tensors="pt",
    )
    encoded["labels"] = examples["labels"]
    return encoded


# Run the tokenization over the whole test set in batches; the raw "text"
# column is dropped from the result.
tokenized_test = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns='text',
)

# Have the dataset return torch tensors when it is indexed.
tokenized_test.set_format("torch")

# Fixed-order batches of 32. Note that `test_dataset` is a DataLoader from
# this point on, not a Dataset.
test_dataset = DataLoader(tokenized_test, shuffle=False, batch_size=32)

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

In [34]:
from logging import log
predictions = []   # predicted class index per example
true_labels = []   # ground-truth label per example (stays empty if batches carry none)
logits_set = []    # one CPU float32 logits tensor per example

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch in test_dataset:
        # Labels are kept aside instead of being passed to the model: only the
        # logits are needed here, so there is no reason to compute a loss.
        labels = batch.get("labels")
        # Move the inputs to the model's device explicitly rather than relying
        # on implicit placement.
        model_inputs = {
            name: tensor.to(model.device)
            for name, tensor in batch.items()
            if name != "labels"
        }
        logits = model(**model_inputs).logits
        predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
        # Store the logits on the CPU as float32 so they do not stay on the
        # model's device (and in its dtype) for the rest of the session.
        logits_set.extend(logits.float().cpu())
        if labels is not None:
            true_labels.extend(labels.cpu().numpy())

# Report accuracy / weighted F1 when ground truth was available; otherwise
# just show the predictions.
if true_labels:
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average="weighted")
    print(f"Accuracy on new dataset: {accuracy:.4f}")
    print(f"F1-score (weighted) on new dataset: {f1:.4f}")
else:
    print("Predicted labels:", predictions)  # Print predicted labels if no ground truth


Accuracy on new dataset: 0.9168
F1-score (weighted) on new dataset: 0.9167


In [37]:
# Table that the logits get attached to further down.
# NOTE(review): presumably this holds the same rows as the evaluated test set -
# confirm against the code that wrote this file.
predictions_csv_path = "/content/mlhc_final/mlhc_test_df_pred_w_flipped.csv"
data = pd.read_csv(predictions_csv_path)

In [53]:
# Convert every stored logits tensor into a plain Python list so that the
# values can be placed in a DataFrame column.
logits_list = [example_logits.tolist() for example_logits in logits_set]

In [54]:
# Store the per-example logits in a new "pred_logits" column.
# NOTE(review): this assumes `logits_list` has one entry per row of `data` and
# that the rows of `data` are in the same order as the examples that were fed
# to the model - confirm before relying on the alignment.
data["pred_logits"] = logits_list

In [72]:
# Make sure the output folder exists first: `to_csv` does not create missing
# parent directories and fails if the folder is absent.
os.makedirs("mlhc_final", exist_ok=True)
# Write the table to disk (the row index is written too, pandas' default).
data.to_csv("mlhc_final/mlhc_final_w_logit.csv")