In [1]:
import os
import random
import functools
import csv
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    get_linear_schedule_with_warmup
)


In [None]:
!pip show numpy
!pip show pandas
!pip show torch
!pip show scikit-multilearn
!pip show scikit-learn
!pip show datasets
!pip show transformers

In [2]:
# Configure PyTorch's CUDA caching allocator to use expandable segments.
# NOTE(review): presumably meant to reduce fragmentation-related OOMs - confirm. It is set
# here, before any cell in this notebook moves data or a model onto the GPU.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [3]:
# Set random seeds for reproducibility. Every RNG the notebook imports is seeded from one
# constant: Python's `random` (imported above but previously left unseeded), NumPy and PyTorch.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x28b7a0d15f0>

In [5]:
# Load the preprocessed baseline dataset
df = pd.read_csv('preprocessed_datasets/baseline_dataset.csv')

# Subsample: keep the first 10000 rows of every category, then the first 5000 rows of every
# label among those rows (the label cap is applied across categories, not within each one)
capped_by_category = df.groupby('category').head(10000)
df = capped_by_category.groupby('label').head(5000)

# Print the distribution of the subsampled dataset
for column in ('category', 'label'):
    print(df[column].value_counts())

category
0    3000
Name: count, dtype: int64
label
1    1596
0    1404
Name: count, dtype: int64


In [5]:
# The category column was only needed for subsampling; it is not used from here on
df = df.drop(columns='category')

In [6]:
# Discard every row that has a missing value in any remaining column
df = df.dropna()

In [7]:
# Display the unique values in the label column
print(df['label'].unique())

[1 0]


In [8]:
# Separate the claim text (model input) from the label column (target)
text, labels = df['claim'].values, df['label'].values

In [9]:
# Inverse-frequency class weights for binary classification.
# Index i holds the weight of class i (total samples / samples of class i), so the rarer
# class receives the larger weight. The two entries were previously swapped.
class_counts = np.bincount(labels)
class_weights = torch.tensor(
    [len(labels) / class_counts[0], len(labels) / class_counts[1]],
    dtype=torch.float32,
)

In [10]:
# Hold out 10% of the samples for validation; stratifying on the labels keeps the label
# ratio the same in both splits, and the fixed random_state makes the split repeatable
x_train, x_val, y_train, y_val = train_test_split(
    text,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)

In [11]:
# Wrap each split as a Hugging Face dataset with 'text' and 'labels' columns
splits = {'train': (x_train, y_train), 'val': (x_val, y_val)}
ds = DatasetDict({
    split_name: Dataset.from_dict({'text': split_text, 'labels': split_labels})
    for split_name, (split_text, split_labels) in splits.items()
})

In [12]:
# Base model checkpoint and its matching tokenizer
model_name = 'mistralai/Mistral-7B-v0.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token
# (NOTE(review): presumably because this tokenizer ships without a pad token - confirm)
tokenizer.pad_token = tokenizer.eos_token

In [13]:
# Preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
    """Tokenize a batch of examples and carry its labels along.

    Args:
        examples: mapping with a 'text' entry (the texts to encode) and a 'labels' entry.
        tokenizer: callable applied to the texts with truncation and padding enabled and
            a maximum length of 512.

    Returns:
        Whatever the tokenizer returned, with examples['labels'] stored under 'labels'.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=512,
        padding=True,
        truncation=True,
    )
    encoded['labels'] = examples['labels']
    return encoded

# Tokenize every split in batches, then have the datasets hand back torch tensors
tokenize_batch = functools.partial(tokenize_examples, tokenizer=tokenizer)
tokenized_ds = ds.map(tokenize_batch, batched=True).with_format('torch')

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [14]:
# Quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # enable 4-bit quantization
    bnb_4bit_quant_type='nf4',  # information-theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant=True,  # quantize the already-quantized weights a second time
    bnb_4bit_compute_dtype=torch.bfloat16,  # optimized floating-point format for ML
)

# LoRA config
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    r=16,  # the dimension of the low-rank matrices
    lora_alpha=8,  # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05,  # dropout probability of the LoRA layers
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    bias='none',  # whether to train bias weights, set to 'none' for attention layers
)

# Load the quantized base model with a single-logit classification head, prepare it for
# k-bit training, and wrap it with the LoRA adapters
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    quantization_config=quantization_config,
)
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# The model must know which token id is padding; keep it in sync with the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  new_value = value.to(device)
Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    """Assemble a list of examples into one padded batch.

    Args:
        batch: sequence of mappings, each holding 'input_ids', 'attention_mask' and
            'labels' tensors.
        tokenizer: object whose `pad_token_id` is used to fill the padded positions of
            'input_ids'.

    Returns:
        dict with 'input_ids' and 'attention_mask' padded at the end of each sequence to
        the longest one in the batch (batch dimension first; the mask is padded with 0),
        and 'labels' stacked into a single tensor.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    token_ids = [example['input_ids'] for example in batch]
    masks = [example['attention_mask'] for example in batch]
    targets = [example['labels'] for example in batch]
    return {
        'input_ids': pad(token_ids, batch_first=True, padding_value=tokenizer.pad_token_id),
        'attention_mask': pad(masks, batch_first=True, padding_value=0),
        'labels': torch.stack(targets),
    }

# Custom Trainer for binary classification with an optional positive-class weight
class CustomTrainer(Trainer):
    """Trainer that computes binary cross-entropy on the model's logits.

    Args:
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
        pos_weight: optional keyword-only weight applied to the positive class in the
            loss (e.g. number of negatives / number of positives). When omitted, the
            loss is unweighted, which is the behaviour this class had before the
            argument existed.
    """

    def __init__(self, *args, pos_weight=None, **kwargs):
        # Stored before the parent initialiser runs so the attribute always exists
        self.pos_weight = (
            None if pos_weight is None
            else torch.as_tensor(pos_weight, dtype=torch.float32)
        )
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the BCE-with-logits loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping; its "labels" entry is removed and the rest is passed
                to the model as keyword arguments.
            return_outputs: when true, return `(loss, outputs)` instead of just the loss.
            num_items_in_batch: accepted (and ignored) because newer transformers
                releases pass it to `compute_loss` as a keyword argument.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # A single-logit head yields a trailing dimension of 1; drop it so the logits
        # line up with the labels
        if logits.shape[-1] == 1:
            logits = logits.squeeze(-1)
        pos_weight = self.pos_weight
        if pos_weight is not None:
            pos_weight = pos_weight.to(logits.device)
        loss = F.binary_cross_entropy_with_logits(
            logits, labels.float(), pos_weight=pos_weight
        )
        return (loss, outputs) if return_outputs else loss

# Metrics computation for binary classification
def compute_metrics(p):
    """Compute validation metrics from raw logits.

    Args:
        p: pair of (predictions, labels); predictions are logits, which are passed
            through a sigmoid and rounded to 0/1 (i.e. thresholded at 0.5).

    Returns:
        dict with 'precision', 'recall', 'f1_score' and 'accuracy'. Note that the F1
        score uses average='weighted' while precision and recall use the scorer's
        default averaging.
    """
    logits, labels = p
    probabilities = torch.sigmoid(torch.tensor(logits)).numpy()
    predicted = np.round(probabilities)
    return {
        'precision': precision_score(labels, predicted),
        'recall': recall_score(labels, predicted),
        'f1_score': f1_score(labels, predicted, average='weighted'),
        'accuracy': accuracy_score(labels, predicted),
    }

In [16]:
# Training configuration: gradient accumulation, mixed precision, and evaluation /
# checkpointing on a fixed step cadence so the best checkpoint can be restored at the end
training_args = TrainingArguments(
    output_dir='baseline_binary_mistral_10000',
    # Optimisation
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # accumulate gradients over 2 batches per optimizer update
    # NOTE(review): fp16 mixed precision is enabled here while the quantization config
    # uses bfloat16 as its compute dtype - confirm this combination is intended
    fp16=True,
    # Evaluation, checkpointing and logging
    evaluation_strategy='steps',
    eval_steps=50,  # evaluate every 50 steps
    save_steps=50,  # save a checkpoint every 50 steps
    load_best_model_at_end=True,
    logging_steps=100,
)

In [17]:
# Calculate the number of optimizer updates for the scheduler.
# Gradients are accumulated over `gradient_accumulation_steps` batches per update, so the
# schedule has to be sized in updates rather than batches; sizing it in batches makes the
# linear decay stop roughly halfway instead of reaching zero at the end of training.
# Assumes single-device training (per-device batch size == effective batch size per step).
batches_per_epoch = -(-len(tokenized_ds['train']) // training_args.per_device_train_batch_size)  # ceiling division
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
num_training_steps = int(-(-(updates_per_epoch * training_args.num_train_epochs) // 1))  # round up

# Initialize optimizer and scheduler. Only parameters that require gradients are handed to
# the optimizer, and its hyperparameters are read from `training_args` so they cannot drift
# apart from the values declared there.
optimizer = torch.optim.AdamW(
    [param for param in model.parameters() if param.requires_grad],
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Initialize the Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)


In [18]:
# Run fine-tuning; per `training_args`, evaluation and checkpointing happen every 50 steps
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: arman-dogru. Use `wandb login --relogin` to force relogin


  0%|          | 0/2248 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 

In [None]:
# Directory the fine-tuned PEFT model is saved to and later reloaded from
peft_model_id = 'baseline_binary_mistral_10000'

In [None]:
# Save the trained model held by the trainer to `peft_model_id`
trainer.model.save_pretrained(peft_model_id)

In [None]:
# NOTE(review): this cell repeats the save performed by the previous cell - it looks
# redundant and can probably be deleted; confirm before removing
trainer.model.save_pretrained(peft_model_id)

In [None]:
# Reload the fine-tuned model from `peft_model_id`.
# `num_labels=1` has to match the single-logit head the model was trained with; when it is
# omitted the classification head is built with the default of two outputs.
# NOTE(review): the directory holds a PEFT save, so this relies on transformers resolving
# the adapter and its base model - confirm the trained classification head is restored.
model = AutoModelForSequenceClassification.from_pretrained(peft_model_id, num_labels=1)