In [2]:
"""
!pip install pip==24.0
!pip install accelerate==0.28.0
!pip install bitsandbytes==0.43.0
!pip install numpy==1.26.4
!pip install pandas==2.2.1
!pip install scikit-learn==1.4.1.post1
!pip install scikit-multilearn==0.2.0
!pip install transformers==4.38.2
!pip install peft==0.9.0
!pip install torch==2.2.1
"""

'\n!pip install pip==24.0\n!pip install accelerate==0.28.0\n!pip install bitsandbytes==0.43.0\n!pip install numpy==1.26.4\n!pip install pandas==2.2.1\n!pip install scikit-learn==1.4.1.post1\n!pip install scikit-multilearn==0.2.0\n!pip install transformers==4.38.2\n!pip install peft==0.9.0\n!pip install torch==2.2.1\n'

In [None]:
# Install the runtime dependencies of this notebook.
# NOTE(review): none of these installs pins a version, while the string literal in
# the first cell lists pinned versions (peft==0.9.0, bitsandbytes==0.43.0,
# accelerate==0.28.0, ...) — confirm whether those pins should be restored.
!python -m pip install --upgrade pip
!pip install peft
!pip install bitsandbytes
!pip install accelerate
# -q keeps pip quiet for the wandb upgrade
!pip install --upgrade -q wandb

In [None]:
import wandb

from kaggle_secrets import UserSecretsClient

# Read the Weights & Biases API key from Kaggle's secret store so the key never
# appears in the notebook source.
user_secrets = UserSecretsClient()

# The secret is looked up under the label "wandb_api".
# If your key is stored under a different label, change the string below.
wandb_api = user_secrets.get_secret("wandb_api")

# Authenticate this session with the retrieved key.
wandb.login(key=wandb_api)


In [None]:
import os
import random
import functools
import csv
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    get_linear_schedule_with_warmup
)


In [None]:
#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
# Seed every RNG this notebook imports so a fresh "Restart & Run All" is repeatable.
SEED = 0
random.seed(SEED)        # Python's own RNG: imported above but previously never seeded
np.random.seed(SEED)     # NumPy's global RNG
torch.manual_seed(SEED)  # PyTorch RNG

In [None]:
# Load dataset
df = pd.read_csv('/kaggle/input/baseline-dataset/baseline_dataset.csv')

# Subsample in two stages: keep at most the first 10000 rows of each category,
# then, from that result, keep at most the first 5000 rows of each label.
# NOTE(review): the second cap is applied per label across ALL categories (not per
# category/label pair), so the result holds at most 5000 rows per label and is not
# guaranteed to be balanced across categories — confirm this is the intended sampling.
df = df.groupby('category').head(10000).groupby('label').head(5000)

# Print the distribution of the sampled dataset
print(df['category'].value_counts())
print(df['label'].value_counts())

In [None]:
# 'category' is only used for the sampling and distribution printout above.
# errors='ignore' makes this cell safe to re-run after the column is already gone
# (the original raised KeyError on a second execution).
df = df.drop(columns='category', errors='ignore')

In [None]:
# Discard every row that has a missing value in any column
df = df.dropna()

In [None]:
# Show the distinct values present in the label column
print(df.loc[:, 'label'].unique())


In [None]:
# Pull the claim text and its label out of the frame as two parallel arrays
text, labels = df['claim'].values, df['label'].values


In [None]:
# Class weights for binary classification: inverse class frequency, indexed by
# class id (class_weights[i] is the weight of class i), so the rarer class gets
# the larger weight. The original had the two entries swapped.
# NOTE(review): nothing else in this notebook reads class_weights; the loss in
# CustomTrainer is unweighted — confirm whether it should consume this tensor.
class_counts = np.bincount(labels)
class_weights = torch.tensor(
    [len(labels) / class_counts[0], len(labels) / class_counts[1]],
    dtype=torch.float32,
)


In [None]:
# Hold out 10% of the rows for validation, stratified on the label, with a fixed
# split seed
x_train, x_val, y_train, y_val = train_test_split(
    text,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)


In [None]:
# Create Hugging Face datasets: one Dataset per split, keyed by split name
ds = DatasetDict({
    split_name: Dataset.from_dict({'text': split_text, 'labels': split_labels})
    for split_name, split_text, split_labels in (
        ('train', x_train, y_train),
        ('val', x_val, y_val),
    )
})


In [None]:
# Base checkpoint and its tokenizer
model_name = 'mistralai/Mistral-7B-v0.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably this checkpoint's tokenizer has no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
    """Tokenize a batch of examples and carry their labels through.

    Args:
        examples: Batch mapping with a 'text' entry (the strings to encode) and
            a 'labels' entry.
        tokenizer: Callable tokenizer; invoked with truncation enabled and
            ``max_length=512``.

    Returns:
        The tokenizer's output with ``'labels'`` set to ``examples['labels']``.
    """
    # No padding at this stage: collate_fn pads every training batch to its own
    # longest sequence. Padding here as well would stretch each row to the longest
    # sequence of its whole map batch and stack a second padding pass on top.
    tokenized_inputs = tokenizer(examples['text'], truncation=True, padding=False, max_length=512)
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs

# Tokenize both splits in batches, binding the tokenizer into the mapping function.
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
# Have the dataset hand back torch tensors; collate_fn pads and stacks them with
# torch utilities.
tokenized_ds = tokenized_ds.with_format('torch')


In [None]:
# Quantization config: load the base model weights in 4-bit
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # turn on 4-bit quantization
    bnb_4bit_quant_type='nf4',              # 4-bit NormalFloat quantization type
    bnb_4bit_use_double_quant=True,         # additionally quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

# LoRA config: low-rank adapters on the four attention projections
lora_config = LoraConfig(
    r=16,                 # rank of the low-rank update matrices
    lora_alpha=8,         # scaling factor applied to the LoRA update
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05,    # dropout probability inside the LoRA layers
    bias='none',          # do not train any bias parameters
    task_type='SEQ_CLS',  # sequence-classification task
)

# Load the base checkpoint, quantized as configured above, with a
# sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=1  # single logit; CustomTrainer feeds it to a BCE-with-logits loss
)
# Prepare the quantized model for training, then wrap it with the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# The tokenizer's pad token was assigned manually, so mirror its id into the model config.
model.config.pad_token_id = tokenizer.pad_token_id


In [None]:
# Batch collator: pads variable-length examples into rectangular tensors
def collate_fn(batch, tokenizer):
    """Assemble a list of per-example dicts into one padded batch dict.

    Args:
        batch: Sequence of dicts, each holding 'input_ids', 'attention_mask'
            and 'labels' tensors. Any other keys are ignored.
        tokenizer: Object exposing ``pad_token_id``, used as the fill value for
            'input_ids'.

    Returns:
        Dict with 'input_ids' and 'attention_mask' padded (batch-first) to the
        longest example in the batch, and 'labels' stacked into one tensor.
        'attention_mask' is filled with 0 at padded positions.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence

    # Gather each field across the batch before any padding happens.
    token_rows = [example['input_ids'] for example in batch]
    mask_rows = [example['attention_mask'] for example in batch]
    label_rows = [example['labels'] for example in batch]

    return {
        'input_ids': pad_sequence(
            token_rows, batch_first=True, padding_value=tokenizer.pad_token_id
        ),
        'attention_mask': pad_sequence(mask_rows, batch_first=True, padding_value=0),
        'labels': torch.stack(label_rows),
    }

# Custom Trainer that trains the single-logit head with binary cross-entropy
class CustomTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits computed on the model's logits.

    NOTE(review): no class weighting is applied here; the `class_weights` tensor
    computed earlier in the notebook is not used by this loss — confirm intent.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the binary cross-entropy loss for one batch.

        Args:
            model: Model to run; called as ``model(**inputs)`` after the labels
                have been removed from ``inputs``.
            inputs: Batch dict. Must contain "labels", which is popped (the dict
                is modified in place) so the model never sees it.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just ``loss``.
            num_items_in_batch: Accepted and ignored. Newer transformers releases
                pass this keyword to ``compute_loss``; without the parameter the
                override fails with a TypeError on those releases.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # A one-logit head yields a trailing dimension of size 1; drop it so the
        # logits line up with the label tensor.
        if logits.shape[-1] == 1:
            logits = logits.squeeze(-1)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

# Evaluation metrics for the binary classifier
def compute_metrics(p):
    """Turn raw model outputs into precision / recall / F1 / accuracy.

    Args:
        p: Pair that unpacks into ``(predictions, labels)``. The predictions are
            passed through a sigmoid and rounded, i.e. thresholded at 0.5.

    Returns:
        Dict with keys 'precision', 'recall', 'f1_score' and 'accuracy'.
        'f1_score' is computed with ``average='weighted'``, while precision and
        recall use the scorer's default averaging. The same dict is also sent to
        ``wandb.log`` before being returned.
    """
    raw_scores, labels = p

    # Squash to probabilities, then round to hard 0/1 decisions.
    probabilities = torch.sigmoid(torch.tensor(raw_scores)).numpy()
    hard_predictions = np.round(probabilities)

    metrics = {
        'precision': precision_score(labels, hard_predictions),
        'recall': recall_score(labels, hard_predictions),
        'f1_score': f1_score(labels, hard_predictions, average='weighted'),
        'accuracy': accuracy_score(labels, hard_predictions),
    }

    wandb.log(metrics)
    return metrics


In [None]:
# Training configuration: gradient accumulation over 2 batches, evaluation and
# checkpointing every 50 steps, and reloading the best checkpoint when training ends.
# NOTE(review): no early-stopping callback is configured anywhere in this notebook.
# NOTE(review): fp16=True is set here while the quantization config uses
# torch.bfloat16 as its compute dtype — confirm the mix is intended.
training_args = TrainingArguments(
    output_dir='baseline_binary_mistral_10000',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy='steps',
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
    logging_steps=100,
    fp16=True,
    report_to='wandb',  # Enable wandb logging
)


In [None]:
# Calculate the number of optimizer updates the scheduler has to span.
# One update happens every `gradient_accumulation_steps` batches, so the batch
# count must be divided by it; otherwise the linear decay is stretched over more
# steps than training performs and the learning rate never reaches zero.
batches_per_epoch = int(np.ceil(
    len(tokenized_ds['train']) / training_args.per_device_train_batch_size
))  # ceil: the final, partial batch is counted too
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
num_training_steps = int(np.ceil(updates_per_epoch * training_args.num_train_epochs))
# NOTE(review): this assumes one device; with several GPUs the number of examples
# consumed per step is larger — confirm against the runtime actually used.

# Initialize optimizer and scheduler (learning rate taken from the training args
# so the two cannot drift apart)
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Initialize the Trainer: the custom BCE loss (CustomTrainer), the padding
# collate_fn bound to this tokenizer, the metric function, and the manually built
# optimizer/scheduler pair passed in explicitly via `optimizers`.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)


In [None]:
# Re-create the `np.object` alias on the NumPy module before training starts.
# NOTE(review): presumably a workaround for a dependency that still references
# `np.object`, which recent NumPy releases no longer provide — confirm it is still needed.
np.object = object

# Run fine-tuning with the trainer configured above
trainer.train()