### 1. Environment Setup

In [6]:
import csv
import os

import pandas as pd
import torch
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

### Set Model and Epochs

In [7]:
# Number of fine-tuning epochs; also embedded in the output file names.
epochs: int = 5
# Hub identifier of the base checkpoint that gets fine-tuned.
model_name: str = 'meta-llama/Meta-Llama-3-8B'

### Choose between the augmented and the non-augmented dataset

In [None]:
# Root folders for the fine-tuned model checkpoints and the prediction CSVs.
models_folder = '../standard_finetuned_models/'
predictions_folder = '../predictions_csv/'

# Truthy -> the generated (augmented) samples are appended to the training split.
augmented_dataset = 0

# Build the run name exactly once so the model directory and the predictions
# CSV always share the same stem (previously the expression was duplicated
# four times across two branches).
run_name = f'outputmodel_{model_name.split("/")[-1]}'
if augmented_dataset:
    run_name += '_augmented_dataset'
run_name += f'_{epochs}_epochs'

output_dir = f'{models_folder}{run_name}'
predictions_csv_output = f'{predictions_folder}{run_name}.csv'

# Maximum number of tokens kept per quote when tokenizing.
MAX_LEN = 512


### 2. Load Dataset

In [9]:

# Load the pre-computed train / validation / test splits.
df_train = pd.read_csv('../dataset_splits/train_dataset.csv')
df_val = pd.read_csv('../dataset_splits/val_dataset.csv')
df_test = pd.read_csv('../dataset_splits/test_dataset.csv')

if augmented_dataset:
    # Only the training split is extended with the generated samples.
    df_temp = pd.read_csv('../generated_datasets/merged_dataset.csv')
    # ignore_index=True gives the combined frame a clean 0..n-1 index instead
    # of repeating the row labels of both files (presumably this also keeps
    # Dataset.from_pandas from carrying the old index along as an extra
    # column -- confirm).
    df_train = pd.concat([df_train, df_temp], ignore_index=True)

### 3. Prepare Datasets for Training

In [10]:
# Wrap each pandas split in a Hugging Face Dataset and group them by split name.
split_frames = {'train': df_train, 'val': df_val, 'test': df_test}
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in split_frames.items()
})

# Keep the per-split handles available under their usual names.
dataset_train, dataset_val, dataset_test = dataset['train'], dataset['val'], dataset['test']

# Inverse-frequency class weights, normalised to sum to 1.
# value_counts() orders its result by frequency, not by label, so it is
# sorted by label here: the weight tensor must be indexed by class id
# (position 0 -> label 0, position 1 -> label 1) for the weighted loss.
# Assumes the 'Memorable' labels are the integers 0..num_labels-1 and that
# every class occurs in the training split -- TODO confirm.
class_counts = df_train['Memorable'].value_counts(normalize=True).sort_index()
class_weights = torch.tensor((1 / class_counts).tolist(), dtype=torch.float32)
class_weights /= class_weights.sum()

### 4. Model configurations

In [11]:
# Seed PyTorch before the model is built: the classification head is not part
# of the checkpoint and is freshly initialised, so without a fixed seed every
# run would start from different weights.
SEED = 42
torch.manual_seed(SEED)

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# LoRA adapters on the attention projections, configured for sequence
# classification.
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type='SEQ_CLS'
)

# Base checkpoint loaded quantized, with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Apply the LoRA adapters, load the tokenizer, and set the pad token

In [12]:
# Tokenizer of the base checkpoint; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

# Prepare the quantized model for training and attach the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

# Keep the model config in sync with the tokenizer and switch off the KV cache.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### 5. Tokenize and Process Dataset

In [None]:
def preprocessing_function(examples):
    """Tokenize the 'Quote' field of a batch, truncating to MAX_LEN tokens."""
    quotes = examples['Quote']
    encoded = tokenizer(quotes, truncation=True, max_length=MAX_LEN)
    return encoded

# Tokenize every split in batches, then expose the target column as 'label'.
tokenized_datasets = (
    dataset
    .map(preprocessing_function, batched=True)
    .rename_column('Memorable', 'label')
)
# Have the datasets hand back torch tensors.
tokenized_datasets.set_format('torch')


Map:   0%|          | 0/3075 [00:00<?, ? examples/s]

Map:   0%|          | 0/659 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

### 6. Define Custom Trainer

In [None]:
# Collator that pads the examples of a batch with the tokenizer when batching.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)


class CustomTrainer(Trainer):
    """Trainer that optimises an (optionally class-weighted) cross-entropy loss.

    Args:
        class_weights: optional 1-D tensor with one weight per class, indexed
            by class id. When None, plain cross-entropy is used.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Move the weights to the training device once, up front.
        self.class_weights = class_weights.to(self.args.device) if class_weights is not None else None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: model to run the forward pass with.
            inputs: batch dict; its 'labels' entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: when True, return ``(loss, outputs)``.
            num_items_in_batch: accepted for compatibility with newer
                ``Trainer`` versions that pass it; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop('labels').long()
        outputs = model(**inputs)
        logits = outputs.get('logits')
        weight = None
        if self.class_weights is not None:
            # cross_entropy needs the weights on the logits' device and dtype.
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)
        loss = F.cross_entropy(logits, labels, weight=weight)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Training hyper-parameters: evaluation and checkpointing both run once per
# epoch, and the best checkpoint is reloaded when training ends.
training_kwargs = dict(
    output_dir=output_dir,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=epochs,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_steps=100,
)
training_args = TrainingArguments(**training_kwargs)

# Trainer with the class-weighted loss, validating on the 'val' split.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    tokenizer=tokenizer,
    data_collator=collate_fn,
    class_weights=class_weights,
)
trainer = CustomTrainer(**trainer_kwargs)

# Run the fine-tuning.
# NOTE(review): the saved output of this cell is a NumPy 2.0 `copy=False`
# ValueError, which presumably comes from an installed library that predates
# NumPy 2 -- confirm the environment (e.g. pin numpy<2 or upgrade the
# offending package) before re-running.
train_result = trainer.train()


ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.

In [None]:
# Evaluation on test set; keep and show the metrics instead of discarding them.
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
print(test_metrics)


def make_predictions(trainer, df_test, batch_size=32):
    """Predict a class for every quote in ``df_test`` and save the result.

    The predictions are written into a new 'predictions' column of
    ``df_test`` (the frame is modified in place) and the frame is saved to
    ``predictions_csv_output``.

    Args:
        trainer: trainer holding the fine-tuned model and the target device.
        df_test: frame with a 'Quote' column of input texts.
        batch_size: number of quotes per forward pass.
    """
    sentences = df_test['Quote'].tolist()
    model = trainer.model
    # Inference must run in eval mode; restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    batch_predictions = []
    try:
        for start in range(0, len(sentences), batch_size):
            batch_sentences = sentences[start:start + batch_size]
            # Same truncation length as the training-time preprocessing.
            inputs = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN)
            inputs = {k: v.to(trainer.args.device) for k, v in inputs.items()}
            with torch.no_grad():
                logits = model(**inputs)['logits']
            # Only the class ids are kept, on the CPU, so no logits pile up
            # on the accelerator.
            batch_predictions.append(logits.argmax(dim=1).cpu())
    finally:
        model.train(was_training)

    # torch.cat rejects an empty list, so an empty frame gets an empty column.
    predictions = torch.cat(batch_predictions, dim=0).numpy() if batch_predictions else []
    df_test['predictions'] = predictions

    # Create the target folder so a missing directory cannot lose the results.
    os.makedirs(os.path.dirname(predictions_csv_output) or '.', exist_ok=True)
    df_test.to_csv(predictions_csv_output, index=False)


make_predictions(trainer, df_test)

In [None]:
def get_performance_metrics(df_test):
    """Print classification metrics comparing 'Memorable' with 'predictions'.

    Prints the confusion matrix, the classification report, the balanced
    accuracy and the plain accuracy, in that order.
    """
    y_true, y_pred = df_test['Memorable'], df_test['predictions']

    print('Confusion Matrix:')
    print(confusion_matrix(y_true, y_pred))
    print('\nClassification Report:')
    print(classification_report(y_true, y_pred))

    # Single-number scores share one print pattern.
    scalar_metrics = (
        ('Balanced Accuracy Score:', balanced_accuracy_score),
        ('Accuracy Score:', accuracy_score),
    )
    for caption, metric_fn in scalar_metrics:
        print(caption, metric_fn(y_true, y_pred))


get_performance_metrics(df_test)

### Save model and tokenizer

In [None]:
# Persist the trained model via the trainer into the run's output directory.
trainer.save_model(output_dir)
# Save the tokenizer alongside it; as the cell's last expression, its return
# value is what the notebook displays.
tokenizer.save_pretrained(output_dir)