### Import the dependencies

In [None]:
import os
import pandas as pd
from datasets import Dataset
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (AutoTokenizer, TrainingArguments)
import pandas as pd
from datasets import Dataset
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported

# Directory used as the trainer's output_dir and as the destination for the
# saved model, tokenizer and validation CSV (relative path).
output_dir = '../finetuned_models/outputmodel_standard/llama3'

## Step 1: Load and Preprocess the Dataset

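First, we load the dataset from `quotes_classification_data.csv` and preprocess it for binary classification: the `Memorable` label is mapped from `Yes`/`No` to `1`/`0`, and the data is split 80/20 into training and validation sets.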

In [None]:
# Load the quotes CSV, encode the 'Memorable' label as 0/1 and build the
# train/validation splits.
# Each failure is reported with a readable message and then re-raised, so a
# failed load stops execution here instead of surfacing later as a NameError
# (or silently reusing stale variables left over from an earlier kernel run).
file_path = '../dataset/quotes_classification_data.csv'
try:
    dataset = pd.read_csv(file_path)
    print('Dataset lodaded succesfully')

    if 'Memorable' not in dataset.columns:
        raise KeyError('Memorable column not found in dataset')

    # Convert 'Memorable' column to binary values. Any value other than
    # 'Yes'/'No' maps to NaN and is rejected by the check below.
    dataset['Memorable'] = dataset['Memorable'].map({'Yes': 1, 'No': 0})
    print('Memorable column converted to binary values.')

    if dataset['Memorable'].isnull().any():
        raise ValueError('Memorable column contains missing values.')

    # Split dataset into train and validation sets (80/20, fixed seed)
    train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

    # Convert to Hugging Face datasets without carrying over the pandas index
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

except FileNotFoundError:
    print(f"Error: The file at {file_path} was not found. Please check the file path.")
    raise
except pd.errors.EmptyDataError:
    # Must stay ahead of the ValueError handler: EmptyDataError subclasses it.
    print("Error: The CSV file is empty or corrupt.")
    raise
except (KeyError, ValueError) as e:
    print(f"Error: {e}")
    raise
except Exception as e:
    # Catch all other exceptions which could occur
    print(f"An unexpected error occurred: {e}")
    raise


## Step 2: Tokenize the Dataset

Use the model's tokenizer to convert each quote into the token IDs and attention mask that the model expects as input.


In [None]:
model_name = "unsloth/llama-3-8b-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the 'Quote' entries of a batch of examples.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``;
            must contain a 'Quote' entry with the quote texts.

    Returns:
        The tokenizer output for the batch, padded with
        ``padding="max_length"`` and truncated.

    Raises:
        KeyError: If the batch has no 'Quote' entry.
    """
    if 'Quote' not in examples:
        raise KeyError('Error: Column Quote not found in Dataset.')
    # NOTE(review): no explicit max_length is passed, so padding/truncation
    # fall back to the tokenizer's own maximum input length - confirm that
    # this is intended, since it may pad short quotes far more than needed.
    return tokenizer(examples["Quote"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
# Map over val_dataset itself so the validation split stays held-out data
# rather than becoming a second copy of the training split.
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Expose only the tensors and the label column, as torch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "Memorable"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "Memorable"])

## Step 3: Fine-Tune the Model

We'll fine-tune the pre-trained model using the tokenized dataset.


In [None]:
# Maximum sequence length given to the model. The quotes are short, so 512
# should be more than sufficient.
max_seq_length = 512

# Optimisation settings
learning_rate = 3e-5
warmup_steps = 100
number_of_epochs = 30

# Batching settings
per_device_train_batch_size = 8
per_device_eval_size = 4
gradient_accumulation_steps = 2
number_of_devices = 1

# Examples consumed per optimizer step:
# batch size x accumulation steps x devices
examples_per_optimizer_step = (
    per_device_train_batch_size
    * gradient_accumulation_steps
    * number_of_devices
)

# Floor division: a trailing partial step is not counted.
number_of_training_examples = len(train_dataset)
steps_per_epoch = number_of_training_examples // examples_per_optimizer_step

# Load the 4-bit base model through Unsloth.
try:
    # Only the first returned value (the model) is kept; the second one is
    # discarded because a tokenizer was already created with AutoTokenizer.
    model, _ = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )
except Exception as e:
    # Chain explicitly so the original traceback is kept as the direct cause.
    raise RuntimeError(f"Error loading model {model_name}: {e}") from e
else:
    print("Model loaded successfully.")

# Apply fast LoRA weights and model patching.
# Adapters (rank 16, alpha 16, no dropout, no bias) are attached to the
# attention and MLP projection modules listed in target_modules.
try:
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        max_seq_length=max_seq_length,
        use_rslora=False,
        loftq_config=None,
    )
except Exception as e:
    # Chain explicitly so the original traceback is kept as the direct cause.
    raise RuntimeError(f"Error applying LoRA weights to the model: {e}") from e
else:
    print("LoRA weights applied successfully.")

# Define training arguments.
# Training length is expressed in optimizer steps (max_steps) rather than
# epochs: steps_per_epoch * number_of_epochs.
try:
    # Query hardware support once; bf16 is used when available, fp16 otherwise.
    use_bf16 = is_bfloat16_supported()
    training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        per_device_eval_batch_size=per_device_eval_size,
        learning_rate=learning_rate,
        warmup_steps=warmup_steps,
        max_steps=steps_per_epoch * number_of_epochs,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=50,
        output_dir=output_dir,
        optim="adamw_8bit",
        seed=3407,
    )
except Exception as e:
    # Chain explicitly so the original traceback is kept as the direct cause.
    raise ValueError(f"Error in initializing training arguments: {e}") from e
else:
    print("Training arguments initialized successfully.")

# Initialize the trainer with the LoRA-patched model, both dataset splits,
# the tokenizer and the training arguments.
try:
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        args=training_args,
        dataset_text_field='Quote',
    )
except Exception as e:
    # Chain explicitly so the original traceback is kept as the direct cause.
    raise RuntimeError(f"Error initializing trainer: {e}") from e
else:
    print("Trainer initialized successfully.")

# Train the model
trainer.train()

## Step 4: Clean Up and Save


In [None]:
# Save fine-tuned model and tokenizer side by side in output_dir
try:
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
except Exception as e:
    # Chain explicitly so the original traceback is kept as the direct cause.
    raise RuntimeError(f"Error saving model or tokenizer to {output_dir}: {e}") from e
else:
    print("Tokenizer and model saved successfully.")

# Save the validation split (the pandas frame, before tokenization) as CSV
# in output_dir.
try:
    # Create the output directory if needed; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Save validation dataset as CSV
    val_df.to_csv(os.path.join(output_dir, 'val_dataset.csv'), index=False)
except PermissionError as e:
    raise PermissionError(f"Permission denied: unable to write to {output_dir}.") from e
except FileNotFoundError as e:
    raise FileNotFoundError(f"Directory {output_dir} not found or cannot be created.") from e
except Exception as e:
    raise RuntimeError(f"Error saving validation dataset to {output_dir}: {e}") from e
else:
    print("Validation dataset saved successfully.")