This notebook was executed in Google Colab using A100-GPU

### Start of execution

In [1]:
import time

In [2]:
start = time.time()

# 1. Setting the environment

In [3]:
!pip install -q datasets==2.20.0

In [4]:
!pip install -q bitsandbytes==0.43.1

In [5]:
!pip install -q peft==0.11.1

In [6]:
!pip install -q trl==0.9.6

# 2. Import Libraries

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
import os
import time
import scipy
import torch
import numpy as np
import transformers
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
from trl import SFTTrainer
import bitsandbytes as bnb
from datasets import Dataset
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib import image as mpimg
from peft import LoraConfig, PeftConfig
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split

from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)

from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)

In [9]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 3. Preparation

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
from google.colab import userdata

In [12]:
HUGGING_FACE_TOKEN = userdata.get('HUGGING_FACE_TOKEN')

In [13]:
path_general = 'drive/MyDrive/Profesional_Academico/Github_Personal/ML_AI_Contents/09.Deep_Learning/46.Finetunning_LLM_Hugging_Face'

# 4. Functions

In [14]:
def generate_prompt(text, sentiment):
    return f"""
            [INST]Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative" [/INST]

            [{text}] = {sentiment}
            """.strip()

In [15]:
def generate_test_prompt(text):
    return f"""
            [INST]Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative" [/INST]

            [{text}] = """.strip()

# 5. Creation of datasets

In [16]:
df = pd.read_csv(f'{path_general}/data/data.csv')

In [17]:
df_train, df_val = train_test_split(df, test_size = 0.40, random_state = 42)

In [18]:
df_val, df_test = train_test_split(df_val, test_size = 0.50, random_state = 42)

In [19]:
df_train.shape

(2907, 2)

In [20]:
df_val.shape

(969, 2)

In [21]:
df_test.shape

(970, 2)

In [22]:
df_train.to_csv(f'{path_general}/datasets/df_train.csv', index = False)

In [23]:
df_val.to_csv(f'{path_general}/datasets/df_val.csv', index = False)

In [24]:
df_test.to_csv(f'{path_general}/datasets/df_test.csv', index = False)

In [25]:
X_train = pd.DataFrame(df_train.apply(lambda x: generate_prompt(x['text'], x['sentiment']), axis = 1)).rename(columns = {0: 'text'})
y_train = df_train['sentiment']

In [26]:
X_val = pd.DataFrame(df_val.apply(lambda x: generate_prompt(x['text'], x['sentiment']), axis = 1)).rename(columns = {0: 'text'})
y_val = df_val['sentiment']

In [27]:
train_data = Dataset.from_pandas(X_train)
val_data = Dataset.from_pandas(X_val)

# 6. Load model

In [28]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

In [29]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    load_4bit_use_double_quant = False,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
    )

Unused kwargs: ['load_4bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [30]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    device_map = "auto",
    token = HUGGING_FACE_TOKEN
    )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
model.config.use_cache = False
model.config.pretraining_tp = 1

In [32]:
tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          trust_remote_code = True,
                                          padding_side = 'left',
                                          add_bos_token = True,
                                          add_eos_token = True,
                                          token = HUGGING_FACE_TOKEN
                                          )

In [33]:
tokenizer.pad_token = tokenizer.eos_token

# 7. Training

In [34]:
num_epochs = 2

In [35]:
peft_config = LoraConfig(
    lora_alpha = 16,
    lora_dropout = 0.1,
    r = 64,
    bias = "none",
    task_type = "CAUSAL_LM"
    )

In [36]:
training_arguments = TrainingArguments(
    output_dir = "logs",
    num_train_epochs = num_epochs,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 8,
    optim = "paged_adamw_32bit",
    save_steps = 0,
    logging_steps = 25,
    learning_rate = 2e-4,
    weight_decay = 0.001,
    fp16 = True,
    bf16 = False,
    max_grad_norm = 0.3,
    max_steps = -1,
    warmup_ratio = 0.03,
    group_by_length = True,
    lr_scheduler_type = "cosine",
    report_to = "tensorboard",
    evaluation_strategy = "epoch"
    )

In [37]:
trainer = SFTTrainer(
    model = model,
    train_dataset = train_data,
    eval_dataset = val_data,
    peft_config = peft_config,
    dataset_text_field = "text",
    tokenizer = tokenizer,
    args = training_arguments,
    packing = False,
    max_seq_length = 1024
    )

Map:   0%|          | 0/2907 [00:00<?, ? examples/s]

Map:   0%|          | 0/969 [00:00<?, ? examples/s]

In [38]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.6052,0.79134
1,0.5891,0.770921


TrainOutput(global_step=726, training_loss=0.792697909696043, metrics={'train_runtime': 1622.8642, 'train_samples_per_second': 3.583, 'train_steps_per_second': 0.447, 'total_flos': 2.580890252004557e+16, 'train_loss': 0.792697909696043, 'epoch': 1.997936016511868})

# 8. Guardar modelo

In [39]:
trainer.save_model(f'{path_general}/model')

### End of execution

In [40]:
end = time.time()

delta = (end - start)

hours = int(delta/3_600)
mins = int((delta - hours*3_600)/60)
secs = int(delta - hours*3_600 - mins*60)

print(f'Hours: {hours}, Minutes: {mins}, Seconds: {secs}')

Hours: 0, Minutes: 27, Seconds: 54
