# Fine Tuning a model in 8 bits

In [None]:
# Importing the required libraries
import os
from dotenv import load_dotenv
from transformers import logging, pipeline
import wandb
from huggingface_hub import login
# Load environment variables from .env file
load_dotenv()

# Access the Hugging Face secret key
hugging_face_secret = os.getenv('HUGGING_FACE_SECRET')
wandb_api_key = os.getenv('WANDB_API_KEY')

# Login to wandb to log the metrics
wandb.login(key = wandb_api_key)

# Login to huggingface
login(hugging_face_secret)

2024-07-19 06:04:34.190222: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[34m[1mwandb[0m: Currently logged in as: [33mjcarmona[0m ([33mjcarmona-[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Environment variables
PROJECT = "Fine-Tuning CAF"

## Fine Tune in 8 bits

In [None]:
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
from datasets.arrow_dataset import Dataset
import torch
from datasets import load_dataset, load_from_disk
from peft import LoraConfig
from peft import AutoPeftModelForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
)
from trl import SFTTrainer


torch.manual_seed(42)


@dataclass
class ScriptArguments:
    """
    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
    """

    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})

    per_device_train_batch_size: Optional[int] = field(default=10)
    per_device_eval_batch_size: Optional[int] = field(default=4)
    gradient_accumulation_steps: Optional[int] = field(default=17)
    learning_rate: Optional[float] = field(default=3e-5)
    max_grad_norm: Optional[float] = field(default=1.0)
    weight_decay: Optional[float] = field(default=0.01)
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=8)
    max_seq_length: Optional[int] = field(default=256)
    model_name: Optional[str] = field(
        default="meta-llama/Meta-Llama-3-8B-Instruct",
        metadata={
            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        }
    )
    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "The preference dataset to use."},
    )
    dataset_path: Optional[str] = field(
        default="train",
        metadata={"help": "The local path to the dataset."},
    )

    use_4bit: Optional[bool] = field(
        default=False,
        metadata={"help": "Activate 4bit precision base model loading"},
    )
    use_nested_quant: Optional[bool] = field(
        default=False,
        metadata={"help": "Activate nested quantization for 4bit base models"},
    )
    bnb_4bit_compute_dtype: Optional[str] = field(
        default="float16",
        metadata={"help": "Compute dtype for 4bit base models"},
    )
    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4",
        metadata={"help": "Quantization type fp4 or nf4"},
    )
    num_train_epochs: Optional[int] = field(
        default=3,
        metadata={"help": "The number of training epochs for the reward model."},
    )
    fp16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables fp16 training."},
    )
    bf16: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables bf16 training."},
    )
    packing: Optional[bool] = field(
        default=False,
        metadata={"help": "Use packing dataset creating."},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        default="adamw_torch",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="cosine",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=200, metadata={"help": "How many optimizer update steps to take"})
    warmup_steps: int = field(default=20, metadata={"help": "# of steps to do a warmup for"})
    group_by_length: bool = field(
        default=True,
        metadata={
            "help": "Group sequences into batches with same length. Saves memory and speeds up training considerably."
        },
    )
    save_steps: int = field(default=200, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=5, metadata={"help": "Log every X updates steps."})
    merge_and_push: Optional[bool] = field(
        default=True,
        metadata={"help": "Merge and push weights after training"},
    )
    output_dir: str = field(
        default="./results_packing",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )
    report_to: Optional[str] = field(
        default="wandb",
        metadata={"help": "The integration to report the results and logs to."},
    )

parser = HfArgumentParser(ScriptArguments)

# Remove the Jupyter-specific arguments
sys.argv = sys.argv[:1]
script_args = parser.parse_args_into_dataclasses()[0]
# Initialize wandb
wandb.init(project=PROJECT, config=vars(script_args))


def gen_batches_train():
    """
    Generator function that yields batches of data for training.
    """
    if script_args.dataset_path:
        ds = load_from_disk(script_args.dataset_path)
    else:
        ds = load_dataset(script_args.dataset_name, streaming=True, split="train")

# Prompt example

#         p = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

# Eres un asistente intelegente<|eot_id|><|start_header_id|>user<|end_header_id|>

# hola<|eot_id|>"""
    # Iterate over the dataset
    for sample in iter(ds):
        # Extract instruction and input from the sample
        instruction = str(sample['instruction'])
        out_text = str(sample['output'])
        # Format the prompt
        formatted_prompt = (
            f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
            f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n Classify the sentence into: 'Info','Functional', 'Regulatory', 'Management', 'Operational', 'Technical' or 'Maintenance'. Sentence: \n{instruction} \n\n### Response:\n"
            f"<|eot_id|><|start_header_id|>asssitant<|end_header_id|>\n\n",
            f"{str(out_text)}"
            f"<|eot_id|><|end_of_text|>"
        )
        formatted_prompt = "".join(formatted_prompt)
        yield {'text': formatted_prompt}

def create_and_prepare_model(args):
    """ Create and prepare the model for training.

    Args:
        args: Arguments for the model.

    Returns:
        model: The model to train.
        peft_config: The configuration for the PEFT model.
        tokenizer: The tokenizer for the model.
    """
    # Load the model with the specified configuration for quantization
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        load_in_4bit=args.use_4bit,
        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=getattr(torch, args.bnb_4bit_compute_dtype),
        bnb_4bit_use_double_quant=args.use_nested_quant,
    )

    # Load the entire model on the GPU 0
    # switch to `device_map = "auto"` for multi-GPU
    device_map = {"": 0}

    # Load the model
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name, 
        quantization_config=bnb_config, 
        device_map=device_map, 
        use_auth_token=True,
    )
    
    # Set the configuration for the PEFT model
    peft_config = LoraConfig(
        lora_alpha=script_args.lora_alpha,
        lora_dropout=script_args.lora_dropout,
        r=script_args.lora_r,
        bias="none",
        task_type="CAUSAL_LM", 
        target_modules=['q_proj', 'v_proj'],
    )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(script_args.model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    return model, peft_config, tokenizer

# Set up the training arguments
training_arguments = TrainingArguments(
    output_dir=script_args.output_dir, # The output directory
    per_device_train_batch_size=script_args.per_device_train_batch_size, # The batch size per GPU
    gradient_accumulation_steps=script_args.gradient_accumulation_steps, # The number of gradient accumulation steps
    optim=script_args.optim, # The optimizer to use
    save_steps=script_args.save_steps, # Save a checkpoint every X updates steps
    logging_steps=script_args.logging_steps, # Log every X updates steps
    learning_rate=script_args.learning_rate, # The learning rate
    fp16=script_args.fp16, # Enable fp16 training
    bf16=script_args.bf16, # Enable bf16 training
    max_grad_norm=script_args.max_grad_norm, # The maximum gradient norm
    max_steps=script_args.max_steps, # The maximum number of optimizer update steps
    warmup_steps=script_args.warmup_steps, # The number of steps to do a warmup for
    group_by_length=script_args.group_by_length, # Group sequences into batches with same length
    lr_scheduler_type=script_args.lr_scheduler_type, # The learning rate schedule
    report_to=script_args.report_to, # The integration to report the results and logs to
    gradient_checkpointing=script_args.gradient_checkpointing, # Enable gradient checkpointing
)

# Set up the model, PEFT configuration, and tokenizer
model, peft_config, tokenizer = create_and_prepare_model(script_args)
# Create the training generator
train_gen = Dataset.from_generator(gen_batches_train)
# Set the padding side
tokenizer.padding_side = "right"
# Create the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_gen,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=script_args.max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=script_args.packing,
)
# Train the model
trainer.train()
# Save the model
if script_args.merge_and_push:
    # Save the final model
    output_dir = os.path.join(script_args.output_dir, "final_checkpoints")
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    del model
    torch.cuda.empty_cache()
    # Load the model for merging
    model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
    model = model.merge_and_unload()
    # Save the merged model
    output_merged_dir = os.path.join(script_args.output_dir, "final_merged_checkpoint")
    model.save_pretrained(output_merged_dir, safe_serialization=True)




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
5,4.9604
10,5.0398
15,5.0848
20,4.9248
25,4.5802
30,4.5215
35,4.1636
40,3.8469
45,3.5948
50,3.3596


### Prompt example and inference

In [18]:
p = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Eres un asistente intelegente<|eot_id|><|start_header_id|>user<|end_header_id|>

hola<|eot_id|>"""


In [19]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)


Function to make inference of the fine tuned model

In [None]:
def llm(prompt):  
    """ LLM inference function to give response to the user prompt.

    Args:
        prompt (str): The user prompt.

    Returns:
        str: The response to the user prompt.
    """
    p = (
        f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n Classify the sentence into: 'Info','Functional', 'Regulatory', 'Management', 'Operational', 'Technical' or 'Maintenance'. Sentence: \n{prompt} \n\n### Response:\n"
        f"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )


    marcador = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    
    # Encontrar la posición del marcador en el texto
    

    
    # Devolver el contenido después del marcador
    result = pipe(p)
    result = result[0]['generated_text']
    posicion = result.find(marcador)
    return result[posicion + len(marcador)+2:] 

In [62]:
res = llm("IT-systems")

In [63]:
print(res)

Management


# Prueba de ejecución

In [66]:
test = pd.read_csv("test.csv")
for i in range(10):
    print("Instruction: ", test['instruction'][i], "\nOutput: ", test['output'][i], "\nInference Output: ", llm(test['instruction'][i]))
    print("-----------------------------------------------")

Instruction:  Systems Test Completion Reviews (TCRs): 
Output:  Info 
Inference Output:  Management
-----------------------------------------------
Instruction:  Without derogating from the Project Company’s obligations under and pursuant to the Agreement, the Project Company shall ensure that: 
Output:  Management 
Inference Output:  Management
-----------------------------------------------
Instruction:  Additional and/or alternative control measures and actions shall be implemented as are necessary and required to limit dust, prevent air pollution and preserve the quality of the air. 
Output:  Technical 
Inference Output:  Operational
-----------------------------------------------
Instruction:  The Project Company shall submit to the Owner a SDP as part of the SRR. 
Output:  Management 
Inference Output:  Technical
-----------------------------------------------
Instruction:  Reflect and conform to: (i) the Project Timeline and Milestones stipulated in Appendix B to the Agreement a

# Test Results

In [64]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

def test():
    """ Test the model on the test dataset.
    """
    test_df = pd.read_csv("test.csv")
    results = test_df['output']
    text = test_df['instruction']
    resultados = []
    
    for i in tqdm(text, desc="Processing"):
        response = llm(i)
        res = response
        resultados.append(res)
    
    # Filtrar resultados que son None
    filtered_results = [res for res in resultados if res is not None]
    filtered_actual = [res for i, res in enumerate(results.tolist()) if resultados[i] is not None]
    
    print(filtered_results)
    print(filtered_actual)  # Convierte la serie de pandas a una lista para que se vea igual que resultados
    
    # Compute accuracy and F1 score
    accuracy = accuracy_score(filtered_actual, filtered_results)
    f1 = f1_score(filtered_actual, filtered_results, average='weighted')
    
    # Print the results
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")

test()


Processing: 100%|██████████| 102/102 [00:07<00:00, 13.44it/s]

['Management', 'Management', 'Management', 'Management', 'Management', 'Management', 'Management', 'Management', 'Technical', 'Technical', 'Management', 'Management', 'Technical', 'Management', 'Management', 'Management', 'Management', 'Technical', 'Technical', 'Management', 'Info', 'Info', 'Management', 'Management', 'Management', 'Management', 'Technical', 'Technical', 'Management', 'Info', 'Technical', 'Technical', 'Management', 'Info', 'Info', 'Management', 'Management', 'Management', 'Info', 'Technical', 'Management', 'Technical', 'Technical', 'Functional', 'Management', 'Management', 'Technical', 'Management', 'Management', 'Info', 'Management', 'Management', 'Technical', 'Management', 'Management', 'Technical', 'Management', 'Management', 'Management', 'Technical', 'Management', 'Management', 'Management', 'Info', 'Management', 'Management', 'Technical', 'Technical', 'Management', 'Technical', 'Management', 'Management', 'Info', 'Management', 'Management', 'Technical', 'Manageme




# Prompting vs Fine-Tuning with prompt

In [1]:
import requests

def original_model(text_input, endpoint_url:str = "http://172.16.59.1:8000/v2/models/ensemble/generate"):
    """ Function to generate text using the original model.

    Args:
        text_input (str): The input text.
        endpoint_url (str, optional): Endpoint URL for the model. Defaults to "http://172.16.59.1:8000/v2/models/ensemble/generate".

    Returns:
        str: The generated text.
    """
    headers = {
        "Content-Type": "application/json"
    }
    
    p = (
        f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        f"Below is an instruction that describes a task. Answer with an unique word.\n\n### Instruction:\n Classify the sentence into: 'Info','Functional', 'Regulatory', 'Management', 'Operational', 'Technical' or 'Maintenance'. Sentence: \n{text_input} \n\n### Response:\n"
        f"<|eot_id|><|start_header_id|>asssitant<|end_header_id|>\n\n"
    )
    payload = {
        "text_input": p,
        "parameters": {
            "max_tokens": 1024,
            "bad_words": [""],
            "stop_words": [""]
        }
    }
    
    response = requests.post(endpoint_url, json=payload, headers=headers)
    
    if response.status_code == 200:
        return response.json()["text_output"]
    else:
        response.raise_for_status()


In [2]:
def prompting():
    """Function to generate text using the original model."""    
    test_df = pd.read_csv("test.csv")
    results = test_df['output']
    text = test_df['instruction']
    resultados = []
    
    for i in tqdm(text, desc="Processing"):
        response = original_model(i)
        res = response
        resultados.append(res)
    
    # Filtrar resultados que son None
    filtered_results = [res for res in resultados if res is not None]
    filtered_actual = [res for i, res in enumerate(results.tolist()) if resultados[i] is not None]
    
#     print(filtered_results)
    
    # Compute accuracy and F1 score
    accuracy = accuracy_score(filtered_actual, filtered_results)
    f1 = f1_score(filtered_actual, filtered_results, average='weighted')
    
    # Print the results
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    return filtered_results
    


In [9]:
resultados = prompting()

Processing: 100%|██████████| 102/102 [00:20<00:00,  5.00it/s]

Accuracy: 0.0
F1 Score: 0.0





In [10]:
test = pd.read_csv("test.csv")
for i in range(10):
    print("Instruction: ", test['instruction'][i], "\nOutput: ", test['output'][i], "\nInference Output: ", original_model(test['instruction'][i]))
    print("")

Instruction:  All training courses shall be presented in English (except where the training instructor and all trainees are Hebrew speakers) by a member of the Project Company’s training staff. 
Output:  Management 
Inference Output:  **Unique Word:** **Categorization**

Instruction:  The Uninterruptible Power System (UPS) shall comply with the following input requirements; 
Output:  Info 
Inference Output:  **Classification:** Technical

Instruction:  Secure communication protocol for remote access administration (e.g. SSH), and SNMP v3. 
Output:  Technical 
Inference Output:  **Authentication**

Instruction:  Time synchronization; and 
Output:  Management 
Inference Output:  **Classification:** Technical

Instruction:  Occupational hygiene; 
Output:  Technical 
Inference Output:  **Hygienic**

Instruction:  The Project Company shall inform the Owner at least forty-eight (48) hours prior to the performance of each Assessment Point. 
Output:  Management 
Inference Output:  **Unique Wor