In [1]:
# ! pip install pyarrow==14.0.1
# ! pip install cudf-cu12
# ! pip install ibis-framework
# ! pip install -q -U jsonlines datasets transformers accelerate peft bitsandbytes wandb

In [None]:
import jsonlines
import itertools
import pandas as pd
import numpy as np
from pprint import pprint

import datasets
from datasets import load_dataset

import os
import torch
import time

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer, 
    TrainingArguments,
    Trainer, 
    GenerationConfig,
    pipeline #permet de mettre en place, plusiers ppipeline donc le tokenizer et les promts ainsi que le models
)



In [3]:
Device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(Device) #visualiser le divice utiliser

cuda


Data preparation
link:https://huggingface.co/datasets/prsdm/MedQuad-phi2-1k

In [4]:
dataset = load_dataset("yahma/alpaca-cleaned", split="train", streaming=True) #fr

In [5]:
print(dataset) 

IterableDataset({
    features: ['output', 'input', 'instruction'],
    num_shards: 1
})


In [6]:
n = 10
print("Dataset:")
top_n = itertools.islice(dataset, n)
for i in top_n:
    print(i)


Dataset:
{'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.', 'input': '', 'instruction': 'Give three tips for staying healthy.'}
{'output': 'The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors 

In [7]:
# Convert the dataset in DataFrame
dat=pd.json_normalize(dataset) 

In [8]:
#Visualization
dat.head() 

Unnamed: 0,output,input,instruction
0,1. Eat a balanced and nutritious diet: Make su...,,Give three tips for staying healthy.
1,"The three primary colors are red, blue, and ye...",,What are the three primary colors?
2,An atom is the basic building block of all mat...,,Describe the structure of an atom.
3,There are several ways to reduce air pollution...,,How can we reduce air pollution?
4,I had to make a difficult decision when I was ...,,Pretend you are a project manager of a constru...


**Promt template**

In [9]:
promt_template_whith_input = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:
{output}

"""

#aAdd additional prompt to only take the intruction and output
promt_template_without_input = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}

### Response:
{output}
"""


In [10]:
def format_dataset(dataset,template_whith_input,template_without_input):
    formatted_data = []
    for entry in dataset:
        if entry["input"]:
          # Apply the input prompt template
            formatted_entry = template_whith_input.format(instruction=entry["instruction"],input=entry["input"],output=entry["output"])
        else:
          # Apply prompt template without input
            formatted_entry = template_without_input.format(instruction=entry["instruction"],output=entry["output"])
        formatted_data.append(formatted_entry)
    return formatted_data


In [11]:
#Formater ou refactoriser nos donnees du data
formatted_data = format_dataset(dataset,promt_template_whith_input,promt_template_without_input)

In [15]:
# print(formatted_data[0])

In [14]:
# for item in formatted_data:
#     print(item)
#     print()

# Fine-Tunig
The fine-tuning, is one of the most popular techniques using in Geneative AI to allow Large Lnaguage models learn new structures and pattern from new input, outpout samples.
Even Large Lnaguage models has been trained on large amount of data, there are private data, means data that LLMs doesn't have access, fine-tuning LLMs helps to add new data (private or others), can by guiding form strong prompt engineering techniques, reduce Hallucinations.

**Papers**
- Fine-tuning LLMs,Zero-shot Learners: https://arxiv.org/pdf/2109.01652
- Fine-tuning Large Language Models with Human-inspired Learning Strategies in Medical Question Answering : https://arxiv.org/abs/2408.07888

**Dataset**
- liens:https://huggingface.co/datasets/prsdm/MedQuad-phi2-1k

In [14]:
dataset = load_dataset("prsdm/MedQuad-phi2-1k", split="train")

In [15]:
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 1000
})


In [16]:
n = 10
print("dataset:")
top_n = itertools.islice(dataset, n)
for i in top_n:
    print(i)


dataset:
{'text': '### Instruction: How to prevent Lung Cancer ? ### Assistant: Key Points\n                    - Avoiding risk factors and increasing protective factors may help prevent lung cancer.    - The following are risk factors for lung cancer:         - Cigarette, cigar, and pipe smoking      - Secondhand smoke     - Family history     - HIV infection     - Environmental risk factors     - Beta carotene supplements in heavy smokers        - The following are protective factors for lung cancer:         - Not smoking     - Quitting smoking     - Lower exposure to workplace risk factors      - Lower exposure to radon        - It is not clear if the following decrease the risk of lung cancer:         - Diet     - Physical activity        - The following do not decrease the risk of lung cancer:         - Beta carotene supplements in nonsmokers     - Vitamin E supplements         - Cancer prevention clinical trials are used to study ways to prevent cancer.    -  New ways to prevent 

## Model Download

In [None]:
from types import new_class
# model
base_model = "microsoft/phi-2" 
new_model = "phi-2-bedoo96" 

#tokeniser
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token 
tokenizer.padding_side="right"


## Training

In [18]:
from dotenv import load_dotenv
import os

# Load environment variables from a .env file
load_dotenv()
token = os.getenv('HF_TOKEN')  

# Check if the token was retrieved successfully
if token:
    print("Token loaded successfully")
else:
    print("Token not found")

Token loaded successfully


In [19]:
#quantization du model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, #on charge
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,)


# Intialize model
model = AutoModelForCausalLM.from_pretrained(
    base_model,  # model calling
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map={"": 0},  # GPU usage)
)


model.config.use_cache = False
model.config.pretraining_tp = 1 

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.55s/it]


In [20]:
# Model architecture Visualization
print(model) 

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_la

In [21]:
# print number of total trainable parameters
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(total_trainable_params )

262364160


## LoRA Configuration
LoRA (Low-Rank Adaptation) is a technique for fine-tuning large pre-trained models efficiently by adjusting low-rank matrices, reducing computational cost while maintaining performance. Key parameters in LoRA configuration include:

* **Rank:** Defines the dimensionality of low-rank matrices. A higher rank provides more flexibility, while a lower rank reduces computational cost but may limit capacity.
* **Alpha:** A scaling factor that controls the influence of low-rank matrices on model parameters. A higher alpha increases their impact on the model.
* **Bias:** Includes or excludes a bias term in the adaptation, helping the model adjust for shifts in data distribution.
* Task Type: Specifies the task (e.g., classification, regression) for which the adaptation is being applied, aligning model adjustments to the specific requirements of the task.
Target Modules: Specifies the layers or components of the model to adapt. This helps focus fine-tuning on the most important parts of the model, reducing unnecessary computational effort

In [22]:
from peft import LoraConfig, prepare_model_for_kbit_training

# Supposons que vous avez déjà un modèle chargé
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


In [23]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [24]:
from peft import LoraConfig, get_peft_model

# Configuration de LoRA
perf_config = LoraConfig(
    r=64, #nombre de rang
    lora_alpha=16, #
    lora_dropout=0.05, #
    bias="none", #
    task_type="CAUSAL_LM", 
    # Specifies which layers of the model to adapt during fine-tuning
    target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','dense','fc1','fc2',]
    )

# Applying the LoRA configuration to the model
model = get_peft_model(model, perf_config)
print_trainable_parameters(model)

trainable params: 94371840 || all params: 1615764480 || trainable%: 5.840692821765707


## Training Steps

In [25]:
new_model = 'phi-22-bedoo96'

In [26]:
# parameter configuration
training_arguments = TrainingArguments(
    output_dir="./phi-2-result",
    num_train_epochs=1,
    per_device_train_batch_size=4, 
    gradient_accumulation_steps=1,

    gradient_checkpointing=True,
    max_grad_norm=0.3,

    learning_rate=2e-4,
    weight_decay=0.001,
    optim="paged_adamw_8bit",
    lr_scheduler_type="linear",
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    save_steps=0,
    logging_steps=15,
    run_name='phi-22-bedoo96',
    overwrite_output_dir=True,
    report_to="wandb"
)

In [29]:
from trl import SFTTrainer
# Set of supervisor parameter
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset,
    peft_config=perf_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

#Train model
trainer.train()

# #save model
trainer.save_model(new_model)




Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mverneus96obed[0m ([33mverneus96obed-student[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
15,1.6331
30,1.3615
45,1.259
60,1.1798
75,1.1795
90,1.1192
105,1.139
120,1.1154
135,1.1683
150,1.1912


In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path to the saved model
model_path = "phi-22-bedoo96"

# Load the model on GPU with torch_dtype set to float16
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
   
)
# Ensure the model is fully on GPU
if torch.cuda.is_available():
    model = model.to("cuda")
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

print("Model loaded successfully on GPU.")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully on GPU.


In [7]:
tokenizer = AutoTokenizer.from_pretrained("phi-22-bedoo96")

In [13]:

user_input = input(" What are the treatments for Primary CNS Lymphoma ")    
# Tokenize the input and move it to the same device as the model
inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True, max_length=512)
inputs = {key: value.to("cuda") for key, value in inputs.items()}  # Move tensors to GPU    
# Generate a response
output = model.generate(**inputs, max_length=100, num_return_sequences=1)    
# Decode the output
response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Model: " + response)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Model:  What are the treatments for Primary CNS Lymphoma?
                - What are the treatments for Secondary CNS Lymphoma?
                - What are the treatments for CNS Lymphoma in Children?
                - What are the treatments for CNS Lymphoma in Adults?
                - What are the treatments for CNS Lymphoma in Older Adults?
                - What are the treatments for CNS Lymphoma in People with HIV/AIDS?
                - What are the treatments for CNS L
