In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pandas numpy scikit-learn datasets transformers accelerate peft bitsandbytes spacy evaluate
!python -m spacy download fr_core_news_sm
!pip install --quiet \
  datasets \

  ray[tune] \
  torch

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import re
import json
import numpy as np
import pandas as pd
import torch
import spacy
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import evaluate
from huggingface_hub import login

# Login to Hugging Face
login(token="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")

# Load ROUGE metric
# Define GPU cleanup function
def clear_gpu():
    torch.cuda.empty_cache()

# Load spaCy model for French lemmatization
nlp = spacy.load("fr_core_news_sm")

In [None]:
# Define text cleaning and lemmatization functions
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def lemmatize_text(text):
    if not text:
        return ""
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [None]:
# Define functions to build prompt and target
def build_prompt(row):
    nom = lemmatize_text(clean_text(row["Nom du projet"]))
    description = lemmatize_text(clean_text(row["Description"]))
    duree = row["Durée (mois)"]
    complexite = row["Complexité (1-5)"]
    secteur = clean_text(row["Secteur"])
    taches = row["Tâches Identifiées"]
    prompt = (f"Nom du projet: {nom}\n"
              f"Description: {description}\n"
              f"Durée (mois): {duree}\n"
              f"Complexité (1-5): {complexite}\n"
              f"Secteur: {secteur}\n"
              f"Tâches Identifiées: {taches}\n\n"
              "### Instruction:\n"
              "Fournis les informations en format JSON pour:\n"
              "- Compétences Requises\n"
              "- Employés Alloués\n"
              "- Répartition par Compétences\n\n"
              "### Réponse:\n")
    return prompt

def build_target(row):
    comp_str = row["Compétences Requises"]
    competences = [x.strip() for x in comp_str.split(",")] if isinstance(comp_str, str) else []
    employes = row["Employés Alloués"]
    repart_str = row["Répartition par Compétences"]
    repartition = {}
    if isinstance(repart_str, str):
        for item in repart_str.split(","):
            if ":" in item:
                key, value = item.split(":", 1)
                repartition[key.strip()] = int(value.strip())
    target_dict = {
        "Compétences Requises": competences,
        "Employés Alloués": employes,
        "Répartition par Compétences": repartition
    }
    return json.dumps(target_dict, ensure_ascii=False)


In [None]:
# Load and prepare dataset
df = pd.read_csv("dataset_final_fusionne.csv")
df["prompt"] = df.apply(build_prompt, axis=1)
df["target"] = df.apply(build_target, axis=1)
df["full_text"] = df["prompt"] + df["target"]

# Split into train and validation sets for tuning
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df[["prompt", "target", "full_text"]])
val_dataset = Dataset.from_pandas(val_df[["prompt", "target", "full_text"]])
print(val_dataset[0])
hf_dataset = Dataset.from_pandas(df[["prompt", "target", "full_text"]])  # Full dataset for final training

{'prompt': "Nom du projet: système de gestion intelligent de espace vert urbain\nDescription: ce projet vise à concevoir et implémenter un plateform numérique intégrer pour optimiser le gestion , le maintenance et le planification de espace vert au sein de un municipalité . le système comprendre un plateform web pour le administrateur et planificateur , ainsi que un application mobile pour le équipe de terrain . il agrègerer un donnée provenir de source hétérogène : base de donnée sig existant ( localisation , type de végétation ) , donnée de capteur iot ( humidité de sol , température ) , observation manuel ( état de santé , besoin de arrosage , présence de nuisible ) , et donnée externe ( météo ) . le défi technique majeur inclure le intégration et le normalisation de ce flux de donnée diversifier , le développement de modèle de machine learning pour prédir le santé de plante , le besoin en eau ou le risque de maladie , le mise en place de un cartographie interactif performante ( sig

In [None]:
# Define model and configurations
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, token="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")
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Alternative: Simpler approach that ensures target is always present
def tokenize_function_simple(example):
    """
    Simplified tokenization that directly uses prompt and target fields
    """
    prompt = example["prompt"]
    target = example["target"]

    # Combine prompt and target
    full_text = prompt + target

    # Tokenize the full text
    tokenized = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=1024,
        return_tensors=None
    )

    # Tokenize just the prompt to know where to mask
    prompt_tokenized = tokenizer(
        prompt,
        truncation=True,
        padding=False,
        return_tensors=None
    )

    # Create labels - mask prompt tokens, keep target tokens
    labels = tokenized["input_ids"].copy()
    prompt_length = len(prompt_tokenized["input_ids"])

    # Mask prompt tokens
    for i in range(min(prompt_length, len(labels))):
        labels[i] = -100

    # Mask padding tokens
    for i, token_id in enumerate(tokenized["input_ids"]):
        if token_id == tokenizer.pad_token_id:
            labels[i] = -100

    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": labels
    }

# Debug function to check your data structure
def debug_dataset_structure(dataset, num_samples=3):
    """
    Debug function to understand your dataset structure
    """
    print("=== DATASET STRUCTURE DEBUG ===")
    print(f"Dataset keys: {dataset.features.keys()}")

    for i in range(min(num_samples, len(dataset))):
        print(f"\n--- Sample {i} ---")
        sample = dataset[i]

        for key, value in sample.items():
            if isinstance(value, str):
                print(f"{key}: {value[:100]}{'...' if len(value) > 100 else ''}")
            else:
                print(f"{key}: {value}")

        # Check if prompt and target are properly separated
        if "prompt" in sample and "target" in sample and "full_text" in sample:
            prompt_len = len(sample["prompt"])
            full_len = len(sample["full_text"])
            expected_target = sample["full_text"][prompt_len:]
            actual_target = sample["target"]

            print(f"Prompt length: {prompt_len}")
            print(f"Full text length: {full_len}")
            print(f"Expected target: {expected_target[:50]}...")
            print(f"Actual target: {actual_target[:50]}...")
            print(f"Targets match: {expected_target == actual_target}")

# Usage:
# First debug your dataset structure
debug_dataset_structure(val_dataset)

# Then apply the appropriate tokenization
try:
    # Try the simple approach first
    tokenized_train = train_dataset.map(
        tokenize_function_simple,
        remove_columns=train_dataset.column_names,
        batched=False
    )

    tokenized_val = val_dataset.map(
        tokenize_function_simple,
        remove_columns=val_dataset.column_names,
        batched=False
    )

    # Verify the results
    print("\n=== TOKENIZATION VERIFICATION ===")
    sample = tokenized_val[0]
    print(f"Input IDs shape: {len(sample['input_ids'])}")
    print(f"Labels shape: {len(sample['labels'])}")
    print(f"Attention mask shape: {len(sample['attention_mask'])}")

    # Count non-masked labels
    valid_labels = [x for x in sample['labels'] if x != -100]
    print(f"Valid (non-masked) labels: {len(valid_labels)}")

    if len(valid_labels) == 0:
        print("❌ Still all labels masked - checking data...")
        # Print the actual tokens for debugging
        tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'][:50])
        print(f"First 50 tokens: {tokens}")
    else:
        print("✅ Labels properly set up")

except Exception as e:
    print(f"Error in tokenization: {e}")
    print("Falling back to debug mode...")
    debug_dataset_structure(val_dataset, 1)

=== DATASET STRUCTURE DEBUG ===
Dataset keys: dict_keys(['prompt', 'target', 'full_text', '__index_level_0__'])

--- Sample 0 ---
prompt: Nom du projet: système de gestion intelligent de espace vert urbain
Description: ce projet vise à co...
target: {"Compétences Requises": ["Chef de projet", "BA", "Expert en architecture", "Expert SIG WEB", "Datab...
full_text: Nom du projet: système de gestion intelligent de espace vert urbain
Description: ce projet vise à co...
__index_level_0__: 565
Prompt length: 2679
Full text length: 3155
Expected target: {"Compétences Requises": ["Chef de projet", "BA", ...
Actual target: {"Compétences Requises": ["Chef de projet", "BA", ...
Targets match: True

--- Sample 1 ---
prompt: Nom du projet: optimisation de le irrigation agricole par sig et iot
Description: le projet viser à ...
target: {"Compétences Requises": ["Expert en architecture", "IoT", "Cloud", "Database", "Data Science", "Exp...
full_text: Nom du projet: optimisation de le irrigation agricol

Map:   0%|          | 0/2356 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/590 [00:00<?, ? examples/s]


=== TOKENIZATION VERIFICATION ===
Input IDs shape: 1024
Labels shape: 1024
Attention mask shape: 1024
Valid (non-masked) labels: 279
✅ Labels properly set up


In [None]:
import os
import torch
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler

# Hugging Face + Ray Train integration
from ray.train.huggingface.transformers import (
    RayTrainReportCallback,
    prepare_trainer,
)
import bitsandbytes as bnb
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
ray.shutdown()
# 2) Initialize Ray
ray.init(ignore_reinit_error=True)
print("Resources:", ray.cluster_resources())

2025-06-04 14:42:14,124	INFO worker.py:1888 -- Started a local Ray instance.


Resources: {'accelerator_type:L4': 1.0, 'node:__internal_head__': 1.0, 'CPU': 12.0, 'object_store_memory': 16949836185.0, 'node:172.28.0.12': 1.0, 'memory': 39549617767.0, 'GPU': 1.0}


In [None]:
import os
from ray import tune
import os
import torch
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from transformers import (
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)


# 3) Load & tokenize your dataset (example: JSON with 'text' field)
original_cols =train_dataset.column_names

# Tokenize datasets
train_tokenized = train_dataset.map(tokenize_function_simple,remove_columns=train_dataset.column_names,batched=False)
val_tokenized = val_dataset.map(
        tokenize_function_simple,
        remove_columns=val_dataset.column_names,
        batched=False
    )

print("Train columns:", train_tokenized.column_names)
print("Val   columns:",   val_tokenized.column_names)


tune_train = train_tokenized.select(range(300))  # first 500 examples
tune_val   = val_tokenized.select(range(60))

# 2) Push them into Ray object store
train_ref = ray.put(tune_train)
val_ref   = ray.put(tune_val)
tok_ref   = ray.put(tokenizer)   # if tokenizer is large

#――――――――――――――――――――――――
# 5) The training function for each Ray Tune trial
def train_mistral(config):
    # 1) Pull the real objects back from Ray
    tokenizer = ray.get(tok_ref)
    train_dataset = ray.get(train_ref)
    val_dataset = ray.get(val_ref)

    # 2) Build your quant config
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # 3) Load model & apply LoRA
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
    )
    model = prepare_model_for_kbit_training(model)
    lora_conf = LoraConfig(
        task_type="CAUSAL_LM",
        inference_mode=False,
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=config["lora_dropout"],
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    )
    model = get_peft_model(model, lora_conf)

    # 4) Training arguments (CWD is already trial dir)
    args = TrainingArguments(
        output_dir=os.getcwd(),
        per_device_train_batch_size=4,
        gradient_accumulation_steps=config["grad_accum_steps"],
        num_train_epochs=config["epoch"],
        learning_rate=config["learning_rate"],
        fp16=True,
        #optim="adamw_8bit",
        gradient_checkpointing=True,
        eval_strategy="epoch",
        save_strategy="no",
        report_to="none",
    )

    # 5) Trainer + Ray callback
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

    # FIXED: Use only one of these approaches - either the callback OR direct tune.report
    # Approach 1: Using Ray Train callback (preferred)
    from ray.train.huggingface.transformers import RayTrainReportCallback
    from ray.train.huggingface.transformers import prepare_trainer
    trainer = prepare_trainer(trainer)
    trainer.add_callback(RayTrainReportCallback())

    # 6) Run training
    trainer.train()

    # 7) Final evaluation
    final_metrics = trainer.evaluate()

    # 8) Save model
    model_dir = os.path.join(os.getcwd(), "model")
    trainer.save_model(model_dir)

    # No need to call tune.report() as the callback handles it
    # If you prefer direct reporting instead of the callback, use this approach:
    # Approach 2: Direct reporting (alternative)
    from ray import train
    train.report({"eval_loss": final_metrics["eval_loss"]})
    # OR with older API: tune.report(eval_loss=final_metrics["eval_loss"])

# Rest of the Ray Tune setup
search_space = {
    "learning_rate": tune.loguniform(5e-6, 1e-4),
    "grad_accum_steps": tune.choice([4, 8, 16]),
    "lora_r": tune.choice([16, 32, 64]),
    "lora_alpha": tune.choice([32, 64, 128]),
    "lora_dropout": tune.uniform(0.05, 0.15),
    "epoch": tune.choice([ 3,5])
}

# ASHA scheduler (Hyperband variant)
scheduler = ASHAScheduler(
    max_t=3,
    grace_period=1,
    reduction_factor=4,
)
# Run Ray Tune
analysis = tune.run(
    train_mistral,
    resources_per_trial={"cpu": 12, "gpu": 1},
    config=search_space,
    num_samples=10,  # Reduced number of trials for faster completion
    scheduler=scheduler,
    metric="eval_loss",
    mode="min",
    storage_path="/content/ray_results",
    time_budget_s=3600 * 1,  # 6 hours max
)


Map:   0%|          | 0/2356 [00:00<?, ? examples/s]

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Train columns: ['input_ids', 'attention_mask', 'labels']
Val   columns: ['input_ids', 'attention_mask', 'labels']
+----------------------------------------------------------------------+
| Configuration for experiment     train_mistral_2025-06-04_14-42-48   |
+----------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator               |
| Scheduler                        AsyncHyperBandScheduler             |
| Number of trials                 10                                  |
+----------------------------------------------------------------------+

View detailed results here: /content/ray_results/train_mistral_2025-06-04_14-42-48
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-06-04_14-42-12_953979_1111/artifacts/2025-06-04_14-42-48/train_mistral_2025-06-04_14-42-48/driver_artifacts`

Trial status: 10 PENDING
Current time: 2025-06-04 14:42:48. Total running time: 0s
Logic

[36m(pid=14005)[0m 2025-06-04 14:42:55.432208: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=14005)[0m E0000 00:00:1749048175.453777   14005 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=14005)[0m E0000 00:00:1749048175.460373   14005 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



Trial train_mistral_2f3bd_00000 started with configuration:
+----------------------------------------------------+
| Trial train_mistral_2f3bd_00000 config             |
+----------------------------------------------------+
| epoch                                            3 |
| grad_accum_steps                                 4 |
| learning_rate                                2e-05 |
| lora_alpha                                      64 |
| lora_dropout                               0.07481 |
| lora_r                                          16 |
+----------------------------------------------------+


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards:  33%|███▎      | 1/3 [00:05<00:11,  5.82s/it]
Loading checkpoint shards:  67%|██████▋   | 2/3 [00:11<00:05,  5.98s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:17<00:00,  5.73s/it]
[36m(train_mistral pid=14005)[0m No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  0%|          | 0/57 [00:00<?, ?it/s]
[36m(train_mistral pid=14005)[0m `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.



Trial status: 1 RUNNING | 9 PENDING
Current time: 2025-06-04 14:43:18. Total running time: 30s
Logical resource usage: 12.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+----------------------------------------------------------------------------------------------------------------------------------+
| Trial name                  status       learning_rate     grad_accum_steps     lora_r     lora_alpha     lora_dropout     epoch |
+----------------------------------------------------------------------------------------------------------------------------------+
| train_mistral_2f3bd_00000   RUNNING        1.54513e-05                    4         16             64        0.0748114         3 |
| train_mistral_2f3bd_00001   PENDING        1.36236e-05                    4         32             64        0.0779944         3 |
| train_mistral_2f3bd_00002   PENDING        3.84584e-05                    4         64             32        0.128864          3 |
| train_mistral_2f3bd_00003

[36m(train_mistral pid=14005)[0m   2%|▏         | 1/57 [00:39<36:26, 39.04s/it]


Trial status: 1 RUNNING | 9 PENDING
Current time: 2025-06-04 14:44:18. Total running time: 1min 30s
Logical resource usage: 12.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+----------------------------------------------------------------------------------------------------------------------------------+
| Trial name                  status       learning_rate     grad_accum_steps     lora_r     lora_alpha     lora_dropout     epoch |
+----------------------------------------------------------------------------------------------------------------------------------+
| train_mistral_2f3bd_00000   RUNNING        1.54513e-05                    4         16             64        0.0748114         3 |
| train_mistral_2f3bd_00001   PENDING        1.36236e-05                    4         32             64        0.0779944         3 |
| train_mistral_2f3bd_00002   PENDING        3.84584e-05                    4         64             32        0.128864          3 |
| train_mistral_2f3bd_0

[36m(train_mistral pid=14005)[0m   4%|▎         | 2/57 [01:18<35:51, 39.12s/it]
2025-06-04 14:44:41,878	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/content/ray_results/train_mistral_2025-06-04_14-42-48' in 0.0029s.


Trial status: 1 RUNNING | 9 PENDING
Current time: 2025-06-04 14:44:41. Total running time: 1min 53s
Logical resource usage: 12.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+----------------------------------------------------------------------------------------------------------------------------------+
| Trial name                  status       learning_rate     grad_accum_steps     lora_r     lora_alpha     lora_dropout     epoch |
+----------------------------------------------------------------------------------------------------------------------------------+
| train_mistral_2f3bd_00000   RUNNING        1.54513e-05                    4         16             64        0.0748114         3 |
| train_mistral_2f3bd_00001   PENDING        1.36236e-05                    4         32             64        0.0779944         3 |
| train_mistral_2f3bd_00002   PENDING        3.84584e-05                    4         64             32        0.128864          3 |
| train_mistral_2f3bd_0

KeyboardInterrupt: 

In [None]:
best_trial = analysis.get_best_trial("eval_loss", "min", "last")
best_params = best_trial.config
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['eval_loss']}")

In [None]:
from transformers import TrainerCallback
import logging
logging.basicConfig(level=logging.INFO)


full_train = train_dataset.map(
    tokenize_function_simple,
    remove_columns=train_dataset.column_names,
    batched=False
)
full_val = val_dataset.map(
    tokenize_function_simple,
    remove_columns=val_dataset.column_names,
    batched=False
)
# 2) Build 4-bit quant + LoRA config with your best hyperparams
quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Load model & apply LoRA with best config
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
model = prepare_model_for_kbit_training(model)


class StepLoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        if logs:
            print(f"Step {state.global_step}: {logs}")

lora_conf = LoraConfig(
        task_type="CAUSAL_LM",
        inference_mode=False,
        r=16,
        lora_alpha=128,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    )
model = get_peft_model(model, lora_conf)

    # Training arguments with epochs for final training
args = TrainingArguments(
        output_dir="/content/drive/MyDrive/my_mistral_checkpoints_4",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8 ,
        num_train_epochs=3,  # Train for 3 epochs instead of fixed steps
        learning_rate=2e-05 ,
        fp16=True,
        logging_dir="/content/drive/MyDrive/logs",
        gradient_checkpointing=True,
        eval_strategy="steps",
        eval_steps=19,
        save_strategy="steps",  # Save at end of each epoch
        save_steps= 38,
        report_to="none",
        logging_strategy="steps",
        logging_steps=19,  # Log every 50 steps
        dataloader_pin_memory=False,
        save_total_limit=2,
        load_best_model_at_end=True,  # Load best model based on eval metric
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        disable_tqdm=False,
    )




    # Create trainer
trainer = Trainer(
        model=model,
        args=args,
        train_dataset=full_train,
        eval_dataset=full_val,
        tokenizer=tokenizer,
        #callbacks=[StepLoggingCallback()],  # Add StepLoggingCallback
    )

    # Train final model
trainer.train()

    # Final evaluation
final_metrics = trainer.evaluate()
print(f"Final model validation loss: {final_metrics['eval_loss']}")

    # Save final model
trainer.save_model("/content/drive/MyDrive/final_model_4")
tokenizer.save_pretrained("/content/drive/MyDrive/final_model_4")
print("✅ Training complete! Model saved.")

Map:   0%|          | 0/2356 [00:00<?, ? examples/s]

Map:   0%|          | 0/590 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
19,0.7484,0.461506
38,0.4229,0.375392
57,0.3603,0.338357
76,0.3291,0.318297
95,0.2893,0.300256
114,0.2742,0.290561
133,0.2733,0.282834
152,0.2604,0.27978
171,0.2399,0.276165
190,0.2358,0.273966


Final model validation loss: 0.2739661633968353
✅ Training complete! Model saved.


In [1]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Your data
steps = [19, 38, 57, 76, 95, 114, 133, 152, 171, 190, 209]
training_loss = [0.748400, 0.422900, 0.360300, 0.329100, 0.289300,
                0.274200, 0.273300, 0.260400, 0.239900, 0.235800, 0.231800]
validation_loss = [0.461506, 0.375392, 0.338357, 0.318297, 0.300256,
                  0.290561, 0.282834, 0.279780, 0.276165, 0.273966, 0.271214]

# Create the figure
fig = go.Figure()

# Add training loss
fig.add_trace(go.Scatter(
    x=steps,
    y=training_loss,
    mode='lines+markers',
    name='Training Loss',
    line=dict(color='#2563eb', width=3),
    marker=dict(size=8, color='white', line=dict(color='#2563eb', width=2)),
    hovertemplate='<b>Step:</b> %{x}<br><b>Training Loss:</b> %{y:.6f}<extra></extra>'
))

# Add validation loss
fig.add_trace(go.Scatter(
    x=steps,
    y=validation_loss,
    mode='lines+markers',
    name='Validation Loss',
    line=dict(color='#dc2626', width=3),
    marker=dict(size=8, color='white', line=dict(color='#dc2626', width=2)),
    hovertemplate='<b>Step:</b> %{x}<br><b>Validation Loss:</b> %{y:.6f}<extra></extra>'
))

# Update layout
fig.update_layout(
    title={
        'text': 'Model Training Progress: Loss Over Time',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 18, 'family': 'Arial Black'}
    },
    xaxis_title='Training Step',
    yaxis_title='Loss',
    hovermode='x unified',
    plot_bgcolor='white',
    width=900,
    height=500,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="rgba(0,0,0,0.2)",
        borderwidth=1
    )
)

# Update axes
fig.update_xaxes(
    showgrid=True,
    gridwidth=1,
    gridcolor='rgba(128,128,128,0.2)',
    showline=True,
    linewidth=1,
    linecolor='black'
)

fig.update_yaxes(
    showgrid=True,
    gridwidth=1,
    gridcolor='rgba(128,128,128,0.2)',
    showline=True,
    linewidth=1,
    linecolor='black'
)

# Show the figure
fig.show()

# Print summary
print("📊 Training Summary:")
print(f"   • Training loss improved from {training_loss[0]:.3f} to {training_loss[-1]:.3f}")
print(f"   • Validation loss improved from {validation_loss[0]:.3f} to {validation_loss[-1]:.3f}")
print(f"   • Total improvement: {((training_loss[0] - training_loss[-1]) / training_loss[0] * 100):.1f}%")

📊 Training Summary:
   • Training loss improved from 0.748 to 0.232
   • Validation loss improved from 0.462 to 0.271
   • Total improvement: 69.0%


In [None]:
# Define inference function
def generate_response(new_project):
    prompt = build_prompt(new_project)
    model = AutoModelForCausalLM.from_pretrained(
        "/content/drive/MyDrive/final_model_4",
        quantization_config=quant_config,
        device_map="auto"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "### Réponse:" in generated_text:
        json_output = generated_text.split("### Réponse:")[-1].strip()
    else:
        json_output = generated_text.strip()
    return json_output


In [None]:
# Example inference
new_project = {
    "Nom du projet": "EDU",
    "Description": "Ce contrat de maintenance vise à exécuter des prestations de maintenance préventive, corrective et évolutive pour un système éducatif au Sénégal. Le projet inclut la planification des interventions et la coordination des équipes pour assurer la continuité du service dans un environnement à faible complexité technique.",
    "Durée (mois)": 12,
    "Complexité (1-5)": 1,
    "Secteur": "Éducation",
    "Tâches Identifiées": "Planification, Coordination"
}

print("\n===== Inference Example =====")
response = generate_response(new_project)
print("Generated JSON response:")
print(response)


===== Inference Example =====


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated JSON response:
{"Compétences Requises": ["Chef de projet", "Coordinateur"], "Employés Alloués": 4, "Répartition par Compétences": {"Chef de projet": 2, "Coordinateur": 2}}
