In [1]:
##############################################################################
# Handle imports
##############################################################################
from docling.document_converter import DocumentConverter
import traceback
from collections.abc import Iterable
import os
import pypdfium2 as pdfium
import re
import json
import jsonlines
import uuid
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval import assert_test
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval, AnswerRelevancyMetric
from deepeval import evaluate
from huggingface_hub import snapshot_download, login, HfApi
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer
import xml.etree.ElementTree as etree
from longdocfactscore.ldfacts import LongDocFACTScore
from datetime import datetime
import nltk
import pandas as pd
from datasets import load_dataset, interleave_datasets
nltk.download('punkt_tab')
from dotenv import load_dotenv
load_dotenv()
from transformers import DataCollatorForLanguageModeling, TrainingArguments
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig
from transformers import EarlyStoppingCallback
import torch
import optuna
import traceback
torch.cuda.empty_cache()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
##############################################################################
# Set up variables
##############################################################################
SOURCE_DIR="source_docs"
SOURCE_DIR_CHUNKED="source_docs_chunked"
MARKDOWN_DIR="markdown"
MARKDOWN_URI_PREFIX="https://raw.githubusercontent.com/agapebondservant/code-generation-capstone/refs/heads/main/eda/resources"
REPORT_DIR="reports"
OUTPUT_DIR="output"
INVALID_DIR="invalid"
ERROR_DIR="error" 
MODEL_DIR="models"
MODEL_IDS = ["ibm-granite/granite-8b-code-instruct-4k","ibm-granite/granite-8b-code-base-128k"]
DEVICE="cuda"
DATASET_REPO=f"{os.getenv('HF_USERNAME')}/codegen"

In [3]:
##############################################################################
# Set up object instances
##############################################################################

data_generator_llm = ChatOpenAI(
    model=os.getenv("DATA_GENERATOR_MODEL_ID"), # os.getenv('QWEN25CODER_MODEL_ID'),
    api_key=os.getenv('OPENROUTER_TOKEN'),
    base_url=os.getenv('OPENROUTER_API_BASE'),
    temperature=0.1,
)

class DataGeneratorLLM(DeepEvalBaseLLM):
    def __init__(
        self,
        model
    ):
        self.model = model

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        return chat_model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        chat_model = self.load_model()
        res = await chat_model.ainvoke(prompt)
        return res.content

    def get_model_name(self):
        return "Custom Data Generator LLM (GPT-OSS)"

evaluator_llm = DataGeneratorLLM(data_generator_llm)

In [4]:
##############################################################################
# PROMPTS AND PROMPT TYPES
##############################################################################

summary_prompt = """
Your task is to analyze this code snippet and provide an explanation of the code.
    
Instructions:
1. Provide a concise explanation that summarizes the purpose of the code without getting into too many specific technical details.
2. If the provided snippet does not appear to be a code snippet, indicate that this is not valid code.
3. Also exclude any details that link the requirements to a specific programming language or framework.
"""

topics_prompt = """
Use the provided summary to analyze this code snippet and generate a list of programming topics that are related to the code.
    
Instructions:
1. Provide a short list of topics that you can identify.
2. If the provided snippet does not appear to be a code snippet, indicate that this is not valid code.
"""

components_prompt = """
Your task is to analyze this code snippet and generate a specification of all the JSP relevant components you can find.

Instructions:
1. Include only relevant components.
3. If the provided snippet does not appear to be a code snippet, indicate that this is not valid code.
"""

domain_prompt = """
Your task is to analyze this code snippet and generate an outline of the domain model associated with this code.
    
Instructions:
1. Avoid getting into too many specific technical details. Simply provide a domain model of the code.
2. If the provided snippet does not appear to be a code snippet, indicate that this is not valid code.
3. Include the current state of the domain objects based on information extracted from the code.
"""

keywords_prompt = """
Your task is to analyze this code snippet and generate a list of keywords that are associated with the code.
    
Instructions:
1. Provide a short list of keywords.
2. If the provided snippet does not appear to be a code snippet, indicate that this is not valid code.
"""

functional_requirements_prompt = """
Use the provided summary to analyze this code snippet and generate a list of programming topics that are related to the code.
    
Instructions:
1. Provide a short list of topics that you can identify.
2. If the provided snippet does not appear to be a code snippet, indicate that this is not valid code.
"""

business_requirements_prompt = """
Use the provided summary to generate an outline of sample business requirements that might be connected to the code.

Instructions:
1. Provide a short list of relevant requirements. Do not include requirements that are not related to the code.
2. If the provided snippet does not appear to be a code snippet, indicate that this is not valid code.
"""

prompts = {

    "functional_requirements": {
        
        "prompt": functional_requirements_prompt, 

        "title": "Functional Requirements",
    },
    "business_requirements": {
        
        "prompt": business_requirements_prompt, 

        "title": "Business Requirements",
    },
    "topics": {
        
        "prompt": topics_prompt, 

        "title": "Components",
    },
    "components": {
        
        "prompt": components_prompt,

        "title": "Topics",
    },
    "keywords": {
        
        "prompt": keywords_prompt,

        "title": "Keywords",
    },
    "summary": {
        
        "prompt": summary_prompt,

        "title": "Summary",
    }
}


prompts_with_dependencies = {
    "topics": "summary",
    
    "business_requirements": "summary",
    
    "functional_requirements": "summary",
}

### Download candidate models
The following candidate models will be downloaded:
- ibm-granite/granite-8b-code-instruct-4k
- ibm-granite/granite-8b-code-base-128k

In [5]:
##############################################################################
# UTILITY METHODS
##############################################################################

def download_models(repo_id):
    try:
        ##############################################################################
        # Save the model
        ##############################################################################
        local_dir = snapshot_download(repo_id=repo_id, cache_dir=MODEL_DIR)
        
        print(f"Model {repo_id} downloaded to: {local_dir}")

        ##############################################################################
        # Save the tokenizer
        ##############################################################################
        tokenizer = AutoTokenizer.from_pretrained(repo_id)

        if tokenizer.pad_token is None:
            
            tokenizer.pad_token = tokenizer.eos_token

        tokenizer.save_pretrained(local_dir)
        
        
    except Exception as e:
    
        print(f"Error downloading model {repo_id}: {e}")

def upload_models(repo_id, model_dir):

    try:
    
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
        
        model = AutoModelForCausalLM.from_pretrained(model_dir, 
                                                     trust_remote_code=True,
                                                     device_map=DEVICE)
    
        api = HfApi()
    
        api.create_repo(repo_id=repo_id, repo_type="model")
    
        api.upload_folder(
            folder_path=model_dir,
            
            repo_id=repo_id,
            
            repo_type="model"
        )

    except Exception as e:
    
        print(f"Error uploading model {repo_id} from directory {model_dir}: {e}")

def build_datasets(dataset_name):

    final_datasets = []

    def process_summary_to_text(example, code_type=""):
        
        example["text"], example["completion"], example["code_type"] = example["summary"], example[code_type], [code_type]*len(example["code"])
        
        return example

    def process_code_to_text(example, code_type=""):
        example["text"], example["completion"], example["code_type"] = example["code"], example[code_type], [code_type]*len(example["code"])
        
        return example
    
    train_dataset = load_dataset(dataset_name, split="train")

    test_dataset = load_dataset(dataset_name, split="test")

    for dataset in [train_dataset, test_dataset]:

        datasets = []

        code_types = [c for c, obj in prompts.items() if c not in ['code']]
        
        for code_type in code_types:

            if code_type in prompts_with_dependencies:

                datasets.append(dataset.map(process_summary_to_text, batched=True, fn_kwargs={"code_type": code_type}))
            
            else:

                datasets.append(dataset.map(process_code_to_text, batched=True, fn_kwargs={"code_type": code_type}))

        final_datasets.append(interleave_datasets(datasets))

    return final_datasets
        
    

In [6]:
##############################################################################
# Code Formatting Helper Function
##############################################################################
def code_text_formatter(example):

    _code = example['code']
    
    _summary = example['summary']

    _code_type = example["code_type"]

    _text = example['text']

    _prompt = prompts[_code_type]["prompt"]

    _title = prompts[_code_type]["title"]

    ######################################
    # Code-Summary pair
    ######################################
    if _code_type in prompts_with_dependencies:
        text = f"""
        <|assistant|>
        {_prompt}
        Summary:
        {_summary}
        <|assistant|>
        {_title}:
        {_text}<|endoftext|>
        """

        return text

    #######################
    # Code-Text pair
    #######################
    else:
        text = f"""
        <|system|>
        You are a helpful assistant.
        {_prompt}
        Code to analyze:
        <|user|>
        {_code}
        <|assistant|>
        {_title}:
        {_text}<|endoftext|>
        """

        return text

In [7]:
##############################################################################
# PIPELINES
##############################################################################
def peft_finetuning_pipeline(dataset_name, use_dora=False):
    """
    Executes the LoRA pipeline.
    """
    try:
        [os.makedirs(dirname, exist_ok=True) for dirname in [
            MODEL_DIR
        ]]
    
        ##############################################################################
        # Early Stopping Callback
        ##############################################################################
        early_stopping_callback = EarlyStoppingCallback(
            early_stopping_patience=3,
            
            early_stopping_threshold=0.001,
        )   
    
        ##############################################################################
        # Load models to finetune
        ##############################################################################
        for model_id in MODEL_IDS:
    
            print(f"Start finetuning {model_id}...")

            base_model_dir = model_id.replace("/","_")

            [os.makedirs(dirname, exist_ok=True) for dirname in [
                MODEL_DIR/base_model_dir/"experiment",
                MODEL_DIR/base_model_dir/"final",
                MODEL_DIR/base_model_dir/"model",
            ]]
    
            model = AutoModelForCausalLM.from_pretrained(
                
                model_id,
                
                device_map="auto",

                trust_remote_code=True,
            )
    
            tokenizer = AutoTokenizer.from_pretrained(model_id)
    
            if tokenizer.pad_token is None:
                
                tokenizer.pad_token = tokenizer.eos_token

            train_dataset, test_dataset = build_datasets(dataset_name)
        
            ##############################################################################
            # Data collator
            ##############################################################################
            collator = DataCollatorForLanguageModeling(
                
                tokenizer=tokenizer,
                
                mlm=False,
            )
            
            ##############################################################################
            # Objective Function for Hyperparameter Tuning
            ##############################################################################
            def objective(trial):

                ##############################################################################
                # Hyperparameters
                ##############################################################################

                learning_rate = trial.suggest_float(
                    "learning_rate", 1e-5, 1e-4, log=True
                )
                
                per_device_train_batch_size = trial.suggest_categorical(
                    "per_device_train_batch_size", [16, 32]
                )
                
                r = trial.suggest_categorical(
                    "r", [8, 16, 32]
                )
                
                lora_alpha = trial.suggest_categorical(
                    "lora_alpha", [16, 32, 64]
                )
                
                lora_dropout = trial.suggest_categorical(
                    "lora_dropout", [0.05, 0.1]
                )
                
    
                ##############################################################################
                # LoRA / DORA Configuration
                ##############################################################################
            
                lora_config = LoraConfig(
                    r=r, 
                    
                    lora_alpha=lora_alpha,
                    
                    target_modules=None,
                    
                    lora_dropout=lora_dropout,
                    
                    bias="none",
            
                    use_dora=use_dora,
                )

                ##############################################################################
                # Training Arguments / SFTConfig
                ##############################################################################
                training_args = SFTConfig(
                    
                    output_dir=MODEL_DIR/base_model_dir/"experiment",
                    
                    learning_rate=learning_rate,
                    
                    per_device_train_batch_size=per_device_train_batch_size,
                    
                    per_device_eval_batch_size=per_device_train_batch_size,
                    
                    num_train_epochs=10,
                    
                    logging_steps=100,
                    
                    fp16=True,
                    
                    report_to="none",
                    
                    eval_strategy="epoch",  
                    
                    save_strategy="epoch",   
                    
                    load_best_model_at_end=True,  
                
                    metric_for_best_model="eval_loss", 
                    
                    greater_is_better=False,   
                    
                    max_length=4096,
                    
                    packing=False,    
                
                    seed=42,
                )
        
                ##############################################################################
                # Supervised Finetuning Trainer
                ##############################################################################
            
                trainer = SFTTrainer(
                    
                    model=model,
                    
                    args=training_args,
                    
                    train_dataset=train_dataset,
                    
                    eval_dataset=test_dataset,
                    
                    peft_config = lora_config,
                    
                    formatting_func = code_text_formatter,
                    
                    data_collator = collator,
                    
                    callbacks=[early_stopping_callback],
                )

                trainer.train()

                return trainer.state.best_metric
                

            ##############################################################################
            # Perform Hyperparameter Search
            ##############################################################################\

            study = optuna.create_study(direction="minimize") # Minimize loss
            
            study.optimize(objective, n_trials=10)
    
            final_lora_config = LoraConfig(
                r=study.best_params["r"], 
                
                lora_alpha=study.best_params["lora_alpha"],
                
                target_modules=None,
                
                lora_dropout=study.best_params["lora_dropout"],
                
                bias="none",
        
                use_dora=use_dora,
            )
            
            final_training_args = SFTConfig(
                output_dir=MODEL_DIR/base_model_dir/"final",
            
                learning_rate=study.best_params["learning_rate"],
                
                per_device_train_batch_size=study.best_params["per_device_train_batch_size"],
                
                per_device_eval_batch_size=study.best_params["per_device_train_batch_size"],
                
                num_train_epochs=10,
                
                logging_steps=100,
                
                fp16=True,
                
                report_to="none",
                
                eval_strategy="epoch",  
                
                save_strategy="epoch",   
                
                load_best_model_at_end=True,  
            
                metric_for_best_model="eval_loss", 
                
                greater_is_better=False,   
                
                max_length=4096,
                
                packing=False,    
            
                seed=42,
            )
            
            final_trainer = SFTTrainer(
                
                model=model,
                
                args=final_training_args,
                
                train_dataset=train_dataset,
                
                eval_dataset=test_dataset,
                
                peft_config = final_lora_config,
                
                formatting_func = code_text_formatter,
                
                data_collator = collator,
                
                callbacks=[early_stopping_callback],
            )
            
            ##############################################################################
            # Start finetuning!
            ##############################################################################
            final_trainer.train()
    
            ##############################################################################
            # Save snapshot and push to HuggingFace Hub
            ##############################################################################

            try:
            
                model.save_pretrained(MODEL_DIR/base_model_dir/"model")
                
                tokenizer.save_pretrained(MODEL_DIR/base_model_dir/"model")

                published_model_id = model_id.partition("/")[2] or model_id

                model.push_to_hub(published_model_id)

            except Exception as e:

                print(f"Error saving and pushing to HuggingFace: {e}")
        
                traceback.print_exc()
            
    except Exception as e:

        print(f"Error running PEFT pipeline: {e}")

        traceback.print_exc()

def lora_finetuning_pipeline(dataset_name):
    """
    Executes the LoRA pipeline.
    """
    return peft_finetuning_pipeline(dataset_name, use_dora=False)
        
def dora_finetuning_pipeline(dataset_name):
    """
    Executes the DORA pipeline.
    """
    return peft_finetuning_pipeline(dataset_name, use_dora=True)

            

### Run the pipeline
Execute the pipelines!

In [8]:
lora_finetuning_pipeline(f"{os.getenv('HF_USERNAME')}/jsp-code-to-text")

[I 2025-11-01 05:16:03,248] A new study created in memory with name: no-name-b20b472b-84d4-4cde-baca-6bbc3a56a7f4


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


2025-11-01 05:16:03,899 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,0.667576,0.638615,408902.0,0.855645
2,No log,0.566938,0.530779,817804.0,0.873005
3,0.766500,0.539024,0.491951,1226706.0,0.876737
4,0.766500,0.515617,0.46216,1635608.0,0.880178
5,0.766500,0.500406,0.446687,2044510.0,0.881825
6,0.464900,0.488214,0.423459,2453412.0,0.886007
7,0.464900,0.485286,0.398853,2862314.0,0.885761
8,0.372800,0.494797,0.369039,3271216.0,0.886097
9,0.372800,0.497241,0.36129,3680118.0,0.885624
10,0.372800,0.49828,0.359393,4089020.0,0.885552


[I 2025-11-01 05:41:37,046] Trial 0 finished with value: 0.4852861762046814 and parameters: {'learning_rate': 9.559733394045224e-05, 'per_device_train_batch_size': 32, 'r': 8, 'lora_alpha': 64, 'lora_dropout': 0.05}. Best is trial 0 with value: 0.4852861762046814.
2025-11-01 05:41:37,172 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,0.699664,0.671339,408902.0,0.84862
2,1.044300,0.575546,0.538176,817804.0,0.872612
3,0.562200,0.550816,0.505889,1226706.0,0.875699
4,0.521400,0.534113,0.486414,1635608.0,0.878801
5,0.521400,0.522573,0.475035,2044510.0,0.880047
6,0.485200,0.511664,0.461564,2453412.0,0.881748
7,0.459200,0.501671,0.45276,2862314.0,0.882547
8,0.436200,0.49704,0.442574,3271216.0,0.882917
9,0.416600,0.494651,0.438579,3680118.0,0.883454
10,0.416600,0.494482,0.434681,4089020.0,0.883118


[I 2025-11-01 06:03:55,236] Trial 1 finished with value: 0.4944818913936615 and parameters: {'learning_rate': 5.151115763529888e-05, 'per_device_train_batch_size': 16, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05}. Best is trial 0 with value: 0.4852861762046814.
2025-11-01 06:03:55,361 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,1.030858,1.023342,408902.0,0.771341
2,No log,0.634351,0.60159,817804.0,0.861545
3,0.960600,0.583014,0.536688,1226706.0,0.8707
4,0.960600,0.566059,0.525362,1635608.0,0.872855
5,0.960600,0.552976,0.51504,2044510.0,0.874947
6,0.545100,0.544287,0.510028,2453412.0,0.876234
7,0.545100,0.538311,0.489178,2862314.0,0.87733
8,0.505100,0.534073,0.486135,3271216.0,0.877861
9,0.505100,0.531646,0.481163,3680118.0,0.878805
10,0.505100,0.531101,0.479228,4089020.0,0.878943


[I 2025-11-01 06:29:30,042] Trial 2 finished with value: 0.531100869178772 and parameters: {'learning_rate': 6.546652626974007e-05, 'per_device_train_batch_size': 32, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.1}. Best is trial 0 with value: 0.4852861762046814.
2025-11-01 06:29:30,173 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,1.453379,1.085384,408902.0,0.711866
2,No log,1.345312,1.142937,817804.0,0.718818
3,1.467100,1.228181,1.086024,1226706.0,0.736601
4,1.467100,1.123504,1.036935,1635608.0,0.75391
5,1.467100,1.014178,0.905792,2044510.0,0.783732
6,1.167300,0.917159,0.823116,2453412.0,0.803229
7,1.167300,0.849851,0.772105,2862314.0,0.818287
8,0.904200,0.806593,0.745216,3271216.0,0.828494
9,0.904200,0.785022,0.729093,3680118.0,0.8334
10,0.904200,0.778643,0.724313,4089020.0,0.834705


[I 2025-11-01 06:55:03,298] Trial 3 finished with value: 0.7786428928375244 and parameters: {'learning_rate': 1.2118037018196061e-05, 'per_device_train_batch_size': 32, 'r': 8, 'lora_alpha': 32, 'lora_dropout': 0.05}. Best is trial 0 with value: 0.4852861762046814.
2025-11-01 06:55:03,438 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,1.334222,1.158241,408902.0,0.719743
2,No log,1.095506,1.039453,817804.0,0.757898
3,1.295500,0.823237,0.755867,1226706.0,0.823245
4,1.295500,0.687408,0.655788,1635608.0,0.853127
5,1.295500,0.634215,0.603073,2044510.0,0.863148
6,0.724300,0.611048,0.579962,2453412.0,0.866435
7,0.724300,0.600214,0.56455,2862314.0,0.869055
8,0.591700,0.594758,0.556287,3271216.0,0.869502
9,0.591700,0.591941,0.55345,3680118.0,0.870068
10,0.591700,0.591042,0.552307,4089020.0,0.870114


[I 2025-11-01 07:20:38,598] Trial 4 finished with value: 0.5910419821739197 and parameters: {'learning_rate': 1.7558066996821136e-05, 'per_device_train_batch_size': 32, 'r': 16, 'lora_alpha': 64, 'lora_dropout': 0.1}. Best is trial 0 with value: 0.4852861762046814.
2025-11-01 07:20:38,744 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,1.141581,1.084432,408902.0,0.750288
2,1.329700,0.682337,0.645384,817804.0,0.855025
3,0.728100,0.599664,0.562997,1226706.0,0.869394
4,0.584100,0.581541,0.539903,1635608.0,0.871513
5,0.584100,0.570469,0.534729,2044510.0,0.873305
6,0.554100,0.562669,0.531792,2453412.0,0.873993
7,0.541900,0.556699,0.516076,2862314.0,0.874772
8,0.532500,0.552582,0.511524,3271216.0,0.875284
9,0.524800,0.550398,0.508923,3680118.0,0.875677
10,0.524800,0.549898,0.507392,4089020.0,0.875615


[I 2025-11-01 07:42:56,950] Trial 5 finished with value: 0.5498979091644287 and parameters: {'learning_rate': 2.3556885553743287e-05, 'per_device_train_batch_size': 16, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.1}. Best is trial 0 with value: 0.4852861762046814.
2025-11-01 07:42:57,095 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,1.276755,1.135776,408902.0,0.722978
2,1.424400,0.872045,0.787426,817804.0,0.815027
3,0.930900,0.66083,0.626535,1226706.0,0.858931
4,0.646800,0.611721,0.574257,1635608.0,0.86781
5,0.646800,0.594391,0.556559,2044510.0,0.870358
6,0.585300,0.586073,0.549078,2453412.0,0.871049
7,0.570800,0.580714,0.541433,2862314.0,0.871953
8,0.562600,0.576675,0.536553,3271216.0,0.87241
9,0.556000,0.574526,0.53556,3680118.0,0.872762
10,0.556000,0.573885,0.534594,4089020.0,0.872583


[I 2025-11-01 08:05:17,613] Trial 6 finished with value: 0.5738846063613892 and parameters: {'learning_rate': 2.3445691559115525e-05, 'per_device_train_batch_size': 16, 'r': 32, 'lora_alpha': 16, 'lora_dropout': 0.1}. Best is trial 0 with value: 0.4852861762046814.
2025-11-01 08:05:17,760 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,0.709852,0.679115,408902.0,0.845326
2,1.054200,0.577082,0.539214,817804.0,0.872381
3,0.564600,0.552416,0.50764,1226706.0,0.875252
4,0.523700,0.535791,0.487625,1635608.0,0.878375
5,0.523700,0.524446,0.47706,2044510.0,0.879874
6,0.488300,0.515151,0.46395,2453412.0,0.881501
7,0.464100,0.504501,0.455151,2862314.0,0.882152
8,0.441800,0.499867,0.444621,3271216.0,0.882902
9,0.422500,0.495934,0.441416,3680118.0,0.883235
10,0.422500,0.495588,0.437631,4089020.0,0.883066


[I 2025-11-01 08:27:36,059] Trial 7 finished with value: 0.495587557554245 and parameters: {'learning_rate': 5.009766817925159e-05, 'per_device_train_batch_size': 16, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05}. Best is trial 0 with value: 0.4852861762046814.
2025-11-01 08:27:36,202 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,1.493803,1.051355,408902.0,0.707886
2,No log,1.413748,1.114586,817804.0,0.715158
3,1.516400,1.339164,1.137895,1226706.0,0.718933
4,1.516400,1.258945,1.096515,1635608.0,0.728684
5,1.516400,1.198144,1.079167,2044510.0,0.742267
6,1.313800,1.144519,1.046024,2453412.0,0.750744
7,1.313800,1.098721,0.997028,2862314.0,0.758393
8,1.151000,1.062878,0.955772,3271216.0,0.764391
9,1.151000,1.040964,0.931616,3680118.0,0.773694
10,1.151000,1.033641,0.924323,4089020.0,0.778314


[I 2025-11-01 08:53:09,997] Trial 8 finished with value: 1.0336408615112305 and parameters: {'learning_rate': 1.2275766321499958e-05, 'per_device_train_batch_size': 32, 'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05}. Best is trial 0 with value: 0.4852861762046814.
2025-11-01 08:53:10,138 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Start finetuning ibm-granite/granite-8b-code-instruct-4k...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,No log,1.449667,1.124329,408902.0,0.708864
2,1.531600,1.285001,1.122447,817804.0,0.722069
3,1.329200,1.126454,1.035548,1226706.0,0.751335
4,1.110700,0.948483,0.846474,1635608.0,0.798463
5,1.110700,0.813942,0.7461,2044510.0,0.828347
6,0.882300,0.740846,0.694734,2453412.0,0.841983
7,0.755300,0.703911,0.665239,2862314.0,0.852093
8,0.700100,0.682326,0.646669,3271216.0,0.856262
9,0.674300,0.671284,0.636521,3680118.0,0.857863
10,0.674300,0.668098,0.63304,4089020.0,0.8585


[I 2025-11-01 09:15:30,240] Trial 9 finished with value: 0.6680976152420044 and parameters: {'learning_rate': 1.0907720688848449e-05, 'per_device_train_batch_size': 16, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1}. Best is trial 0 with value: 0.4852861762046814.


Error running PEFT pipeline: name 'model' is not defined


Traceback (most recent call last):
  File "/tmp/ipykernel_18610/2702774244.py", line 229, in peft_finetuning_pipeline
    model=model,
          ^^^^^
NameError: name 'model' is not defined


### Evaluate the candidate models
Evaluate the candidate models using the following metrics / bechmarks:

In [9]:
###################################
# Perform data
###################################
