<a href="https://colab.research.google.com/github/bivekSapkota/datasetjsp/blob/NursingScheduling/Lora_finetune_Nursing_scheduling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# !pip install datasets
# !pip install transformers
# !pip install peft
# !pip install accelerate
# !pip install tensorboard
# !pip install -U bitsandbytes

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2"

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    set_seed
)
import transformers
import torch
from functools import partial
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Assuming 'utils' directory is in the same directory as this script
# and contains 'data_preprocessing.py' and '__init__.py'
try:
    from utils.data_preprocessing import preprocess_dataset
except ModuleNotFoundError:
    print("Could not import 'preprocess_dataset' from 'utils.data_preprocessing'.")
    print("Please ensure that:")
    print("- The 'utils' directory exists in the same directory as this script.")
    print("- The 'utils' directory contains a file named 'data_preprocessing.py'.")
    print("- The 'utils' directory contains an empty file named '__init__.py'.")
    print("- The 'preprocess_dataset' function is defined within 'data_preprocessing.py'.")
    # You can add more detailed troubleshooting steps here
    raise  # Re-raise the exception to stop execution

from utils.helping_functions import print_number_of_trainable_model_parameters

In [11]:
import torch

if torch.cuda.is_available():
    print(torch.version.cuda)
else:
    print("CUDA is not available.")

12.1


In [12]:
seed = 42
set_seed(seed)
#tensorboard --logdir=peft-phi3-NursingScheduling-training-consise/logs

inference_dataset_name = './gcode_prompt_response_data.json'

dataset = load_dataset("json", data_files=inference_dataset_name)
print(dataset)

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

#use device_map = {"": 0} if you want to train on specific GPU
device_map="cuda",

my_model_name='microsoft/Phi-3.5-mini-instruct'

my_model = AutoModelForCausalLM.from_pretrained(my_model_name,
                                                      device_map={'':torch.cuda.current_device()},
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      use_auth_token=False,
                                                      )

tokenizer = AutoTokenizer.from_pretrained(my_model_name,trust_remote_code=True)#,padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
# tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = 1000
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'


eval_tokenizer = AutoTokenizer.from_pretrained(my_model_name, trust_remote_code=True,)# add_bos_token=True, use_fast=False)
eval_tokenizer.pad_token = eval_tokenizer.unk_token
# tokenizer.pad_token = tokenizer.eos_token

DatasetDict({
    train: Dataset({
        features: ['index', 'prompt', 'response'],
        num_rows: 30000
    })
})


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.36s/it]


In [13]:
def gen(model,p, maxlen=2000, sample=False):
    """
    Generates text using the model based on the provided prompt.

    Args:
        model: The pre-trained language model.
        p (str): The prompt text.
        maxlen (int, optional): Maximum length of the generated text. Defaults to 1000.
        sample (bool, optional): Whether to use sampling. Defaults to False.

    Returns:
        list: A list of generated text sequences.
    """
    toks = eval_tokenizer(p, return_tensors="pt")
    res = model.generate(**toks.to("cuda"), max_new_tokens=maxlen, do_sample=sample,num_return_sequences=1).to('cpu')
    return eval_tokenizer.batch_decode(res,skip_special_tokens=True)


index = 0

VAL_SET_SIZE = 1000

train_val = dataset["train"].train_test_split(
    test_size=VAL_SET_SIZE, shuffle=True, seed=42
)
dataset = train_val

prompt = dataset['test'][index]['prompt']
summary = dataset['test'][index]['response']

formatted_prompt = f"Instruct: Provide a Gcode for HaaS mill for following problem .\n{prompt}\nOutput:\n"
res = gen(my_model,formatted_prompt,1000,)
# print(res[0])
output = res[0].split('Output:\n')[1]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{formatted_prompt}')
print(dash_line)
print(f'BASELINE SOLUTION:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')





---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Provide a Gcode for HaaS mill for following problem .
A rectangular aluminum block of length 90mm, width 140mm and a height of 35mm is fixtured in a milling machine,   where the stock has to be milled on all 4 sides by 0mm. The top left corner is placed at (12, 30) in reference to the home postion of the machine   The top left corner needs a Chamfer of 1mm. The top right corner needs a Chamfer of 1mm. The bottom left corner needs a Radius of 8mm. The bottom right corner needs a Radius of 4mm.
  A mill tool of diameter 9mm   having a side tooth engagement of 40mm which is greater than the thickness of stock is used for milling.
  The bottom tip of the tool will be positioned 8.75mm below the part surface. 
The cutter will be operated at a spindle speed of 1000 RPM   and at a feed rate of 1000 mm/min.
The TM1 mill cutter is in the location T3   with tool length stor

In [14]:

max_length = 1000
print('Max length to be used is ; ', max_length)

print(dataset)
# print(train_dataset["text"][0])

train_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['test'])
print(f"Shapes of the datasets:")
print(f"Training: {train_dataset.shape}")
print(f"Validation: {eval_dataset.shape}")
print(train_dataset)

Max length to be used is ;  1000
DatasetDict({
    train: Dataset({
        features: ['index', 'prompt', 'response'],
        num_rows: 29000
    })
    test: Dataset({
        features: ['index', 'prompt', 'response'],
        num_rows: 1000
    })
})
Preprocessing dataset...


Map: 100%|██████████| 29000/29000 [00:05<00:00, 5370.79 examples/s]
Map: 100%|██████████| 29000/29000 [00:10<00:00, 2669.90 examples/s]


Preprocessing dataset...


Map: 100%|██████████| 1000/1000 [00:00<00:00, 5116.54 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2768.83 examples/s]

Shapes of the datasets:
Training: (29000, 3)
Validation: (1000, 3)
Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 29000
})





In [None]:
print(train_dataset['text'][1])

In [15]:
# #### 8. Setup the PEFT/LoRA model for Fine-Tuning
# Now, let's perform Parameter Efficient Fine-Tuning (PEFT) fine-tuning. PEFT is a form of instruction fine-tuning that is much more efficient than full fine-tuning.
# PEFT is a generic term that includes Low-Rank Adaptation (LoRA) and prompt tuning (which is NOT THE SAME as prompt engineering!). In most cases, when someone says PEFT, they typically mean LoRA.
# LoRA, in essence, enables efficient model fine-tuning using fewer computational resources, often achievable with just a single GPU. Following LoRA fine-tuning for a specific task or use case,
# the outcome is an unchanged original LLM and the emergence of a considerably smaller "LoRA adapter," often representing a single-digit percentage of the original LLM size (in MBs rather than GBs).
#
# During inference, the LoRA adapter must be combined with its original LLM. The advantage lies in the ability of many LoRA adapters to reuse the original LLM, thereby reducing overall memory
# requirements when handling multiple tasks and use cases.
#
# Note the rank (r) hyper-parameter, which defines the rank/dimension of the adapter to be trained.
# r is the rank of the low-rank matrix used in the adapters, which thus controls the number of parameters trained.
# A higher rank will allow for more expressivity, but there is a compute tradeoff.
#
# alpha is the scaling factor for the learned weights. The weight matrix is scaled by alpha/r, and thus a higher value for alpha assigns more weight to the LoRA activations.


print(print_number_of_trainable_model_parameters(my_model))



config = LoraConfig(
    r=128, #Rank
    lora_alpha=256,
    target_modules=[
        'qkv_proj',
        'o_proj',
        'fc1',
        'fc2',
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
my_model.gradient_checkpointing_enable()

# 2 - Using the prepare_model_for_kbit_training method from PEFT
my_model = prepare_model_for_kbit_training(my_model)

peft_model = get_peft_model(my_model, config)


# Once everything is set up and the base model is prepared, we can use the print_trainable_parameters()
# helper function to see how many trainable parameters are in the model.

print(print_number_of_trainable_model_parameters(peft_model))


output_dir = './testing_final_code_peft-phi3-Gcode_Generation_30k/'


MICRO_BATCH_SIZE = 2
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LOG_STEP = 50
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    per_device_eval_batch_size=MICRO_BATCH_SIZE,   # batch size for evaluation*
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # max_steps=3,
    num_train_epochs=2,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=LOG_STEP,
    logging_dir=output_dir+"logs",
    save_strategy="steps",
    save_steps=LOG_STEP,
    evaluation_strategy="steps",
    eval_steps=LOG_STEP,
    do_eval=True,
    gradient_checkpointing=True,
    # report_to="tensorboard",
    overwrite_output_dir = 'True',
    group_by_length=True,
    save_total_limit=50,
    # output_dir="phi2_lora-jssp_machines_first_lora_r_{32}",
    fp16=True,
)

peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

peft_training_args.device



trainable model parameters: 197200896
all model parameters: 2009140224
percentage of trainable model parameters: 9.82%
trainable model parameters: 75497472
all model parameters: 2084637696
percentage of trainable model parameters: 3.62%


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


device(type='cuda', index=0)

In [16]:
peft_trainer.train()

# peft_trainer.train(resume_from_checkpoint=True)

# Free memory for merging weights
del my_model
del peft_trainer
torch.cuda.empty_cache()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 17%|█▋        | 50/302 [1:31:27<7:42:06, 110.03s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 0.3334, 'grad_norm': 58803.90625, 'learning_rate': 0.00016744186046511629, 'epoch': 0.33}


                                                     
 17%|█▋        | 50/302 [1:35:07<7:42:06, 110.03s/it]

{'eval_loss': 0.07762635499238968, 'eval_runtime': 218.345, 'eval_samples_per_second': 4.58, 'eval_steps_per_second': 0.765, 'epoch': 0.33}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 33%|███▎      | 100/302 [3:06:15<6:05:15, 108.49s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 0.0771, 'grad_norm': 22714.306640625, 'learning_rate': 0.00013421926910299004, 'epoch': 0.66}


                                                      
 33%|███▎      | 100/302 [3:09:54<6:05:15, 108.49s/it]

{'eval_loss': 0.07646479457616806, 'eval_runtime': 218.2619, 'eval_samples_per_second': 4.582, 'eval_steps_per_second': 0.765, 'epoch': 0.66}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 50%|████▉     | 150/302 [4:41:11<4:37:26, 109.52s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 0.0761, 'grad_norm': 15566.3701171875, 'learning_rate': 0.00010099667774086379, 'epoch': 0.99}


                                                      
 50%|████▉     | 150/302 [4:44:51<4:37:26, 109.52s/it]

{'eval_loss': 0.07600916922092438, 'eval_runtime': 219.8411, 'eval_samples_per_second': 4.549, 'eval_steps_per_second': 0.76, 'epoch': 0.99}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 66%|██████▌   | 200/302 [6:16:11<3:07:06, 110.06s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 0.0758, 'grad_norm': 10332.0908203125, 'learning_rate': 6.777408637873754e-05, 'epoch': 1.32}


                                                      
 66%|██████▌   | 200/302 [6:19:54<3:07:06, 110.06s/it]

{'eval_loss': 0.0758180096745491, 'eval_runtime': 221.7201, 'eval_samples_per_second': 4.51, 'eval_steps_per_second': 0.753, 'epoch': 1.32}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 83%|████████▎ | 250/302 [7:51:06<1:34:04, 108.56s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 0.0757, 'grad_norm': 8231.904296875, 'learning_rate': 3.4551495016611294e-05, 'epoch': 1.65}


                                                      
 83%|████████▎ | 250/302 [7:54:46<1:34:04, 108.56s/it]

{'eval_loss': 0.07576054334640503, 'eval_runtime': 219.5001, 'eval_samples_per_second': 4.556, 'eval_steps_per_second': 0.761, 'epoch': 1.65}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
 99%|█████████▉| 300/302 [9:26:09<03:39, 109.85s/it]  Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


{'loss': 0.0755, 'grad_norm': 8582.6806640625, 'learning_rate': 1.3289036544850498e-06, 'epoch': 1.99}


                                                    
 99%|█████████▉| 300/302 [9:29:49<03:39, 109.85s/it]

{'eval_loss': 0.07564808428287506, 'eval_runtime': 218.5903, 'eval_samples_per_second': 4.575, 'eval_steps_per_second': 0.764, 'epoch': 1.99}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
100%|██████████| 302/302 [9:33:31<00:00, 113.95s/it]


{'train_runtime': 34411.9086, 'train_samples_per_second': 1.685, 'train_steps_per_second': 0.009, 'train_loss': 0.11866202397851755, 'epoch': 2.0}
