In [2]:
%run training_functions.ipynb

['train', 'validation', 'test']
['train', 'validation', 'test']
In, Out, get_ipython, exit, quit, open, accelerate, datasets, evaluate, math, np, peft, pickle, pytest, ipytest, pd, transformers, torch, load_dataset, load_dataset_builder, get_dataset_split_names, get_dataset_config_names, LoftQConfig, LoraConfig, get_peft_model, AutoModelForCausalLM, AutoTokenizer, pipeline, TrainingArguments, Trainer, SFTTrainer, SFTConfig, @py_builtins, @pytest_ar, model_from_pkl, ds_gst1_train, ds_gst1_test, ds_gst2_train, ds_gst2_test, search_with_strings, map_data, print_trainable_parameters, metric, comp_metrics_output, compute_metrics, make_trainer, get_dataframe, get_training_output, 

In [3]:
import torch
from opacus.grad_sample import GradSampleModule

In [4]:
lla_321, lla_321_tokenizer=model_from_pkl("Llama-3.2-1B-Instruct")

In [5]:
l_tokenized_stage1_train=map_data(ds_gst1_train, lla_321, lla_321_tokenizer)
l_tokenized_stage1_test=map_data(ds_gst1_test, lla_321, lla_321_tokenizer)

In [6]:
from torch.utils.data import DataLoader
train_dataloader=DataLoader(l_tokenized_stage1_train, shuffle=True)
test_dataloader=DataLoader(l_tokenized_stage1_test, shuffle=True)

In [7]:
# to fine-tune with LoRA, instantiate a base model (as above)
# create LoraConfig where LoRA-specific parameters are defined
config=LoraConfig(
    #inference_mode=False,
    r=8, #rank of update matrices, lower value results in smaller matrices with fewer parameters
    lora_alpha=16, #LoRA scaling factor
    task_type="CAUSAL_LM",
    lora_dropout=0.1, # dropout probability of LoRA layers
    bias="none", # specifies if bias parameters should be trained
    #modules_to_save=["decode_head"] #models apart from LoRA layers that are trainable
)

In [63]:
lla_lora_model=get_peft_model(lla_321, config)

In [64]:
optimizer=torch.optim.AdamW(lla_lora_model.base_model.parameters(),
                           amsgrad=False, # the AMSGrad variant of this algorithm won't be used 
                            betas=(0.9, 0.999), # coefficients used for computing running averages of gradient and its square
                            capturable=False, # whether the instance will be captured in a CUDA graph
                            differentiable=False, # whether autogad should occur through the optimzer step in training
                            eps=1e-08, # added to denominator to improve numerical stablitity
                            foreach=None, # whether foreach implementation is used
                            fused=None, #whether the fused implementation is used
                            #initial_lr=2e-05,
                            lr=0.1, #learning rate
                            maximize=False, # whether the object is maximized with respect to params instead og
                            weight_decay=0.0)

In [65]:
l3_trainer=make_trainer(lla_lora_model, train_dataloader.dataset, test_dataloader.dataset, config,
                          SFTConfig(output_dir="test_trainer", eval_strategy="epoch",
                                    per_device_train_batch_size=1,
                                    max_grad_norm=1.0,
                                    num_train_epochs=1,
                                    logging_strategy="epoch",
                                    #logging_steps=6
                                   ), (optimizer))

In [None]:
l3_trainer.train()



Epoch,Training Loss,Validation Loss




In [27]:
l3_trainer.state.log_history[0]['loss']

0.6782

method

In [12]:
l3_trainer.train_dataset

Dataset({
    features: ['record', 'text', 'input_ids', 'attention_mask'],
    num_rows: 500
})

In [56]:
# the Trainer object will call the optimizer's train() function at each training step if method is callable/exists
# maybe create a train method for your optimizer with the required steps for DPOptimizer 
def optimize(trainer):
    #for i in range(l3_trainer.args.num_train_epochs): might not be needed if the trainer executes this at steps/epochs
    losses=[] # empty loss list
    data_loader=trainer.get_train_dataloader
    optimizer=trainer.optimizer
    optimizer.zero_grad() # resets the gradients of the parameter tensors to zero

    outputs= trainer.state.log_history#training output at this step/epoch
    loss=outputs[0]['loss'] #appropriate position in outputs
    loss.backward() # calculates gradient by backpropagating error from the current loss
    losses.append(loss.item()) #add the gradient value to the list
    optimizer.step() # perform optimization step to update the parameter
    
    

In [11]:
optimizer.step

<bound method AdamW.step of AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.1
    maximize: False
    weight_decay: 0.0
)>

In [20]:
optimizer.pre_step

<bound method DPOptimizer.pre_step of AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.1
    lr: 0.1
    maximize: False
    weight_decay: 0.0
)>

In [49]:
optimizer.train=optimize

In [44]:
from opacus import PrivacyEngine
privacy_engine = PrivacyEngine(secure_mode=False)
model, optimizer, train_loader = privacy_engine.make_private(
        module=lla_lora_model,
        optimizer=optimizer,
        data_loader=train_dataloader,
        noise_multiplier=1.3,
        max_grad_norm=1.0,
        )

In [40]:
lla_lora_model=get_peft_model(lla_321, config)

In [50]:
l6_trainer=make_trainer(lla_lora_model, train_dataloader.dataset, test_dataloader.dataset, config,
                          SFTConfig(output_dir="test_trainer", eval_strategy="epoch",
                                    per_device_train_batch_size=1,
                                    max_grad_norm=1.0,
                                    num_train_epochs=1,
                                    logging_strategy="epoch",
                                    #logging_steps=6,
                                    ), optimizer)

In [25]:
dir(optimizer)

['OptimizerPostHook',
 'OptimizerPreHook',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_skip_next_step',
 '_cuda_graph_capture_health_check',
 '_get_flat_grad_sample',
 '_group_tensors_by_device_and_dtype',
 '_is_last_step_skipped',
 '_opt_called',
 '_optimizer_step_code',
 '_patch_step_function',
 '_process_value_according_to_param_policy',
 '_step_skip_queue',
 'accumulated_iterations',
 'add_noise',
 'add_param_group',
 'attach_step_hook',
 'clip_and_accumulate',
 'defaults',
 'expected_batch_size',
 'generator',
 'grad_samples',
 'load_state_dict',
 'loss_reduction',
 'max_grad_norm',
 'noise_multipli

In [51]:
l6_trainer.optimizer.train(l5_trainer)

AttributeError: 'float' object has no attribute 'backward'

In [32]:
l5_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.6645,2.611787,0.000937




TrainOutput(global_step=125, training_loss=0.6644692993164063, metrics={'train_runtime': 373.4442, 'train_samples_per_second': 1.339, 'train_steps_per_second': 0.335, 'total_flos': 2992122101760000.0, 'train_loss': 0.6644692993164063})

In [33]:
dir(l3_trainer.optimizer)

['OptimizerPostHook',
 'OptimizerPreHook',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cuda_graph_capture_health_check',
 '_group_tensors_by_device_and_dtype',
 '_init_group',
 '_opt_called',
 '_optimizer_load_state_dict_post_hooks',
 '_optimizer_load_state_dict_pre_hooks',
 '_optimizer_state_dict_post_hooks',
 '_optimizer_state_dict_pre_hooks',
 '_optimizer_step_code',
 '_optimizer_step_post_hooks',
 '_optimizer_step_pre_hooks',
 '_patch_step_function',
 '_process_value_according_to_param_policy',
 '_warned_capturable_if_run_uncaptured',
 '_zero_grad_profile_name',
 'add_param_group',
 'defaults',
 'load_stat

In [19]:
type(l3_trainer)

trl.trainer.sft_trainer.SFTTrainer

In [39]:
# run the .get_optimizer_cls_and_kwargs on the trainer
l3_trainer.get_optimizer_cls_and_kwargs(SFTConfig(output_dir="test_trainer", eval_strategy="epoch",
                                    per_device_train_batch_size=1,
                                    max_grad_norm=1.0,
                                    num_train_epochs=1,
                                    logging_strategy="epoch",
                                    #logging_steps=6
                                   ))

(torch.optim.adamw.AdamW, {'lr': 2e-05, 'betas': (0.9, 0.999), 'eps': 1e-08})

In [29]:
l4_trainer=make_trainer(lla_lora_model, train_dataloader.dataset, test_dataloader.dataset, config,
                          SFTConfig(output_dir="test_trainer", eval_strategy="epoch",
                                    per_device_train_batch_size=1,
                                    max_grad_norm=1.0,
                                    num_train_epochs=1,
                                    logging_strategy="epoch",
                                    #logging_steps=6
                                   ))

In [34]:
l4_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.6643,2.611203,0.000859




TrainOutput(global_step=125, training_loss=0.664299072265625, metrics={'train_runtime': 385.8257, 'train_samples_per_second': 1.296, 'train_steps_per_second': 0.324, 'total_flos': 2992122101760000.0, 'train_loss': 0.664299072265625})

In [36]:
dir(l4_trainer.optimizer)
# it seems that the grad_samples parameter doesn't exist in a regular AdamW optimizer, it is just for the DPOptimizer version
# when the .get_optimizer_cls_and_kwargs function is ran on the trainer with the DPOptimizer, it retrieves arguments from the normal AdamW optimizer

['OptimizerPostHook',
 'OptimizerPreHook',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cuda_graph_capture_health_check',
 '_group_tensors_by_device_and_dtype',
 '_init_group',
 '_opt_called',
 '_optimizer_load_state_dict_post_hooks',
 '_optimizer_load_state_dict_pre_hooks',
 '_optimizer_state_dict_post_hooks',
 '_optimizer_state_dict_pre_hooks',
 '_optimizer_step_code',
 '_optimizer_step_post_hooks',
 '_optimizer_step_pre_hooks',
 '_patch_step_function',
 '_process_value_according_to_param_policy',
 '_warned_capturable_if_run_uncaptured',
 '_zero_grad_profile_name',
 'add_param_group',
 'defaults',
 'load_stat

In [41]:
l4_trainer.get_optimizer_cls_and_kwargs(SFTConfig(output_dir="test_trainer", eval_strategy="epoch",
                                    per_device_train_batch_size=1,
                                    max_grad_norm=1.0,
                                    num_train_epochs=1,
                                    logging_strategy="epoch",
                                    #logging_steps=6
                                   ))
# for the default and imported optimizers, the same arguments are retrieved by this function

(torch.optim.adamw.AdamW, {'lr': 2e-05, 'betas': (0.9, 0.999), 'eps': 1e-08})