In [6]:
import os
import torch
import mlflow as mf
import shutil
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    AutoModelForSequenceClassification,       
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorWithPadding
)
from ftzard.utils.dvc import get_current_date_time
import datetime
from datasets import concatenate_datasets
from pathlib import Path
import torch.nn.functional as F
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from sklearn.metrics import balanced_accuracy_score, accuracy_score
import numpy as np
from trl import SFTTrainer
from hydra import initialize, compose
import optuna
import ftzard.utils.mlflow as mf_utils
import joblib
import dagstermill as dgm

In [2]:
!nvidia-smi

Tue Jul  2 11:42:27 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 41%   56C    P8     1W / 260W |   8209MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 46%   65C    P8    39W / 260W |   8275MiB / 11264MiB |      0%      Default |
|       

In [3]:
# Report which CUDA devices (if any) PyTorch can see from this kernel.
if not torch.cuda.is_available():
    print("CUDA is not available on this system.")
else:
    num_devices = torch.cuda.device_count()
    print(f"Number of CUDA devices available: {num_devices}")

    # One line per visible device, in index order.
    for i in range(num_devices):
        device_name = torch.cuda.get_device_name(i)
        print(f"Device {i}: {device_name}")

Number of CUDA devices available: 2
Device 0: NVIDIA GeForce RTX 2080 Ti
Device 1: NVIDIA GeForce RTX 2080 Ti


In [4]:
# Project locations used throughout the notebook.
base_path = '/app/ftzard'
config_path = f'{base_path}/config/'

# Expose the config directory under the relative name "config_link" in the
# current working directory (that relative name is what gets passed to Hydra).
try:
    os.symlink(config_path, "config_link")
except FileExistsError:
    # Expected on every execution after the first one.
    print("Symlink already created...")
except OSError as e:
    # Any other failure (permissions, read-only filesystem, ...) is a real
    # problem; report it instead of mislabeling it as "already created".
    print(f"Could not create symlink 'config_link' -> {config_path}: {e}")

config_name = 'config'
data1_path = f"{base_path}/data/tokenized_data.joblib"
data2_path = f"{base_path}/data/retraining_data.joblib"

Symlink already created...


In [5]:
# Compose the Hydra config through the symlinked config directory and pull out
# the two MLflow settings the rest of the notebook needs.
with initialize(version_base=None, config_path="config_link"):
    cfg = compose(config_name=config_name)
    tracking_uri = cfg.MLFLOW.TRACKING.URI
    experiment_name = cfg.MLFLOW.EXPERIMENT.NAME
    

In [8]:
# Run-level settings; the tracking URI is exported through the environment.
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri
base_run_name = "RETRAINING"
run_name = get_current_date_time()
model_name = cfg.HUGGINGFACE.MODEL.NAME
max_len = 1024

# Echo the effective settings, one "label value" line each.
for _label, _value in (
    ('Mlflow Base Run Name: ', base_run_name),
    ('Previous Data Path: ', data1_path),
    ('Sampled Data Path: ', data2_path),
    ('Mlflow Experiment Name: ', experiment_name),
    ('Mlflow Run Name: ', run_name),
    ('Model Name: ', model_name),
):
    print(_label, _value)

Mlflow Base Run Name:  RETRAINING
Previous Data Path:  /app/ftzard/data/tokenized_data.joblib
Sampled Data Path:  /app/ftzard/data/retraining_data.joblib
Mlflow Experiment Name:  senetiment_analysis
Mlflow Run Name:  2024-07-02_11:44
Model Name:  tiiuae/falcon-7b


In [9]:
# Load the two serialized payloads: the previously tokenized splits and the
# newly sampled retraining rows.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load files from a trusted source.
datasets = joblib.load(data1_path)
sampled_dataset = joblib.load(data2_path)

In [10]:
# Unwrap the actual datasets from their container payloads.
# NOTE(review): both names are rebound in place, so this cell is not
# idempotent — re-running it without re-running the load cell will index into
# the already-unwrapped objects.
datasets = datasets["datasets"]
sampled_dataset = sampled_dataset["data"]

In [11]:
# Show the structure of both inputs before they are merged (banner line, then the object).
print("--------------- Previous Training Data ------------------", datasets, sep="\n")
print("--------------- Sampled Data ------------------", sampled_dataset, sep="\n")

--------------- Previous Training Data ------------------
DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 14400
    })
    val: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1120
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 480
    })
})
--------------- Sampled Data ------------------
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 120
})


In [12]:
# Retraining set = previous training split followed by the newly sampled rows.
retrain_data = concatenate_datasets([datasets["train"], sampled_dataset])

In [13]:
# Sanity check: features and row count of the merged retraining set.
print(retrain_data)

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 14520
})


In [14]:
# Tokenizer for the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
# Reuse EOS as the padding token (presumably because this tokenizer ships
# without a dedicated pad token — TODO confirm).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
# Collator that pads the examples of each batch using the tokenizer above.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# 4-bit NF4 quantization settings used when loading the base model;
# computation on the quantized weights runs in float16.
compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [16]:
# Load the base checkpoint as a 2-label sequence classifier, quantized with
# `quant_config`; device_map="auto" leaves weight placement to the loader.
# NOTE(review): the recorded execution of this cell stopped with a CUDA
# OutOfMemoryError on GPU 1 while loading checkpoint shards. The earlier
# nvidia-smi output shows roughly 8.2 GiB already in use on each 11 GiB GPU,
# so other processes probably have to be stopped first — confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quant_config,
    num_labels=2,
    device_map="auto"
)

print(model)

  return self.fget.__get__(instance, owner)()
Loading checkpoint shards:  50%|██████████████████████                      | 1/2 [00:13<00:13, 13.69s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 40.00 MiB (GPU 1; 10.76 GiB total capacity; 2.10 GiB already allocated; 34.56 MiB free; 2.12 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Run PEFT's k-bit training preparation on the quantized model.
# NOTE(review): `model` is rebound in place, so re-running this cell applies
# the preparation to an already-prepared model.
model = prepare_model_for_kbit_training(model)
print(model)

In [None]:
def get_lora_model(model, config):
    """Wrap ``model`` with the adapters described by ``config``.

    Args:
        model: Base model, handed to ``get_peft_model`` unchanged.
        config: PEFT configuration object (the notebook passes a ``LoraConfig``).

    Returns:
        Whatever ``get_peft_model(model, config)`` returns.
    """
    peft_model = get_peft_model(model, config)
    return peft_model

In [None]:

def compute_metrics(eval_pred):
    """Compute accuracy metrics from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Two-item iterable ``(predictions, labels)``; ``predictions``
            holds per-class scores with the class dimension on axis 1.

    Returns:
        dict with keys ``'balanced_accuracy'`` and ``'accuracy'``.
    """
    predictions, labels = eval_pred
    # Turn per-class scores into predicted class ids.
    predicted_classes = np.argmax(predictions, axis=1)
    # scikit-learn metrics take (y_true, y_pred). Balanced accuracy averages
    # recall over the *true* classes, so swapping the arguments changes the
    # result (accuracy itself is symmetric, but is kept consistent).
    return {
        'balanced_accuracy': balanced_accuracy_score(labels, predicted_classes),
        'accuracy': accuracy_score(labels, predicted_classes),
    }

class CustomTrainer(Trainer):
    """``Trainer`` whose loss is cross-entropy with optional per-class weights.

    Args:
        *args: Forwarded to ``Trainer.__init__``.
        class_weights: Optional sequence/array/tensor with one weight per
            class; ``None`` (default) means unweighted cross-entropy.
        **kwargs: Forwarded to ``Trainer.__init__``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            # as_tensor accepts lists, arrays and existing tensors alike
            # (torch.tensor on a tensor emits a copy-construct warning).
            self.class_weights = torch.as_tensor(
                class_weights, dtype=torch.float32
            ).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the (optionally class-weighted) cross-entropy loss for a batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; its ``"labels"`` entry is removed before the
                remaining items are passed to ``model``.
            return_outputs: If true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted only for compatibility with Trainer
                versions that pass it to ``compute_loss``; unused here.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        # cross_entropy expects integer class ids.
        labels = inputs.pop("labels").long()

        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Keep the weights and labels where the logits are, and the weights in
        # the logits' dtype, so the loss does not fail on a device/dtype
        # mismatch (e.g. sharded or half-precision models).
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)
        loss = F.cross_entropy(logits, labels.to(logits.device), weight=weight)

        return (loss, outputs) if return_outputs else loss


In [None]:
# Optuna study driving the hyperparameter search; 'maximize' because the value
# reported back with study.tell is the evaluation accuracy.
# NOTE(review): no sampler seed is set, so suggested values differ between runs.
study = optuna.create_study(direction='maximize')

In [None]:
# Defensive cleanup: end MLflow runs that an earlier, interrupted execution of
# this cell may have left active (end_run does nothing when no run is active).
for _attempt in range(2):
    mf.end_run()
artifact_path = "mlartifacts"
experiment_id = mf_utils.create_experiment(exp_name=experiment_name)
top_level_run_id = mf_utils.get_run_id_by_name(run_name=base_run_name,
                                               experiment_ids=[experiment_id])

### RUN HP_TUNING ###
# Run hierarchy: RETRAINING (top level) -> <run_name> (this session) -> one nested run per trial.
with mf.start_run(run_id=top_level_run_id, run_name=base_run_name, experiment_id=experiment_id):
    base_run_id = mf_utils.get_run_id_by_name(run_name=run_name,
                                              experiment_ids=[experiment_id], nested=True)
    print('Experiment Id: ', experiment_id)

    ## RUN FROM get_current_date_time()
    with mf.start_run(run_id=base_run_id, experiment_id=experiment_id,
                      run_name=run_name, nested=True):
        for i in range(1, 5):
            trial = study.ask()
            nested_run_name = f"{run_name}_trial_{i}"
            # NOTE(review): when a run with this name already exists its id is
            # reused; MLflow then refuses to overwrite params logged earlier
            # (see the recorded "Changing param values is not allowed" errors).
            run_id = mf_utils.get_run_id_by_name(run_name=nested_run_name,
                                                 experiment_ids=[experiment_id],
                                                 nested=True)
            print('Run Id: ', run_id)

            ### HYPERPARAM RUN ###
            # Context manager instead of start_run()/end_run(): the trial run is
            # closed even when training raises, so the enclosing `with` blocks
            # end the runs they actually opened. A falsy run_id starts a new run.
            with mf.start_run(run_id=run_id or None, run_name=nested_run_name,
                              experiment_id=experiment_id, nested=True):

                ## CHOOSE HYPERPARAMETERS ####
                rank = trial.suggest_categorical("rank", [8, 16, 32, 64])
                lr = trial.suggest_float("lr", 0.00006, 0.0004)
                batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64])
                weight_decay = trial.suggest_float("weight_decay", 0.0005, 0.02)
                lora_dropout = trial.suggest_float("lora_dropout", 0.03, 0.06)
                try:
                    mf.log_params(trial.params)
                except Exception as e:
                    # Best effort: keep training, but do not hide the failure.
                    print(f"Warning: could not log trial params to MLflow: {e}")

                lora_config = LoraConfig(
                    r=rank,  # the dimension of the low-rank matrices
                    lora_alpha=rank // 2,  # scaling factor for LoRA activations vs pre-trained weight activations
                    target_modules=[
                        "query_key_value",
                        "dense",
                        "dense_h_to_4h",
                        "dense_4h_to_h",
                        # "score"
                    ],
                    lora_dropout=lora_dropout,  # dropout probability of the LoRA layers
                    bias='none',  # whether to train bias weights, set to 'none' for attention layers
                    task_type='SEQ_CLS',
                )

                # NOTE(review): every trial wraps the same `model` object, and
                # `del lora_model` below does not restore it; if get_peft_model
                # modifies the base model in place, trials are not independent.
                # Confirm, and reload the base model per trial if so.
                lora_model = get_lora_model(model=model, config=lora_config)
                lora_model.config.use_cache = False
                lora_model.config.pretraining_tp = 1
                lora_model.config.pad_token_id = tokenizer.pad_token_id

                training_args = TrainingArguments(
                    output_dir=f'sentiment_classification_run_{i}',
                    learning_rate=lr,
                    per_device_train_batch_size=batch_size,
                    per_device_eval_batch_size=batch_size,
                    num_train_epochs=1,
                    weight_decay=weight_decay,
                    evaluation_strategy='epoch',
                    save_strategy='epoch',
                    load_best_model_at_end=True,
                )

                # NOTE(review): `sampled_dataset` is also part of `retrain_data`,
                # so the tuning objective is measured on rows seen in training;
                # consider datasets["val"] for evaluation.
                trainer = CustomTrainer(
                    model=lora_model,
                    args=training_args,
                    train_dataset=retrain_data,
                    eval_dataset=sampled_dataset,
                    tokenizer=tokenizer,
                    data_collator=collate_fn,
                    compute_metrics=compute_metrics,
                )

                try:
                    result = trainer.train()
                    eval_res = trainer.evaluate()
                except BaseException:
                    # Mark the trial as failed (also on interrupt) so the study
                    # is not left with a dangling running trial, then re-raise.
                    study.tell(trial, state=optuna.trial.TrialState.FAIL)
                    raise
                study.tell(trial, eval_res['eval_accuracy'])

                # Persist the trained adapter and attach it to the trial's run.
                trial_artifact_dir = f'{base_path}/data/artifacts/{run_name}_run_{i}'
                trainer.save_model(trial_artifact_dir)
                mf.log_artifacts(local_dir=trial_artifact_dir)

            # Drop the Trainer's checkpoint directory once the artifacts are logged.
            shutil.rmtree(f'sentiment_classification_run_{i}', ignore_errors=True)
            del lora_model
            
    
    

The provided experiment name senetiment_analysis already exists, the run will be logged in this experiment.
                                 
Experiment Id:  1
Run Id:  be663a108e5d4bcb9751953d29d0de37


2024/06/25 14:39:54 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id be663a108e5d4bcb9751953d29d0de37: Failed to log run data: Exception: Changing param values is not allowed. Params were already logged='[{'key': 'logging_dir', 'old_value': 'sentiment_classification_run_1/runs/Jun18_19-01-28_933e8d6f554d', 'new_value': 'sentiment_classification_run_1/runs/Jun25_14-39-53_8e802e6f43fc'}]' for run ID='be663a108e5d4bcb9751953d29d0de37'.
2024/06/25 14:39:54 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id be663a108e5d4bcb9751953d29d0de37: Failed to log run data: Exception: Changing param values is not allowed. Params were already logged='[{'key': 'learning_rate', 'old_value': '0.00018373234106961294', 'new_value': '0.0002561582931817167'}, {'key': 'weight_decay', 'old_value': '0.0011609033084593473', 'new_value': '0.0070471347745436255'}]' for run ID='be663a108e5d4bcb9751953d29d0de37'.


Epoch,Training Loss,Validation Loss
