In [1]:
import os
import torch
import mlflow as mf
import shutil
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    AutoModelForSequenceClassification,       
    TrainingArguments,
    pipeline,
    logging,
    DataCollatorWithPadding
)
from pathlib import Path
import torch.nn.functional as F
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from sklearn.metrics import balanced_accuracy_score, accuracy_score
import numpy as np
from ftzard.utils.common import get_current_date_time
from trl import SFTTrainer
from hydra import initialize, compose
import optuna
import ftzard.utils.mlflow as mf_utils
import joblib
import dagstermill as dgm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!nvidia-smi

Mon Jul  1 08:26:37 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 40%   53C    P8     1W / 260W |     18MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 43%   64C    P8    39W / 260W |      8MiB / 11264MiB |      0%      Default |
|       

In [4]:
# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of available CUDA devices
    num_devices = torch.cuda.device_count()
    print(f"Number of CUDA devices available: {num_devices}")

    # Print information about each device
    for i in range(num_devices):
        device_name = torch.cuda.get_device_name(i)
        print(f"Device {i}: {device_name}")
else:
    print("CUDA is not available on this system.")

Number of CUDA devices available: 2
Device 0: NVIDIA GeForce RTX 2080 Ti
Device 1: NVIDIA GeForce RTX 2080 Ti


In [5]:
os.getcwd()

'/app/ftzard/pipeline/notebooks'

In [6]:
base_path = '/app/ftzard'
config_path = f'{base_path}/config/'
try:
    os.symlink(config_path, "config_link")
except Exception as e:
    print("Symlink already created...")
data_path = f"{base_path}/data/tokenized_dataset.joblib"
config_name = 'config'

Symlink already created...


In [7]:
with initialize(version_base=None, config_path="config_link"):
    cfg = compose(config_name=config_name)
    tracking_uri, experiment_name = cfg.MLFLOW.TRACKING.URI, cfg.MLFLOW.EXPERIMENT.NAME
    

In [8]:
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri
run_name = get_current_date_time()
model_name = cfg.HUGGINGFACE.MODEL.NAME
base_run_name = "HP-TUNING"
max_len = 1024
print("Base Run Name: ", base_run_name)
print('Data Path: ', data_path)
print('Mlflow Experiment Name: ', experiment_name)
print('Mlflow Run Name: ', run_name)
print('Model Name: ', model_name)

Base Run Name:  HP-TUNING
Data Path:  /app/ftzard/data/tokenized_dataset.joblib
Mlflow Experiment Name:  senetiment_analysis
Mlflow Run Name:  2024-07-01_8:26
Model Name:  tiiuae/falcon-7b


In [9]:
datasets = joblib.load(data_path)

In [10]:
datasets = datasets["datasets"]

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quant_config,
    num_labels=2,
    device_map="auto"
)

print(model)

  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|████████████████████████████████████████████| 2/2 [00:14<00:00,  7.07s/it]
Some weights of FalconForSequenceClassification were not initialized from the model checkpoint at tiiuae/falcon-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FalconForSequenceClassification(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (rotary_emb): FalconRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=4544, out_features=4672, bias=False)
          (dense): Linear4bit(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): Linear4bit(in_features=4544, out_features=18176, bias=False)
          (act): GELUActivation()
          (dense_4h_to_h): Linear4bit(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=4544, out_features=2, bias=False)
)


In [14]:
model = prepare_model_for_kbit_training(model)
print(model)

FalconForSequenceClassification(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (rotary_emb): FalconRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=4544, out_features=4672, bias=False)
          (dense): Linear4bit(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): Linear4bit(in_features=4544, out_features=18176, bias=False)
          (act): GELUActivation()
          (dense_4h_to_h): Linear4bit(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=4544, out_features=2, bias=False)
)


In [15]:
def get_lora_model(model, config):
    return get_peft_model(model, config)

In [16]:

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'balanced_accuracy' : balanced_accuracy_score(predictions, labels),'accuracy':accuracy_score(predictions,labels)}

class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Ensure label_weights is a tensor
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss


In [17]:
study = optuna.create_study(direction='maximize')

[I 2024-07-01 08:27:06,340] A new study created in memory with name: no-name-feec8c7c-3f0d-41da-9020-bd6022ec8d7d


In [18]:
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 14400
    })
    val: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1120
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 480
    })
})

#### I will train on a subset of data to capture the output using the select function in huggingace datset.

In [19]:
for i in range(2):
    mf.end_run()
artifact_path = "mlartifacts"
experiment_id = mf_utils.create_experiment(exp_name=experiment_name)
top_level_run_id = mf_utils.get_run_id_by_name(run_name=base_run_name, 
                                             experiment_ids=[experiment_id])

### RUN HP_TUNING ###
with mf.start_run(run_id = top_level_run_id, run_name=base_run_name, experiment_id=experiment_id):
    base_run_id = mf_utils.get_run_id_by_name(run_name=run_name, 
                                                 experiment_ids=[experiment_id], nested=True)
    print('Experiment Id: ', experiment_id)

    ## RUN FROM get_current_date_time()
    with mf.start_run(run_id = base_run_id, experiment_id=experiment_id,
                        run_name=run_name, nested=True):
        for i in range(1, 5):
            trial = study.ask()
            nested_run_name = f"{run_name}_trial_{i}"
            run_id = mf_utils.get_run_id_by_name(run_name=nested_run_name, 
                                                 experiment_ids=[experiment_id],
                                                nested = True)
            print('Run Id: ', run_id)

            ### HYPERPARAM RUN ###
            if run_id:
                mf.start_run(run_id=run_id, run_name=nested_run_name, experiment_id=experiment_id, nested=True)
            else:
                mf.start_run(run_name=nested_run_name, experiment_id=experiment_id, nested=True)
    
            ## CHOOSE HYPERAPARAMTERS ####
    
            rank = trial.suggest_categorical("rank", [8,16,32,64])
            lr = trial.suggest_float("lr", 0.00006, 0.0004,)
            batch_size = trial.suggest_categorical("batch_size", [8,16,32,64])
            weight_decay = trial.suggest_float("weight_decay", 0.0005, 0.02)
            lora_dropout = trial.suggest_float("lora_dropout", 0.03, 0.06)
            try:
                mf.log_params(trial.params)
            except Exception as e:
                pass
    
            lora_config = LoraConfig(
            r = rank, # the dimension of the low-rank matrices
            lora_alpha = rank//2, # scaling factor for LoRA activations vs pre-trained weight activations
            target_modules = [
                "query_key_value",
                "dense",
                "dense_h_to_4h",
                "dense_4h_to_h",
                # "score"
            ],
            lora_dropout = lora_dropout, # dropout probability of the LoRA layers
            bias = 'none', # wether to train bias weights, set to 'none' for attention layers
            task_type = 'SEQ_CLS'
            )
    
            lora_model = get_lora_model(model=model, config=lora_config)
            lora_model.config.use_cache = False
            lora_model.config.pretraining_tp = 1
            lora_model.config.pad_token_id = tokenizer.pad_token_id
            
            training_args = TrainingArguments(
            output_dir = f'sentiment_classification_run_{i}',
            learning_rate = lr,
            per_device_train_batch_size = batch_size,
            per_device_eval_batch_size = batch_size,
            num_train_epochs = 1,
            weight_decay = weight_decay,
            evaluation_strategy = 'epoch',
            save_strategy = 'epoch',
            load_best_model_at_end = True
            )
        

            trainer = CustomTrainer(
            model = lora_model,
            args = training_args,
            train_dataset = datasets['train'].select(range(1000)),
            eval_dataset = datasets['val'],
            tokenizer = tokenizer,
            data_collator = collate_fn,
            compute_metrics = compute_metrics,
            )
    
            result = trainer.train()
            eval_res = trainer.evaluate()
            study.tell(trial, eval_res['eval_accuracy'])
            trainer.save_model(f'{base_path}/data/artifacts/{run_name}_run_{i}')
            mf.log_artifacts(local_dir=f'{base_path}/data/artifacts/{run_name}_run_{i}')
            mf.end_run()
            shutil.rmtree(f'sentiment_classification_run_{i}', ignore_errors=True)
            del lora_model
            
    
    

INFO:alembic.runtime.migration:Context impl SQLiteImpl.
INFO:alembic.runtime.migration:Will assume non-transactional DDL.


The provided experiment name senetiment_analysis already exists, the run will be logged in this experiment.
                                 
Experiment Id:  1
Run Id:  ff351744c39747a0ad96867671e49953




Epoch,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
1,No log,0.406359,0.824145,0.823214




INFO:peft.tuners.tuners_utils:Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!


Run Id:  0a517780063642ecadd0fbeece580d28




Epoch,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
1,No log,0.476641,0.811426,0.794643




INFO:peft.tuners.tuners_utils:Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!


Run Id:  d86ae004ad9b436ca757d8ee9af0b44a




Epoch,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
1,No log,0.782862,0.538034,0.533036




INFO:peft.tuners.tuners_utils:Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!


Run Id:  acf3d09f0575420d8e6109223861b7cd




Epoch,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
1,No log,0.439007,0.81559,0.814286




