In [1]:
import os
import torch
import mlflow as mf
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoModelForSequenceClassification,       
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from hydra import initialize, compose
import optuna
import ftzard.utils.mlflow as mf_utils
import dagstermill as dgm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!nvidia-smi

Sat Jun 15 10:36:06 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 41%   37C    P8     1W / 260W |   4488MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 41%   38C    P8    12W / 260W |      8MiB / 11264MiB |      0%      Default |
|       

In [3]:
# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of available CUDA devices
    num_devices = torch.cuda.device_count()
    print(f"Number of CUDA devices available: {num_devices}")

    # Print information about each device
    for i in range(num_devices):
        device_name = torch.cuda.get_device_name(i)
        print(f"Device {i}: {device_name}")
else:
    print("CUDA is not available on this system.")

Number of CUDA devices available: 2
Device 0: NVIDIA GeForce RTX 2080 Ti
Device 1: NVIDIA GeForce RTX 2080 Ti


In [4]:
base_path = '/home/aamir/FTzard/ftzard'
config_path = f'../../config/'
data_path = f"{base_path}/data/cleaned_data.csv"
config_name = 'config'

In [5]:
with initialize(version_base=None, config_path=config_path):
    cfg = compose(config_name=config_name)
    tracking_uri, experiment_name = cfg.MLFLOW_TRACKING_URI, cfg.MLFLOW_EXPERIMENT_NAME
    

In [6]:
os.environ['MLFLOW_TRACKING_URI'] = tracking_uri
run_name = 'hyper_param_tuning'
model_name = cfg['model_name']
max_len = 1024
print('Data Path: ', data_path)
print('Mlflow Experiment Name: ', experiment_name)
print('Mlflow Run Name: ', run_name)
print('Model Name: ', model_name)

Data Path:  /home/aamir/FTzard/ftzard/data/cleaned_data.csv
Mlflow Experiment Name:  senetiment_analysis
Mlflow Run Name:  hyper_param_tuning
Model Name:  tiiuae/falcon-7b


In [None]:
datasets = {}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quant_config,
    num_labels=2
)

print(model)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Downloading shards: 100%|███████████████████████████████████████████████████| 2/2 [02:16<00:00, 68.12s/it]
  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|████████████████████████████████████████████| 2/2 [00:10<00:00,  5.14s/it]
Some weights of FalconForSequenceClassification were not initialized from the model checkpoint at tiiuae/falcon-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FalconForSequenceClassification(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (rotary_emb): FalconRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=4544, out_features=4672, bias=False)
          (dense): Linear4bit(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): Linear4bit(in_features=4544, out_features=18176, bias=False)
          (act): GELUActivation()
          (dense_4h_to_h): Linear4bit(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=4544, out_features=2, bias=False)
)


In [11]:
model = prepare_model_for_kbit_training(model)
print(model)

FalconForSequenceClassification(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (rotary_emb): FalconRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=4544, out_features=4672, bias=False)
          (dense): Linear4bit(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): Linear4bit(in_features=4544, out_features=18176, bias=False)
          (act): GELUActivation()
          (dense_4h_to_h): Linear4bit(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=4544, out_features=2, bias=False)
)


In [12]:
def get_lora_model(model, config):
    return get_peft_model(model, config)


In [15]:

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {'balanced_accuracy' : balanced_accuracy_score(predictions, labels),'accuracy':accuracy_score(predictions,labels)}

class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Ensure label_weights is a tensor
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss


NameError: name 'Trainer' is not defined

In [14]:
study = optuna.create_study(direction='maximize')

[I 2024-06-15 10:59:59,893] A new study created in memory with name: no-name-a2a83c9f-21d7-47ed-b1ef-7e74946b046b


In [None]:
with mf.start_run(experiment_name=experiment_name,
                    run_name=run_name):
    for i in range(1, 11):
        trial = study.ask()
        nested_run_name = f"trial_{i}"
        run_id = mf_utils.get_run_id_by_name(run_name=nested_run_name, experiment_ids=[experiment_id])
        print('Run Id: ', run_id)
        if run_id:
            mf.start_run(run_id=run_id, run_name=nested_run_name, experiment_id=experiment_id, nested=True)
        else:
            mf.start_run(run_name=nested_run_name, experiment_id=experiment_id, nested=True)

        ## CHOOSE HYPERAPARAMTERS ####

        rank = trial.suggest_categorical("rank", [8,16,32,64])
        lr = trial.suggest_float("lr", 0.00006, 0.0004,)
        batch_size = trial.suggest_categorical("batch_size", [8,16,32,64])
        weight_decay = trial.suggest_float("weight_decay", 0.0005, 0.02)
        lora_dropout = trial.suggest_float("lora_dropout", 0.03, 0.06)

        mf.log_params(trial.params)

        lora_config = LoraConfig(
        r = rank, # the dimension of the low-rank matrices
        lora_alpha = rank//2, # scaling factor for LoRA activations vs pre-trained weight activations
        target_modules = [
            "query_key_value",
            "dense",
            "dense_h_to_4h",
            "dense_4h_to_h",
        ],
        lora_dropout = lora_dropout, # dropout probability of the LoRA layers
        bias = 'none', # wether to train bias weights, set to 'none' for attention layers
        task_type = 'SEQ_CLS'
        )

        lora_model = get_lora_model(model=model, config=lora_config)
        lora_model.config.use_cache = False
        lora_model.config.pretraining_tp = 1
        lora_model.config.pad_token_id = tokenizer.pad_token_id
        
        training_args = TrainingArguments(
        output_dir = 'sentiment_classification',
        learning_rate = lr,
        per_device_train_batch_size = batch_size,
        per_device_eval_batch_size = batch_size,
        num_train_epochs = 3,
        weight_decay = weight_decay,
        evaluation_strategy = 'epoch',
        save_strategy = 'epoch',
        load_best_model_at_end = True
        )

        trainer = CustomTrainer(
        model = lora_model,
        args = training_args,
        train_dataset = datasets['train'],
        eval_dataset = datasets['val'],
        tokenizer = tokenizer,
        data_collator = collate_fn,
        compute_metrics = compute_metrics,
        )
  

        study.tell(trial, val_accuracy)
        mf.end_run()
            
        
    
    