In [1]:
!pip install datasets bitsandbytes trl==0.12.1 transformers peft huggingface-hub accelerate safetensors pandas matplotlib numpy==1.26.4

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting trl==0.12.1
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Downloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m33.

# Install necessary libraries

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    #AutoPeftModelForCausalLM, # Removed from transformers
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer, SFTConfig
# from trl.trainer.utils import DataCollatorForCompletionOnlyLM
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM, PeftConfig # Added to peft
from huggingface_hub import notebook_login
from trl import SFTTrainer, SFTConfig, setup_chat_format, DataCollatorForCompletionOnlyLM


# Check for bf16 support and set compute dtype


In [2]:
support = torch.cuda.is_bf16_supported(including_emulation=False)
calculate_dtype = torch.bfloat16 if support else torch.float32

In [3]:
print(calculate_dtype)

torch.bfloat16


#bnb config for loading 4 bit model with nf4 quant type
* loading model with quantization config
* device map to cuda
* 4bit true

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= calculate_dtype, #calculate_dtype can be bf16 or float32- use bf16 if supported
    bnb_4bit_use_double_quant= True
    )
repo = "microsoft/DialoGPT-small"
model = AutoModelForCausalLM.from_pretrained(repo, quantization_config= bnb_config, device_map= "cuda:0")

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

#Check model memory footprint

In [5]:
print(model.get_memory_footprint()/1024/1024)

127.8501205444336


#model config

In [6]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affin

#Prepare model for kbit training
##Use Lora Config


1.   rank [4,8,16,32] - choose one
2.   lora_alpha is a scalling factor which should be 2x the rank of matrix.
3.   dropout range from 0.03 to 0.10 which helps prevent overfit
4.   module - choose module as per requirement


In [7]:
# Prepare model for quantized training
model = prepare_model_for_kbit_training(model)

# DialoGPT/GPT-2 uses different layer names than newer models
config = LoraConfig(
    r = 8,  # rank of LoRA - [4-16]
    bias = "none",  # ["all", "lora_only"] - for train bias term
    lora_alpha = 16,  # scaling factor
    lora_dropout = 0.6,  # prevent overfit - used for regularisation

    # CORRECTED target modules for DialoGPT/GPT-2 architecture
    target_modules = [
        "c_attn",    # Combined Q, K, V projection (replaces q_proj, k_proj, v_proj)
        "c_proj",    # Output projection (replaces o_proj)
        "c_fc",      # Feed-forward layer 1 (replaces gate_proj/up_proj)
        "c_proj"     # Feed-forward layer 2 (replaces down_proj) - Note: same name used twice in GPT-2
    ],

    task_type = "CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, config)
print("DialoGPT model successfully configured with LoRA!")

DialoGPT model successfully configured with LoRA!


#once again check memory footprint

In [8]:
print(model.get_memory_footprint()/1024/1024)

207.7002182006836


#Print base model to compare

In [9]:
print(model.get_base_model)

<bound method PeftModel.get_base_model of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=768, out_features=2304, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.6, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A):

In [10]:
print(model.get_memory_footprint()/1e6)

217.789464


#Check for trainable Parameters and its percentage for a mathematical view.

In [11]:
trainable_params, total_params = model.get_nb_trainable_parameters()
percentage = (trainable_params / total_params) * 100

print(f"Trainable Parameters: {trainable_params:,}")
print(f"Total Parameters: {total_params:,}")
print(f"Percentage Trainable: {percentage:.2f}%")

Trainable Parameters: 1,179,648
Total Parameters: 125,619,456
Percentage Trainable: 0.94%


#ETL Process for Dataset Prep stage, Tokenizer load and define chat template if needed.

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer
import torch

# Load the tokenizer for DialoGPT-small
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load financial Q&A dataset - PERFECT for wealth management & investment advisory!
raw_dataset = load_dataset("ChanceFocus/flare-finqa", split="train[:1000]")  # 1000 samples for optimal training

print(f"Financial Q&A dataset loaded: {len(raw_dataset)} samples")
print(f"Sample entry: {raw_dataset[0]}")

def format_financial_advisor_prompt(example):
    """Format financial Q&A data for wealth management advisor training"""
    question = example["query"]  # Fixed: dataset uses 'query' not 'question'
    answer = example["answer"]

    # Create a wealth management advisor conversation format
    prompt = f"<|user|> As my financial advisor, please help me understand: {question} <|bot|> As your financial advisor, let me explain: {answer} This information will help you make informed investment decisions.<|endoftext|>"

    return {"text": prompt}

def tokenize_financial_advisor_function(examples):
    """Tokenize the formatted financial advisor conversations"""
    # Tokenize with proper padding for DialoGPT
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # Consistent tensor sizes
        max_length=320,        # Medium length for detailed financial advice
        return_tensors=None
    )

    # For DialoGPT, labels are same as input_ids but ignore padded tokens
    labels = []
    for input_ids, attention_mask in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        # Copy input_ids for labels
        label = input_ids.copy() if isinstance(input_ids, list) else input_ids[:]
        # Set padded positions to -100 (ignored in loss calculation)
        for i, mask in enumerate(attention_mask):
            if mask == 0:  # Padded token
                label[i] = -100
        labels.append(label)

    tokenized["labels"] = labels

    return tokenized

# Format the financial advisor dataset
print("Formatting financial advisor conversations...")
formatted_dataset = raw_dataset.map(format_financial_advisor_prompt)

# Show a formatted example
print(f"\nFormatted example:\n{formatted_dataset[0]['text'][:250]}...")

# Tokenize the financial advisor dataset
print("Tokenizing financial advisor dataset...")
tokenized_dataset = formatted_dataset.map(
    tokenize_financial_advisor_function,
    batched=True,
    remove_columns=formatted_dataset.column_names,  # Clean up
    desc="Tokenizing financial advisor conversations"
)

# This is your final dataset for training
final_dataset = tokenized_dataset

print(f"\n✅ Financial advisor dataset ready!")
print(f"📊 Total samples: {len(final_dataset)}")
print(f"🔤 Sample token length: {len(final_dataset[0]['input_ids'])}")
print(f"💼 Dataset contains: Financial advisory, investment guidance, wealth management Q&A")
print(f"🎯 Perfect for: Wealth management, investment advisory, private banking, financial planning roles")

Financial Q&A dataset loaded: 1000 samples
Sample entry: {'id': 'finqa0', 'query': 'Please answer the given financial question based on the context.\nContext: interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denomin

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]


Formatted example:
<|user|> As my financial advisor, please help me understand: Please answer the given financial question based on the context.
Context: interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) a...
Tokenizing financial advisor dataset...


Tokenizing financial advisor conversations:   0%|          | 0/1000 [00:00<?, ? examples/s]


✅ Financial advisor dataset ready!
📊 Total samples: 1000
🔤 Sample token length: 320
💼 Dataset contains: Financial advisory, investment guidance, wealth management Q&A
🎯 Perfect for: Wealth management, investment advisory, private banking, financial planning roles


In [14]:
# My SFT Trainer Configuration for DialoGPT-small Financial Q&A Fine-tuning
# No evaluation split needed - using full dataset for training only

# My optimized parameters for LoRA training
min_effective_batch_size = 8  # I kept your preferred batch size
lr = 2e-4  # I kept optimal learning rate for DialoGPT-small
max_seq_length = 320  # I updated to match Q&A tokenization
collator_fn = None  # I'm not using a custom collator since I pre-pad in tokenization
packing = False  # I disabled packing since I'm using fixed-length sequences
steps = 20  # My logging and saving frequency
num_train_epochs = 3  # I kept your preferred 3 epochs
warmup_ratio = 0.05  # I kept optimal warmup for smaller dataset

# My SFT configuration with updated paths and names
sft_config = SFTConfig(
    # I'm saving my model to a new directory for this financial advisor experiment
    output_dir = '/content/drive/MyDrive/finance-models/DialoGPT-Financial-Wealth-Advisor-finetuned',

    # My data processing settings
    packing = packing,
    max_seq_length = max_seq_length,

    # I disabled gradient checkpointing to fix potential errors
    gradient_checkpointing = False,

    # My training batch and precision settings
    per_device_train_batch_size = min_effective_batch_size,
    auto_find_batch_size = True,  # I let the trainer find optimal batch size
    fp16 = True,  # I use fp16 for DialoGPT better compatibility

    # My training schedule
    num_train_epochs = num_train_epochs,
    learning_rate = lr,
    lr_scheduler_type = "linear",  # I use linear scheduler for conversational models
    warmup_ratio = warmup_ratio,
    weight_decay = 0.01,  # I use standard weight decay
    max_grad_norm = 1.0,  # I use standard gradient clipping

    # My logging and monitoring setup
    report_to = 'wandb',  # I'm tracking my experiments with Weights & Biases
    run_name = "DialoGPT-Financial-Wealth-Management-Advisor-LoRA",  # My updated run name

    # My logging directory (updated path)
    logging_dir = '/content/drive/MyDrive/finance-models/DialoGPT-Financial-Wealth-Advisor-finetuned/logs',

    # My checkpoint and logging strategy
    logging_strategy = 'steps',
    save_strategy = 'steps',
    logging_steps = steps,  # I log every 20 steps
    save_steps = steps,     # I save checkpoint every 20 steps
    save_total_limit = 2,   # I keep only the last 2 checkpoints to save space
)

# I create my trainer with the prepared dataset and configuration
trainer = SFTTrainer(
    model = model,                    # My loaded model (should be already loaded)
    train_dataset = final_dataset,    # My prepared dataset from the previous script
    processing_class = tokenizer,     # My tokenizer for text processing
    data_collator = collator_fn,      # My data collator (None for default)
    args = sft_config,               # My training configuration
)

# I start the training process
print("Starting my DialoGPT financial advisor fine-tuning...")
trainer.train()
print("Training completed! My model is saved to:", sft_config.output_dir)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Starting my DialoGPT financial advisor fine-tuning...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchaubey-amit017[0m ([33mhectorlabs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,14.3107
40,10.464
60,7.5353
80,6.1618
100,5.5751
120,5.1884
140,4.9014
160,4.6953
180,4.5112
200,4.5007


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training completed! My model is saved to: /content/drive/MyDrive/finance-models/DialoGPT-Financial-Wealth-Advisor-finetuned


In [15]:
# Step 1: I'm saving my trained model locally first
print("Saving my trained DialoGPT financial advisor model...")
trainer.save_model('/content/dialogpt-advisor-saved')

# Step 2: I load and merge the LoRA adapter with the base model
from peft import AutoPeftModelForCausalLM

print("Loading my PEFT model and merging adapter...")
# I load the saved PEFT model (use the same path as Step 1)
peft_model = AutoPeftModelForCausalLM.from_pretrained('/content/dialogpt-advisor-saved')

# I merge and unload the adapter to get a single model
merged_model = peft_model.merge_and_unload()

# Step 3: I save the merged model with tokenizer
print("Saving my merged model...")
merged_model.save_pretrained('/content/dialogpt-advisor-merged')
tokenizer.save_pretrained('/content/dialogpt-advisor-merged')

# Step 4: I upload my model to Hugging Face Hub
from huggingface_hub import HfApi

print("Uploading my model to Hugging Face Hub...")
api = HfApi()
api.upload_folder(
    folder_path='/content/dialogpt-advisor-merged',
    repo_id="sweatSmile/DialoGPT-Financial-Wealth-Management-Advisor",  # My new repo name
    repo_type="model",
    commit_message="Upload DialoGPT-small fine-tuned on financial Q&A dataset for wealth management and investment advisory with LoRA"
)

print("Model upload completed! 🎉")
print("Model is now available at: https://huggingface.co/sweatSmile/DialoGPT-Financial-Wealth-Management-Advisor")

Saving my trained DialoGPT financial advisor model...
Loading my PEFT model and merging adapter...




Saving my merged model...
Uploading my model to Hugging Face Hub...


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...pt-advisor-merged/model.safetensors:   0%|          |  550kB /  498MB            

Model upload completed! 🎉
Model is now available at: https://huggingface.co/sweatSmile/DialoGPT-Financial-Wealth-Management-Advisor
