In [1]:
!pip install datasets bitsandbytes trl==0.12.1 transformers peft huggingface-hub accelerate safetensors pandas matplotlib numpy==1.26.4

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting trl==0.12.1
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading trl-0.12.1-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m101.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m38.

# Install necessary libraries

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    #AutoPeftModelForCausalLM, # Removed from transformers
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer, SFTConfig
# from trl.trainer.utils import DataCollatorForCompletionOnlyLM
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM, PeftConfig # Added to peft
from huggingface_hub import notebook_login
from trl import SFTTrainer, SFTConfig, setup_chat_format, DataCollatorForCompletionOnlyLM


# Check for bf16 support and set compute dtype


In [2]:
support = torch.cuda.is_bf16_supported(including_emulation=False)
calculate_dtype = torch.bfloat16 if support else torch.float32

In [3]:
print(calculate_dtype)

torch.bfloat16


#bnb config for loading 4 bit model with nf4 quant type
* loading model with quantization config
* device map to cuda
* 4bit true

In [11]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= calculate_dtype, #calculate_dtype can be bf16 or float32- use bf16 if supported
    bnb_4bit_use_double_quant= True
    )
repo = "microsoft/DialoGPT-medium"
model = AutoModelForCausalLM.from_pretrained(repo, quantization_config= bnb_config, device_map= "cuda:0")

#Check model memory footprint

In [12]:
print(model.get_memory_footprint()/1024/1024)

268.7715301513672


#model config

In [13]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=1024, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=1024, out_features=4096, bias=True)
          (c_proj): Linear4bit(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, element

#Prepare model for kbit training
##Use Lora Config


1.   rank [4,8,16,32] - choose one
2.   lora_alpha is a scalling factor which should be 2x the rank of matrix.
3.   dropout range from 0.03 to 0.10 which helps prevent overfit
4.   module - choose module as per requirement


In [15]:
# Prepare model for quantized training
model = prepare_model_for_kbit_training(model)

# DialoGPT/GPT-2 uses different layer names than newer models
config = LoraConfig(
    r = 8,  # rank of LoRA - [4-16]
    bias = "none",  # ["all", "lora_only"] - for train bias term
    lora_alpha = 16,  # scaling factor
    lora_dropout = 0.10,  # prevent overfit - used for regularisation

    # CORRECTED target modules for DialoGPT/GPT-2 architecture
    target_modules = [
        "c_attn",    # Combined Q, K, V projection (replaces q_proj, k_proj, v_proj)
        "c_proj",    # Output projection (replaces o_proj)
        "c_fc",      # Feed-forward layer 1 (replaces gate_proj/up_proj)
        "c_proj"     # Feed-forward layer 2 (replaces down_proj) - Note: same name used twice in GPT-2
    ],

    task_type = "CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, config)
print("DialoGPT model successfully configured with LoRA!")

DialoGPT model successfully configured with LoRA!


#once again check memory footprint

In [16]:
print(model.get_memory_footprint()/1024/1024)

381.5430145263672


#Print base model to compare

In [17]:
print(model.get_base_model)

<bound method PeftModel.get_base_model of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 1024)
        (wpe): Embedding(1024, 1024)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPT2Block(
            (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1024, out_features=3072, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embeddin

In [18]:
print(model.get_memory_footprint()/1e6)

400.076848


#Check for trainable Parameters and its percentage for a mathematical view.

In [19]:
trainable_params, total_params = model.get_nb_trainable_parameters()
percentage = (trainable_params / total_params) * 100

print(f"Trainable Parameters: {trainable_params:,}")
print(f"Total Parameters: {total_params:,}")
print(f"Percentage Trainable: {percentage:.2f}%")

Trainable Parameters: 3,145,728
Total Parameters: 357,968,896
Percentage Trainable: 0.88%


#ETL Process for Dataset Prep stage, Tokenizer load and define chat template if needed.

In [20]:
# Load the tokenizer for DialoGPT-medium
tokenizer = AutoTokenizer.from_pretrained(repo)

# Add padding token if it doesn't exist (DialoGPT needs this)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load finance dataset - perfect for UK fintech jobs!
raw_dataset = load_dataset("gbharti/finance-alpaca", split="train[:2000]")  # 2000 samples for faster training

print(f"Finance dataset loaded: {len(raw_dataset)} samples")
print(f"Sample entry: {raw_dataset[0]}")

def format_finance_prompt(example):
    """Format financial data for conversational AI training"""
    instruction = example["instruction"]
    input_text = example["input"] if example["input"] else ""
    output = example["output"]

    # Create a finance-focused conversation format
    if input_text:
        # For questions with context (market data, company info, etc.)
        prompt = f"<|user|> {instruction} Context: {input_text} <|bot|> {output}<|endoftext|>"
    else:
        # For general financial questions
        prompt = f"<|user|> {instruction} <|bot|> {output}<|endoftext|>"

    return {"text": prompt}

def tokenize_finance_function(examples):
    """Tokenize the formatted financial conversations"""
    # Tokenize with proper padding for DialoGPT
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # Consistent tensor sizes
        max_length=512,        # Good balance for financial conversations
        return_tensors=None
    )

    # For DialoGPT, labels are same as input_ids but ignore padded tokens
    labels = []
    for input_ids, attention_mask in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        # Copy input_ids for labels
        label = input_ids.copy() if isinstance(input_ids, list) else input_ids[:]
        # Set padded positions to -100 (ignored in loss calculation)
        for i, mask in enumerate(attention_mask):
            if mask == 0:  # Padded token
                label[i] = -100
        labels.append(label)

    tokenized["labels"] = labels

    return tokenized

# Format the financial dataset
print("Formatting financial conversations...")
formatted_dataset = raw_dataset.map(format_finance_prompt)

# Show a formatted example
print(f"\nFormatted example:\n{formatted_dataset[0]['text'][:200]}...")

# Tokenize the financial dataset
print("Tokenizing financial dataset...")
tokenized_dataset = formatted_dataset.map(
    tokenize_finance_function,
    batched=True,
    remove_columns=formatted_dataset.column_names,  # Clean up
    desc="Tokenizing finance conversations"
)

# This is your final dataset for training
final_dataset = tokenized_dataset

print(f"\n✅ Financial dataset ready!")
print(f"📊 Total samples: {len(final_dataset)}")
print(f"🔤 Sample token length: {len(final_dataset[0]['input_ids'])}")
print(f"💰 Dataset contains: Financial advice, market analysis, investment guidance")

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

README.md:   0%|          | 0.00/831 [00:00<?, ?B/s]

Cleaned_date.json:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/68912 [00:00<?, ? examples/s]

Finance dataset loaded: 2000 samples
Sample entry: {'instruction': 'For a car, what scams can be plotted with 0% financing vs rebate?', 'input': '', 'output': "The car deal makes money 3 ways. If you pay in one lump payment. If the payment is greater than what they paid for the car, plus their expenses, they make a profit. They loan you the money. You make payments over months or years, if the total amount you pay is greater than what they paid for the car, plus their expenses, plus their finance expenses they make money. Of course the money takes years to come in, or they sell your loan to another business to get the money faster but in a smaller amount. You trade in a car and they sell it at a profit. Of course that new transaction could be a lump sum or a loan on the used car... They or course make money if you bring the car back for maintenance, or you buy lots of expensive dealer options. Some dealers wave two deals in front of you: get a 0% interest loan. These tend to be shorter

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]


Formatted example:
<|user|> For a car, what scams can be plotted with 0% financing vs rebate? <|bot|> The car deal makes money 3 ways. If you pay in one lump payment. If the payment is greater than what they paid for th...
Tokenizing financial dataset...


Tokenizing finance conversations:   0%|          | 0/2000 [00:00<?, ? examples/s]


✅ Financial dataset ready!
📊 Total samples: 2000
🔤 Sample token length: 512
💰 Dataset contains: Financial advice, market analysis, investment guidance


In [21]:
# My SFT Trainer Configuration for DialoGPT-medium Finance Fine-tuning
# No evaluation split needed - using full dataset for training only

# My optimized parameters for LoRA training
min_effective_batch_size = 4  # I reduced this for DialoGPT-medium (smaller model)
lr = 2e-4  # I increased learning rate slightly for DialoGPT
max_seq_length = 512  # I updated this to match my tokenization settings
collator_fn = None  # I'm not using a custom collator since I pre-pad in tokenization
packing = False  # I disabled packing since I'm using fixed-length sequences
steps = 20  # My logging and saving frequency
num_train_epochs = 2  # I reduced epochs since DialoGPT converges faster
warmup_ratio = 0.05  # I reduced warmup for smaller model

# My SFT configuration with updated paths and names
sft_config = SFTConfig(
    # I'm saving my model to a new directory for this finance experiment
    output_dir = '/content/drive/MyDrive/finance-models/DialoGPT-Finance-Assistant-finetuned',

    # My data processing settings
    packing = packing,
    max_seq_length = max_seq_length,

    # I disabled gradient checkpointing to fix potential errors
    gradient_checkpointing = False,

    # My training batch and precision settings
    per_device_train_batch_size = min_effective_batch_size,
    auto_find_batch_size = True,  # I let the trainer find optimal batch size
    fp16 = True,  # I use fp16 for DialoGPT better compatibility

    # My training schedule
    num_train_epochs = num_train_epochs,
    learning_rate = lr,
    lr_scheduler_type = "linear",  # I use linear scheduler for conversational models
    warmup_ratio = warmup_ratio,
    weight_decay = 0.01,  # I add weight decay for regularization
    max_grad_norm = 1.0,  # I clip gradients to prevent exploding gradients

    # My logging and monitoring setup
    report_to = 'wandb',  # I'm tracking my experiments with Weights & Biases
    run_name = "DialoGPT-Finance-Assistant-UK-Fintech-LoRA",  # My updated run name

    # My logging directory (updated path)
    logging_dir = '/content/drive/MyDrive/finance-models/DialoGPT-Finance-Assistant-finetuned/logs',

    # My checkpoint and logging strategy
    logging_strategy = 'steps',
    save_strategy = 'steps',
    logging_steps = steps,  # I log every 20 steps
    save_steps = steps,     # I save checkpoint every 20 steps
    save_total_limit = 2,   # I keep only the last 2 checkpoints to save space
)

# I create my trainer with the prepared dataset and configuration
trainer = SFTTrainer(
    model = model,                    # My loaded model (should be already loaded)
    train_dataset = final_dataset,    # My prepared dataset from the previous script
    processing_class = tokenizer,     # My tokenizer for text processing
    data_collator = collator_fn,      # My data collator (None for default)
    args = sft_config,               # My training configuration
)

# I start the training process
print("Starting my DialoGPT finance fine-tuning...")
trainer.train()
print("Training completed! My model is saved to:", sft_config.output_dir)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Starting my DialoGPT finance fine-tuning...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchaubey-amit017[0m ([33mhectorlabs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,8.0722
40,6.8428
60,5.5886
80,4.7558
100,4.3539
120,4.2383
140,4.0638
160,3.9996
180,3.9391
200,3.9183


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a

Training completed! My model is saved to: /content/drive/MyDrive/finance-models/DialoGPT-Finance-Assistant-finetuned


In [22]:
# Step 1: I'm saving my trained model locally first
print("Saving my trained DialoGPT finance model...")
trainer.save_model('/content/dialogpt-finance-saved')

# Step 2: I load and merge the LoRA adapter with the base model
from peft import AutoPeftModelForCausalLM

print("Loading my PEFT model and merging adapter...")
# I load the saved PEFT model (use the same path as Step 1)
peft_model = AutoPeftModelForCausalLM.from_pretrained('/content/dialogpt-finance-saved')

# I merge and unload the adapter to get a single model
merged_model = peft_model.merge_and_unload()

# Step 3: I save the merged model with tokenizer
print("Saving my merged model...")
merged_model.save_pretrained('/content/dialogpt-finance-merged')
tokenizer.save_pretrained('/content/dialogpt-finance-merged')

# Step 4: I upload my model to Hugging Face Hub
from huggingface_hub import HfApi

print("Uploading my model to Hugging Face Hub...")
api = HfApi()
api.upload_folder(
    folder_path='/content/dialogpt-finance-merged',
    repo_id="sweatSmile/DialoGPT-FinTech-Investment-Banking-Assistant",  # My new repo name
    repo_type="model",
    commit_message="Upload DialoGPT-medium fine-tuned on finance dataset for investment banking and fintech applications with LoRA"
)

print("Model upload completed! 🎉")
print("Model is now available at: https://huggingface.co/sweatSmile/DialoGPT-FinTech-Investment-Banking-Assistant")

Saving my trained DialoGPT finance model...
Loading my PEFT model and merging adapter...




Saving my merged model...
Uploading my model to Hugging Face Hub...


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...pt-finance-merged/model.safetensors:   0%|          |  551kB / 1.42GB            

Model upload completed! 🎉
Model is now available at: https://huggingface.co/sweatSmile/DialoGPT-FinTech-Investment-Banking-Assistant
