In [24]:
!pip install datasets bitsandbytes trl==0.12.1 transformers peft huggingface-hub accelerate safetensors pandas matplotlib numpy==1.26.4



# Install necessary libraries

In [25]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    #AutoPeftModelForCausalLM, # Removed from transformers
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer, SFTConfig
# from trl.trainer.utils import DataCollatorForCompletionOnlyLM
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM, PeftConfig # Added to peft
from huggingface_hub import notebook_login
from trl import SFTTrainer, SFTConfig, setup_chat_format, DataCollatorForCompletionOnlyLM


# Check for bf16 support and set compute dtype


In [26]:
support = torch.cuda.is_bf16_supported(including_emulation=False)
calculate_dtype = torch.bfloat16 if support else torch.float32

In [27]:
print(calculate_dtype)

torch.bfloat16


#bnb config for loading 4 bit model with nf4 quant type
* loading model with quantization config
* device map to cuda
* 4bit true

In [28]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= calculate_dtype, #calculate_dtype can be bf16 or float32- use bf16 if supported
    bnb_4bit_use_double_quant= True
    )
repo = "microsoft/DialoGPT-small"
model = AutoModelForCausalLM.from_pretrained(repo, quantization_config= bnb_config, device_map= "cuda:0")

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

#Check model memory footprint

In [29]:
print(model.get_memory_footprint()/1024/1024)

127.8501205444336


#model config

In [30]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affin

#Prepare model for kbit training
##Use Lora Config


1.   rank [4,8,16,32] - choose one
2.   lora_alpha is a scalling factor which should be 2x the rank of matrix.
3.   dropout range from 0.03 to 0.10 which helps prevent overfit
4.   module - choose module as per requirement


In [31]:
# Prepare model for quantized training
model = prepare_model_for_kbit_training(model)

# DialoGPT/GPT-2 uses different layer names than newer models
config = LoraConfig(
    r = 8,  # rank of LoRA - [4-16]
    bias = "none",  # ["all", "lora_only"] - for train bias term
    lora_alpha = 16,  # scaling factor
    lora_dropout = 0.10,  # prevent overfit - used for regularisation

    # CORRECTED target modules for DialoGPT/GPT-2 architecture
    target_modules = [
        "c_attn",    # Combined Q, K, V projection (replaces q_proj, k_proj, v_proj)
        "c_proj",    # Output projection (replaces o_proj)
        "c_fc",      # Feed-forward layer 1 (replaces gate_proj/up_proj)
        "c_proj"     # Feed-forward layer 2 (replaces down_proj) - Note: same name used twice in GPT-2
    ],

    task_type = "CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, config)
print("DialoGPT model successfully configured with LoRA!")

DialoGPT model successfully configured with LoRA!


#once again check memory footprint

In [32]:
print(model.get_memory_footprint()/1024/1024)

207.7002182006836


#Print base model to compare

In [33]:
print(model.get_base_model)

<bound method PeftModel.get_base_model of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=768, out_features=2304, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A):

In [34]:
print(model.get_memory_footprint()/1e6)

217.789464


#Check for trainable Parameters and its percentage for a mathematical view.

In [35]:
trainable_params, total_params = model.get_nb_trainable_parameters()
percentage = (trainable_params / total_params) * 100

print(f"Trainable Parameters: {trainable_params:,}")
print(f"Total Parameters: {total_params:,}")
print(f"Percentage Trainable: {percentage:.2f}%")

Trainable Parameters: 1,179,648
Total Parameters: 125,619,456
Percentage Trainable: 0.94%


#ETL Process for Dataset Prep stage, Tokenizer load and define chat template if needed.

In [36]:
# Load the tokenizer for DialoGPT-small
tokenizer = AutoTokenizer.from_pretrained(repo)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load financial sentiment dataset - perfect for trading/market analysis roles!
raw_dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", split="train[:1500]")  # 1500 samples for optimal training

print(f"Financial sentiment dataset loaded: {len(raw_dataset)} samples")
print(f"Sample entry: {raw_dataset[0]}")

def format_financial_sentiment_prompt(example):
    """Format financial sentiment data for trading assistant training"""
    text = example["text"]
    sentiment = example["label"]  # 0=negative, 1=neutral, 2=positive

    # Map sentiment labels to readable format
    sentiment_map = {0: "bearish", 1: "neutral", 2: "bullish"}
    sentiment_label = sentiment_map[sentiment]

    # Create a market analysis conversation format
    prompt = f"<|user|> What's the market sentiment for this news: {text} <|bot|> Based on the financial news analysis, the market sentiment appears {sentiment_label}. This suggests {sentiment_label} market conditions for the mentioned assets.<|endoftext|>"

    return {"text": prompt}

def tokenize_financial_sentiment_function(examples):
    """Tokenize the formatted financial sentiment conversations"""
    # Tokenize with proper padding for DialoGPT
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # Consistent tensor sizes
        max_length=256,        # Shorter for sentiment analysis
        return_tensors=None
    )

    # For DialoGPT, labels are same as input_ids but ignore padded tokens
    labels = []
    for input_ids, attention_mask in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        # Copy input_ids for labels
        label = input_ids.copy() if isinstance(input_ids, list) else input_ids[:]
        # Set padded positions to -100 (ignored in loss calculation)
        for i, mask in enumerate(attention_mask):
            if mask == 0:  # Padded token
                label[i] = -100
        labels.append(label)

    tokenized["labels"] = labels

    return tokenized

# Format the financial sentiment dataset
print("Formatting financial sentiment conversations...")
formatted_dataset = raw_dataset.map(format_financial_sentiment_prompt)

# Show a formatted example
print(f"\nFormatted example:\n{formatted_dataset[0]['text'][:200]}...")

# Tokenize the financial sentiment dataset
print("Tokenizing financial sentiment dataset...")
tokenized_dataset = formatted_dataset.map(
    tokenize_financial_sentiment_function,
    batched=True,
    remove_columns=formatted_dataset.column_names,  # Clean up
    desc="Tokenizing financial sentiment conversations"
)

# This is your final dataset for training
final_dataset = tokenized_dataset

print(f"\n✅ Financial sentiment dataset ready!")
print(f"📊 Total samples: {len(final_dataset)}")
print(f"🔤 Sample token length: {len(final_dataset[0]['input_ids'])}")
print(f"💹 Dataset contains: Market sentiment analysis, trading insights, financial news interpretation")

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sent_train.csv: 0.00B [00:00, ?B/s]

sent_valid.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/9543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2388 [00:00<?, ? examples/s]

Financial sentiment dataset loaded: 1500 samples
Sample entry: {'text': '$BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT', 'label': 0}
Formatting financial sentiment conversations...


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]


Formatted example:
<|user|> What's the market sentiment for this news: $BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT <|bot|> Based on the financial news analysis, the market sentiment app...
Tokenizing financial sentiment dataset...


Tokenizing financial sentiment conversations:   0%|          | 0/1500 [00:00<?, ? examples/s]


✅ Financial sentiment dataset ready!
📊 Total samples: 1500
🔤 Sample token length: 256
💹 Dataset contains: Market sentiment analysis, trading insights, financial news interpretation


In [38]:
# My SFT Trainer Configuration for DialoGPT-small Financial Sentiment Fine-tuning
# No evaluation split needed - using full dataset for training only

# My optimized parameters for LoRA training
min_effective_batch_size = 8  # I increased slightly for DialoGPT-small efficiency
lr = 3e-4  # I increased learning rate for smaller model and dataset
max_seq_length = 256  # I reduced to match sentiment analysis tokenization
collator_fn = None  # I'm not using a custom collator since I pre-pad in tokenization
packing = False  # I disabled packing since I'm using fixed-length sequences
steps = 25  # My logging and saving frequency
num_train_epochs = 3  # I kept epochs low to prevent overfitting on 1500 samples
warmup_ratio = 0.03  # I reduced warmup for smaller dataset

# My SFT configuration with updated paths and names
sft_config = SFTConfig(
    # I'm saving my model to a new directory for this sentiment experiment
    output_dir = '/content/drive/MyDrive/finance-models/DialoGPT-Financial-Market-Sentiment-finetuned',

    # My data processing settings
    packing = packing,
    max_seq_length = max_seq_length,

    # I disabled gradient checkpointing to fix potential errors
    gradient_checkpointing = False,

    # My training batch and precision settings
    per_device_train_batch_size = min_effective_batch_size,
    auto_find_batch_size = True,  # I let the trainer find optimal batch size
    fp16 = True,  # I use fp16 for DialoGPT better compatibility

    # My training schedule
    num_train_epochs = num_train_epochs,
    learning_rate = lr,
    lr_scheduler_type = "linear",  # I use linear scheduler for conversational models
    warmup_ratio = warmup_ratio,
    weight_decay = 0.005,  # I reduced weight decay for smaller dataset
    max_grad_norm = 0.5,  # I reduced gradient clipping for stability

    # My logging and monitoring setup
    report_to = 'wandb',  # I'm tracking my experiments with Weights & Biases
    run_name = "DialoGPT-Financial-Market-Sentiment-Trading-LoRA",  # My updated run name

    # My logging directory (updated path)
    logging_dir = '/content/drive/MyDrive/finance-models/DialoGPT-Financial-Market-Sentiment-finetuned/logs',

    # My checkpoint and logging strategy
    logging_strategy = 'steps',
    save_strategy = 'steps',
    logging_steps = steps,  # I log every 25 steps
    save_steps = steps,     # I save checkpoint every 25 steps
    save_total_limit = 2,   # I keep only the last 2 checkpoints to save space
)

# I create my trainer with the prepared dataset and configuration
trainer = SFTTrainer(
    model = model,                    # My loaded model (should be already loaded)
    train_dataset = final_dataset,    # My prepared dataset from the previous script
    processing_class = tokenizer,     # My tokenizer for text processing
    data_collator = collator_fn,      # My data collator (None for default)
    args = sft_config,               # My training configuration
)

# I start the training process
print("Starting my DialoGPT financial sentiment fine-tuning...")
trainer.train()
print("Training completed! My model is saved to:", sft_config.output_dir)

Starting my DialoGPT financial sentiment fine-tuning...


  return fn(*args, **kwargs)


Step,Training Loss
25,3.1546
50,2.5519
75,2.3787
100,2.2897
125,2.1892
150,2.1238
175,1.9944
200,1.9048
225,1.988
250,1.9406


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training completed! My model is saved to: /content/drive/MyDrive/finance-models/DialoGPT-Financial-Market-Sentiment-finetuned


In [39]:
# Step 1: I'm saving my trained model locally first
print("Saving my trained DialoGPT financial sentiment model...")
trainer.save_model('/content/dialogpt-sentiment-saved')

# Step 2: I load and merge the LoRA adapter with the base model
from peft import AutoPeftModelForCausalLM

print("Loading my PEFT model and merging adapter...")
# I load the saved PEFT model (use the same path as Step 1)
peft_model = AutoPeftModelForCausalLM.from_pretrained('/content/dialogpt-sentiment-saved')

# I merge and unload the adapter to get a single model
merged_model = peft_model.merge_and_unload()

# Step 3: I save the merged model with tokenizer
print("Saving my merged model...")
merged_model.save_pretrained('/content/dialogpt-sentiment-merged')
tokenizer.save_pretrained('/content/dialogpt-sentiment-merged')

# Step 4: I upload my model to Hugging Face Hub
from huggingface_hub import HfApi

print("Uploading my model to Hugging Face Hub...")
api = HfApi()
api.upload_folder(
    folder_path='/content/dialogpt-sentiment-merged',
    repo_id="sweatSmile/DialoGPT-Financial-Market-Sentiment-Trading-Assistant",  # My new repo name
    repo_type="model",
    commit_message="Upload DialoGPT-small fine-tuned on financial sentiment dataset for market analysis and trading insights with LoRA"
)

print("Model upload completed! 🎉")
print("Model is now available at: https://huggingface.co/sweatSmile/DialoGPT-Financial-Market-Sentiment-Trading-Assistant")

Saving my trained DialoGPT financial sentiment model...
Loading my PEFT model and merging adapter...




Saving my merged model...
Uploading my model to Hugging Face Hub...


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...-sentiment-merged/model.safetensors:   0%|          |  550kB /  498MB            

Model upload completed! 🎉
Model is now available at: https://huggingface.co/sweatSmile/DialoGPT-Financial-Market-Sentiment-Trading-Assistant
