This notebook was prepared and run in Kaggle notebooks using a P100 GPU accelerator with 15GB RAM
- https://www.kaggle.com/code/alfpercar/finetunepeft-lora/edit

In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m106.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.1/342.1 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m4.3 MB/s[0m eta [36

In [2]:
import os
# disable Weights and Biases
os.environ['WANDB_DISABLED']="true"

In [3]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [4]:

from huggingface_hub import interpreter_login

interpreter_login()



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



Enter your token (input will not be visible):  ········
Add token as git credential? (Y/n)  n


In [5]:
from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

print_gpu_utilization()

GPU memory occupied: 114 MB.


# Load Financial News DataSet

In [6]:
import json
import pandas as pd
# Load original dataset
JSON_PATH = '/kaggle/input/financial-news-with-ticker-level-sentiment/polygon_news_sample.json'

with open(JSON_PATH, 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data)

## Extract Relevant Fields

In [None]:
# Extract relevant fields: description (input), tickers, and sentiment (label)
def process_data(entry):
    return {
        "text": entry["description"],  # Input text
        "tickers": entry["tickers"],  # Stock tickers mentioned
        "sentiment": entry["insights"][0]["sentiment"] if entry["insights"] else "neutral",  # Use first insight sentiment
    }

# Apply the function to each row
processed_data = [process_data(entry) for entry in data]

# Convert to DataFrame
df = pd.DataFrame(processed_data)

# Display the processed data
print(df.head())

## Convert to HF DataSet

In [None]:
from datasets import Dataset

# Convert Pandas DataFrame to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df)

## DS Split

In [None]:
from sklearn.model_selection import train_test_split

# Define split ratio (80% train, 20% eval)
train_size = 0.8  

# Perform train-test split
train_data, eval_data = train_test_split(hf_dataset, test_size=(1 - train_size), random_state=42)

# Convert them back into Hugging Face datasets
train_dataset = Dataset.from_dict(train_data)
eval_dataset = Dataset.from_dict(eval_data)

# Print dataset sizes
print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")


# Load 4-bit Quantized Model

In [7]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
device_map = {"": 0}


In [8]:
model_name='microsoft/phi-2'
original_model = AutoModelForCausalLM.from_pretrained(model_name, 
                                                      device_map=device_map,
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      use_auth_token=True)





config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
model2_name = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"

tokenizer = AutoTokenizer.from_pretrained(model2_name)
model2 = AutoModelForCausalLM.from_pretrained(
        model2_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

In [9]:
print_gpu_utilization()

GPU memory occupied: 2334 MB.


# Setup the PEFT/LoRA model for Fine-Tuning¶

In [10]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))



trainable model parameters: 262364160
all model parameters: 1521392640
percentage of trainable model parameters: 17.24%


In [11]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

config = LoraConfig(
    r=32, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
original_model.gradient_checkpointing_enable()

# 2 - Using the prepare_model_for_kbit_training method from PEFT
original_model = prepare_model_for_kbit_training(original_model)

peft_model = get_peft_model(original_model, config)

In [12]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 20971520
all model parameters: 1542364160
percentage of trainable model parameters: 1.36%


## Preparing Prompt/Completion DataSet

In [22]:
from datasets import Dataset

# Function to format dataset for text generation
def format_example(row):
    # Extract tickers and sentiment
    tickers = row["tickers"]
    sentiment = row["insights"][0]["sentiment"] if row["insights"] else "neutral"
    
    # Create structured output
    json_output = json.dumps({"tickers": tickers, "sentiment": sentiment})

    return {
        "input_text": row["description"],  # Use news description as input
        "output_text": json_output  # Expected JSON output
    }

# Apply formatting to the DataFrame
formatted_data = df.apply(format_example, axis=1)

# Convert to Hugging Face dataset
hf_dataset = Dataset.from_pandas(pd.DataFrame(formatted_data.tolist()))

# Split into train/test sets
hf_dataset = hf_dataset.train_test_split(test_size=0.2)

train_dataset = hf_dataset["train"]
eval_dataset = hf_dataset["test"]

# Display sample
print(train_dataset[0])


{'input_text': "CNBC's 'Final Trades' featured recommendations on Home Depot, Wynn Resorts, Charles Schwab, and JPMorgan Chase. Analysts discussed positive outlooks for these companies, citing factors like potential dividend increases and settlement of lawsuits.", 'output_text': '{"tickers": ["HD", "WYNN", "SCHW", "SCHWpD", "SCHWpJ", "AMJB", "JPM", "JPMpC", "JPMpD", "JPMpJ", "JPMpK", "JPMpL", "JPMpM"], "sentiment": "positive"}'}


## Tokenize DataSet

In [23]:
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

# If tokenizer doesn't have a pad_token, set pad_token to eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Function to tokenize the dataset
def tokenize_function(examples):
    # Tokenize inputs and outputs
    model_inputs = tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=512)
    labels = tokenizer(examples['output_text'], padding='max_length', truncation=True, max_length=512)
    
    # Ensure that both input and output sequences are the same length
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Apply the tokenization to train and eval datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
train_dataset = train_dataset.remove_columns(['input_text', 'output_text'])
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.remove_columns(['input_text', 'output_text'])

Map:   0%|          | 0/4438 [00:00<?, ? examples/s]

Map:   0%|          | 0/1110 [00:00<?, ? examples/s]

In [24]:
# Check lengths of inputs and outputs in the dataset
for example in train_dataset:
    input_length = len(example['input_ids'])
    label_length = len(example['labels'])
    print(f"Input length: {input_length}, Label length: {label_length}")
    break


Input length: 512, Label length: 512


In [25]:
print_gpu_utilization()

GPU memory occupied: 3938 MB.


# Fine-Tune PEFT Model

In [28]:
# Make sure the model is in training mode
#peft_model.train()

# Trainer setup
training_args = TrainingArguments(
    output_dir="./peft-ticker_sentiment_NLP",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    #gradient_accumulation_steps=16,  # Accumulate gradients over 16 small batches
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=50,
    save_steps=100,
    eval_steps=100,
    learning_rate=2e-4,
    report_to="none",
    overwrite_output_dir=True,
    remove_unused_columns=False,  # This is important to ensure the columns remain
    fp16=True,  # Enable mixed precision
)

# Data collator for seq2seq tasks
data_collator = DataCollatorForSeq2Seq(tokenizer)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
100,0.2486,0.185904
200,0.1709,0.170815
300,0.1678,0.179524
400,0.1447,0.151411
500,0.1411,0.149035
600,0.1696,0.148018
700,0.1369,0.138719
800,0.1498,0.140757
900,0.1232,0.133865
1000,0.1564,0.133698


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


KeyboardInterrupt: 

# Evaluate

In [31]:
test_dataset = hf_dataset["test"]
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1110 [00:00<?, ? examples/s]

In [52]:
import json
import torch
from sklearn.metrics import accuracy_score

# Ensure model is in eval mode & moved to CUDA
peft_model.eval()
peft_model.to("cuda")

# Function to predict structured output
def extract_ticker_sentiment(text):
    prompt = f"Extract the tickers and sentiment from this news:\n{text}\nOutput JSON: "

    # Tokenize & move to CUDA
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

    # Generate output
    output = peft_model.generate(input_ids, max_length=256)
    
    # Decode & parse JSON
    pred_text = tokenizer.decode(output[0], skip_special_tokens=True)
    try:
        return json.loads(pred_text)  # Ensure valid JSON
    except json.JSONDecodeError:
        return {"tickers": [], "sentiment": "neutral"}  # Default safe output

# Initialize evaluation lists
true_tickers = []
true_sentiments = [] 
predicted_tickers = []
predicted_sentiments = []

# Evaluate on test set
for item in test_dataset:
    # Load ground truth
    true_data = json.loads(item["output_text"])
    true_tickers.append(true_data["tickers"])
    true_sentiments.append(true_data["sentiment"])

    # Model prediction
    pred = extract_ticker_sentiment(item["input_text"])
    predicted_tickers.append(pred["tickers"])
    predicted_sentiments.append(pred["sentiment"])

# Compute Sentiment Accuracy
sentiment_accuracy = accuracy_score(true_sentiments, predicted_sentiments)
print(f"🎯 Sentiment Accuracy: {sentiment_accuracy:.4f}")

# Compute Ticker Extraction Accuracy (Exact Match)
exact_ticker_match = sum(1 for true, pred in zip(true_tickers, predicted_tickers) if set(true) == set(pred)) / len(true_tickers)
print(f"📈 Ticker Extraction Accuracy: {exact_ticker_match:.4f}")


🎯 Sentiment Accuracy: 0.8405
📈 Ticker Extraction Accuracy: 0.95
