<a href="https://colab.research.google.com/github/amanzoni1/fine_tuning/blob/main/SFT_Gemma_7B_it.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning Gemma-7B-Instruct for Financial Sentiment Classification

In this notebook we take **Gemma-7B-Instruct** and teach it to become an accurate financial sentiment analyzer. We begin by loading and exploring the Financial PhraseBank headlines with the Hugging Face **Datasets** library, looking at sentence lengths, vocabulary, and how the **Transformers** tokenizer splits the text. Next, we transform each headline into a prompt→JSON completion pair, preparing our data for training.

For the fine-tuning itself, we combine 4-bit quantization (via **bitsandbytes**) with LoRA adapters (via **PEFT** and **TRL**), letting us update only a small set of adapter weights while keeping VRAM usage in check through activation checkpointing.  During training we’ll periodically evaluate on a validation set to monitor generalization, and at the end we’ll save and push our final model to the Hugging Face Hub.  

The goal is a memory-efficient, domain-specialized Gemma-7B that reliably classifies headlines as POSITIVE, NEUTRAL or NEGATIVE, and, along the way, a better understanding of every step of the training process.


In [None]:
import getpass
import os

# Ask for the Hugging Face access token interactively (getpass does not echo the input)
hf_token = getpass.getpass('Enter your HF access token and press enter: ')

# Store the token in the process environment under HF_TOKEN
os.environ.update(HF_TOKEN=hf_token)

print("HF_TOKEN environment variable has been set.")

In [None]:
# Hub identifiers used throughout the notebook: the base checkpoint and the dataset
model_name, dataset_name = "google/gemma-7b-it", "financial_phrasebank"

# Load & Inspect Dataset
After installing the libraries, we load the `sentences_75agree` configuration of Financial PhraseBank (using its `train` split) and preview its schema, size, and a few example headlines per sentiment to understand the label distribution. Then we compute and visualize basic text statistics (lengths, word counts), clean and plot the top tokens (overall and finance-specific), and chart sentiment counts to spot any imbalance.


In [None]:
!pip install --quiet seaborn matplotlib
!pip install -U --quiet torch transformers datasets accelerate bitsandbytes peft trl

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset, load_dataset_builder

In [None]:
# Global look for every figure below: seaborn-flavoured matplotlib style + pastel colours
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="pastel")

In [None]:
# Create the dataset builder and read its metadata (description and feature schema)
ds_builder = load_dataset_builder(
    dataset_name,
    "sentences_75agree",
    trust_remote_code=True,
)
builder_info = ds_builder.info

# Only the first 500 characters of the description are shown
print("Description:\n", builder_info.description[:500], "...\n")
print("Features:", builder_info.features)

In [None]:
# Load every split of the "sentences_75agree" configuration.
# `trust_remote_code=True` matches the builder call used for the metadata preview:
# the dataset is served through a loading script, so the opt-in must be passed here too.
raw = load_dataset(dataset_name, "sentences_75agree", trust_remote_code=True)

print("\nRaw dataset object:\n", raw)

In [None]:
# Load the train split directly as a single Dataset object.
# `trust_remote_code=True` is passed for consistency with the builder call used for
# the metadata preview, which opts in to the dataset's loading script.
dataset = load_dataset(
    dataset_name,
    "sentences_75agree",
    split="train",
    trust_remote_code=True,
)

print("\nDataset object:\n", dataset)
print(f"\nUsing 'train' split with {len(dataset)} examples.")

### Sample Sentences


In [None]:
# Class names are stored on the ClassLabel feature of the "label" column
label_names = dataset.features["label"].names

# Print the first three headlines of every sentiment class
for label_id, label_name in enumerate(label_names):
    print(f"\n{label_name.upper()} examples:")
    # keep only the rows of this class, then take the first three of them
    rows_for_label = dataset.filter(lambda ex: ex["label"] == label_id)
    first_three = rows_for_label.select(range(3))
    for rank, row in enumerate(first_three, start=1):
        print(f"  {rank}. {row['sentence']}")

### Sentiment Distribution



In [None]:
from collections import Counter

# Tally how many headlines carry each sentiment name
label_names = dataset.features["label"].names
counts = Counter(label_names[label_id] for label_id in dataset["label"])
total = sum(counts.values())

# Text summary: absolute count and share of the whole dataset
for name, cnt in counts.items():
    pct = cnt / total * 100
    print(f"{name.capitalize():9s}: {cnt} ({pct:.1f}%)")

# Two panels side by side: counts as bars, shares as a pie
sentiment_names = list(counts.keys())
sentiment_counts = list(counts.values())
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: bar chart (count)
sns.barplot(x=sentiment_names, y=sentiment_counts, ax=ax1)
ax1.set_title("Sentiment Distribution (Count)")
ax1.set_xlabel("Sentiment")
ax1.set_ylabel("Count")

# Right panel: pie chart (percentage)
ax2.pie(sentiment_counts, labels=sentiment_names, autopct="%1.1f%%", startangle=90)
ax2.set_title("Sentiment Distribution (Percentage)")

plt.tight_layout()
plt.show()

### Text Characteristics Analysis

In [None]:
df = pd.DataFrame(dataset)

# Per-headline metrics: characters, whitespace-separated words, mean word length
df["char_count"] = df["sentence"].str.len()
df["word_count"] = df["sentence"].str.split().str.len()
df["avg_word_length"] = df["sentence"].apply(
    lambda s: np.mean([len(w) for w in s.split()])
)

# Sentiment name for every row, computed once and reused for grouping and colouring
sentiment = df["label"].map(lambda i: dataset.features["label"].names[i])

# Dataset-wide averages
print("Overall statistics:")
print(f" • Avg characters   : {df['char_count'].mean():.1f}")
print(f" • Avg words        : {df['word_count'].mean():.1f}")
print(f" • Avg word length  : {df['avg_word_length'].mean():.1f}")

# Mean and standard deviation of each metric, per sentiment
print("\nBy-sentiment averages:")
metric_columns = ("char_count", "word_count", "avg_word_length")
stats = (
    df.groupby(sentiment)
      .agg({column: ["mean", "std"] for column in metric_columns})
      .round(1)
)
print(stats)

# Histograms of the two count metrics and a box plot of the mean word length
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
histogram_specs = [
    (axes[0], "char_count", 20, "Char Count"),
    (axes[1], "word_count", 15, "Word Count"),
]
for hist_ax, column, n_bins, panel_title in histogram_specs:
    sns.histplot(df, x=column, hue=sentiment,
                 kde=False, ax=hist_ax, bins=n_bins).set(title=panel_title)
sns.boxplot(x=sentiment, y="avg_word_length", data=df,
            ax=axes[2]).set(title="Avg. Word Length")
plt.tight_layout()
plt.show()

In [None]:
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Words to ignore: English stop words plus every punctuation character
stopwords = set(ENGLISH_STOP_WORDS).union(string.punctuation)

# Keep lower-cased word tokens that are longer than two characters,
# are not stop words, and contain no digit
word_pattern = re.compile(r'\b\w+\b')
digit_pattern = re.compile(r'\d')
tokens = [
    word
    for sent in df["sentence"].str.lower()
    for word in word_pattern.findall(sent)
    if word not in stopwords
    and digit_pattern.search(word) is None
    and len(word) > 2
]

# The 20 most frequent surviving tokens
freq = Counter(tokens)
top20 = freq.most_common(20)


print("Cleaned vocabulary (no numbers, no stopwords)")

for word, count in top20:
    print(f"{word:12s}: {count}")

# Bar chart of the same 20 tokens
top_words = [entry[0] for entry in top20]
top_counts = [entry[1] for entry in top20]
plt.figure(figsize=(8, 4))
sns.barplot(x=top_words, y=top_counts)
plt.title("Top 20 Tokens (Filtered)")
plt.xticks(rotation=45)
plt.xlabel("Token")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

In [None]:
# Finance vocabulary to look for in the headlines
financial_terms = [
    'revenue', 'profit', 'loss', 'earnings', 'growth', 'decline',
    'stock', 'share', 'dividend', 'quarter', 'market', 'sales',
    'analyst', 'forecast', 'billion', 'million', 'percent', 'business',
    'beat', 'miss', 'guidance', 'outlook', 'performance', 'results'
]

records = []
label_names = dataset.features["label"].names

# One record per (term, sentiment): the number of headlines of that sentiment
# whose lower-cased text contains the term.
# NOTE: this is a plain substring match, so "share" also counts "shares" /
# "shareholders" and "miss" also counts "commission".
for label_id, label_name in enumerate(label_names):
    # all lower-cased sentences for this sentiment
    sub = df.loc[df["label"] == label_id, "sentence"].str.lower()
    term_counts = {term: sub.str.contains(term, regex=False).sum() for term in financial_terms}
    for term, cnt in term_counts.items():
        records.append({"term": term, "sentiment": label_name, "count": cnt})

# Rank terms by their total count across sentiments and keep the top 10.
# Only the numeric "count" column is summed: summing the whole frame would also
# aggregate the string "sentiment" column, which is wasted work and depends on
# how the installed pandas version treats non-numeric columns.
term_df = pd.DataFrame(records)
overall = (
    term_df.groupby("term")["count"]
           .sum()
           .sort_values(ascending=False)
           .head(10)
           .index.tolist()
)
plot_df = term_df[term_df["term"].isin(overall)]

# Grouped bar chart: one bar per sentiment for each of the top terms
plt.figure(figsize=(8, 4))
sns.barplot(
    data=plot_df,
    x="term", y="count", hue="sentiment",
    order=overall
)
plt.title("Top 10 Financial Terms by Sentiment", fontsize=14, fontweight='bold')
plt.xlabel("Financial Term")
plt.ylabel("Count in Headlines")
plt.xticks(rotation=45, ha='right')
plt.legend(title="Sentiment")
plt.tight_layout()
plt.show()

# Tokenizer
We load Gemma’s tokenizer and inspect its vocabulary size, maximum sequence length, and special tokens, then tokenize all headlines (with truncation) to measure token counts and plot overall and per-sentiment distributions.  
Next, we identify the top 20 subword tokens by frequency and display sample headline tokenizations to see how the tokenizer handles punctuation, numbers, and financial terms.  


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the checkpoint; padding is added on the right
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="right",
)

In [None]:
# Tokenizer properties: vocabulary size, maximum length, and special tokens
vocab_size = tokenizer.vocab_size
max_model_length = tokenizer.model_max_length
special_tokens = tokenizer.all_special_tokens

print(f"Vocab size       : {vocab_size}")
print(f"Max model length : {max_model_length}")
print(f"Special tokens   : {special_tokens}")

### Token Length Analysis

In [None]:
# Number of token ids per headline (truncated at 256 tokens)
token_lengths = []
for sentence in df["sentence"]:
    encoded = tokenizer(sentence, truncation=True, max_length=256)
    token_lengths.append(len(encoded["input_ids"]))
df["token_count"] = token_lengths

print("\nOverall token count stats:")
print(df["token_count"].describe().round(1))


# Two panels: overall distribution (left) and per-sentiment overlay (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# a) All headlines, with the mean marked by a dashed red line
mean_length = np.mean(token_lengths)
ax1.hist(token_lengths, bins=20, alpha=0.7, edgecolor='black')
ax1.set_title("Token Count Distribution (All)")
ax1.set_xlabel("Tokens per Headline")
ax1.set_ylabel("Frequency")
ax1.axvline(mean_length, color="red", linestyle="--", label=f"Mean: {mean_length:.1f}")
ax1.legend()

# b) By sentiment: one semi-transparent histogram per label id, drawn in this
#    order (presumably so the larger classes end up behind the smaller ones; verify)
plot_order = [1, 2, 0]
for label_id in plot_order:
    lengths = df.loc[df["label"] == label_id, "token_count"]
    ax2.hist(lengths, bins=15, alpha=0.6, label=label_names[label_id], edgecolor='black')

ax2.set_title("Token Count by Sentiment")
ax2.set_xlabel("Tokens per Headline")
ax2.set_ylabel("Frequency")
ax2.legend()

plt.tight_layout()
plt.show()

### Tokenization Examples

In [None]:
# Special-token ids to exclude. The attribute is read once and stored as a set,
# so it is no longer looked up and scanned linearly for every single token.
special_ids = set(tokenizer.all_special_ids)

# Flatten the token ids of every headline, dropping special tokens
all_ids = [
    tid
    for sentence in df["sentence"]
    for tid in tokenizer(sentence)["input_ids"]
    if tid not in special_ids
]
# Convert the whole id list in one call instead of one call per id
all_tokens = tokenizer.convert_ids_to_tokens(all_ids)

# The 20 most frequent subword tokens
subword_counts = Counter(all_tokens).most_common(20)
subwords, counts = zip(*subword_counts)

print("Top 20 Subwords (Filtered)")

for tok, cnt in subword_counts:
    print(f"{tok:12s}: {cnt}")

plt.figure(figsize=(8, 4))
sns.barplot(x=list(subwords), y=list(counts))
plt.title("Top 20 Subwords by Frequency")
plt.xticks(rotation=45, ha="right")
plt.xlabel("Subword Token")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

In [None]:
print("Sample Sentence Tokenizations")

# Hand-written headlines mixing tickers, percentages, and decimals
sample_sentences = [
    "Apple reports record Q3 revenue growth of 15%",
    "Tesla stock plunges after disappointing earnings",
    "Fed raises rates by 0.25 bps amid inflation concerns"
]

# Show the token strings (not ids) the tokenizer produces for each headline
for sentence in sample_sentences:
    toks = tokenizer.tokenize(sentence)
    n_toks = len(toks)
    print(f"\nSentence: {sentence}")
    print(f"Tokens ({n_toks}): {toks}")

# Formatting Data
We implement a `format_example` function to convert each headline + label into a single prompt→JSON completion, tokenize it with padding/truncation, and generate attention and label masks so the model only trains on the completion.  
Then we test this on a small sample—decoding both full sequences and masked labels—to confirm that only the sentiment field is supervised during fine-tuning.  


In [None]:
# Mapping set-up

# Use the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# integer class id -> upper-cased sentiment name used in the completion
label_names = dataset.features["label"].names
label_map = dict(enumerate(name.upper() for name in label_names))

def format_example(example, max_length=256):
    """Turn one headline/label record into a padded training example.

    The prompt and the JSON completion are concatenated and tokenized once with
    truncation and ``max_length`` padding. ``labels`` copies ``input_ids`` on the
    completion tokens and is ``-100`` everywhere else (prompt and padding), so
    only the completion contributes to the loss.

    Args:
        example: mapping with a ``"sentence"`` string and an integer ``"label"``
            that is a key of the module-level ``label_map``.
        max_length: length every sequence is truncated/padded to (default 256).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each a list
        of ``max_length`` integers.
    """
    # Map label id -> sentiment string
    sentiment_label = label_map[example["label"]]

    # Build prompt and completion
    prompt = (
        'Analyze the sentiment of this financial headline and respond with JSON format.\n\n'
        f'Input: "{example["sentence"]}"\n'
        'Output: {"sentiment": '
    )
    completion = f'"{sentiment_label}"}}'

    # Tokenize the full text once, with truncation + padding
    toks = tokenizer(
        prompt + completion,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors=None,
        add_special_tokens=True
    )
    input_ids = toks["input_ids"]
    attention_mask = toks["attention_mask"]

    # Tokenize the prompt alone to locate where the completion starts
    prompt_ids = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        add_special_tokens=True
    )["input_ids"]

    # The prompt ends with a space, so its stand-alone tokenization is not
    # guaranteed to be a token-prefix of the full sequence: the trailing space
    # may merge with the first completion token. Use the longest common token
    # prefix as the boundary so that a merged token is supervised, not skipped.
    prompt_len = 0
    for prompt_tok, full_tok in zip(prompt_ids, input_ids):
        if prompt_tok != full_tok:
            break
        prompt_len += 1

    # Supervise only completion positions that are real tokens. Padding is
    # detected through the attention mask rather than by comparing ids with
    # pad_token_id, which would also mask a genuine EOS when pad == eos.
    labels = [
        tok if (pos >= prompt_len and mask == 1) else -100
        for pos, (tok, mask) in enumerate(zip(input_ids, attention_mask))
    ]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


### Formatting Check


In [None]:
# Tokenized length stats
def analyze_tokenized_lengths(dataset_sample):
    """Format every example and report how many non-padding tokens each one has.

    Args:
        dataset_sample: iterable of raw examples accepted by ``format_example``.

    Returns:
        List with the number of ``input_ids`` that differ from the tokenizer's
        pad token id, one entry per example. The sample size, mean and maximum
        are printed as a side effect.
    """
    pad_id = tokenizer.pad_token_id
    lengths = [
        sum(1 for tok in format_example(example)["input_ids"] if tok != pad_id)
        for example in dataset_sample
    ]
    print(f"Sample size: {len(lengths)}")
    print(f"Mean tokens (non-pad): {np.mean(lengths):.1f}")
    print(f"Max tokens  (non-pad): {np.max(lengths)}")
    return lengths

In [None]:
# Length statistics over the first 100 examples of the dataset
sample_ds = dataset.select(range(100))
sample_lengths = analyze_tokenized_lengths(sample_ds)

# Format one hand-written positive headline as a smoke test
pos_idx = label_names.index("positive")
test_example = dict(
    sentence="Apple stock surges 15% after earnings beat",
    label=pos_idx,
)
formatted = format_example(test_example)

In [None]:
def inspect_example(example):
    """Print the decoded full sequence and the decoded supervised tokens of one example.

    The example is passed through ``format_example``; the full ``input_ids`` are
    decoded with special tokens kept, and the ids at positions whose label is
    not ``-100`` are decoded with special tokens skipped.
    """
    encoded = format_example(example)
    ids, labels = encoded["input_ids"], encoded["labels"]

    # Everything the model sees (prompt + completion + padding)
    full_text = tokenizer.decode(ids, skip_special_tokens=False)

    # Only the positions that contribute to the loss
    supervised_ids = [tok for tok, lab in zip(ids, labels) if lab != -100]
    supervised_text = tokenizer.decode(supervised_ids, skip_special_tokens=True)

    print("── Full Prompt+Completion ──\n", full_text, "\n")
    print("── Supervised Portion ──\n", supervised_text)

In [None]:
# Run the inspector on the first record of the real dataset
print("\n--- Debug Real Dataset Example ---")
first_record = dataset[0]
inspect_example(first_record)

# Split Dataset
We split the original train split into 80/10/10 train/validation/test, then check each subset’s class distribution to ensure our hold-out sets are representative.


In [None]:
# Remember the raw column names; they are dropped once the examples are formatted
orig_cols = dataset.column_names

# First cut: 80% train, 20% held out (fixed seed for reproducibility)
splits = dataset.train_test_split(test_size=0.20, seed=42)
train_ds = splits["train"]
temp_ds = splits["test"]

# Second cut: halve the held-out part into validation and test (10% / 10% overall)
val_test_splits = temp_ds.train_test_split(test_size=0.5, seed=42)
val_ds = val_test_splits["train"]
test_ds = val_test_splits["test"]

In [None]:
# Report the number of examples in each split
n_train, n_val, n_test = (len(split) for split in (train_ds, val_ds, test_ds))
print(f"Train size: {n_train}")
print(f"Val size: {n_val}")
print(f"Test size: {n_test}")

In [None]:
# Per-split class balance: count and percentage of every label id
named_splits = {"Train": train_ds, "Validation": val_ds, "Test": test_ds}

for split_name, split_ds in named_splits.items():
    counts = Counter(split_ds["label"])
    total = sum(counts.values())

    print(f"\n{split_name} class distribution:")
    for idx, cnt in counts.items():
        name = label_names[idx]
        share = cnt / total * 100
        print(f"  {name:8s}: {cnt} ({share:.1f}%)")

# Mapping
We apply `format_example` to every example in each split—dropping the old columns—so each record now contains `input_ids`, `attention_mask`, and `labels`, ready for the trainer.


In [None]:
# Format every split with format_example and drop the raw columns.
# The split names are rebound in place, so re-create the splits before
# running this cell a second time.
train_ds, val_ds, test_ds = (
    split.map(format_example, remove_columns=orig_cols)
    for split in (train_ds, val_ds, test_ds)
)

In [None]:
# Decode two formatted training records to verify the mapping

for i in (50, 51):
    print("\n--- Mapped Example ---")
    record = train_ds[i]
    inp_ids, lbls = record["input_ids"], record["labels"]

    # full prompt+completion (padding included)
    full_text = tokenizer.decode(inp_ids, skip_special_tokens=False)
    print("Full text:\n", full_text)

    # ids at positions whose label is not -100, i.e. the supervised part
    supervised = [tok for tok, lab in zip(inp_ids, lbls) if lab != -100]
    supervised_text = tokenizer.decode(supervised, skip_special_tokens=True)
    print("Supervised part:", supervised_text)

# Train
Finally, we quantize Gemma-7B to 4-bit, inject LoRA adapters, configure our `SFTTrainer` with evaluation and checkpointing steps, and launch the fine-tuning loop to produce our domain-specialized sentiment classifier.  

In [None]:
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig

In [None]:
# 4-bit quantization settings: NF4 quantization type, double quantization on,
# float16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
)

In [None]:
# Load the base checkpoint with the quantization config applied.
# The attention implementation is set to "eager" ("sdpa" is the noted alternative).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype="auto",
    attn_implementation="eager",  # "sdpa"
)

In [None]:
# Inject LoRA adapters
# Adapter hyper-parameters: r=16 with lora_alpha=32, no dropout, no bias terms.
peft_cfg = LoraConfig(
    r = 16,
    lora_alpha = 32,
    # module names to adapt (judging by the names: attention and MLP projections — confirm against the model)
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_dropout = 0,
    bias = "none",
    task_type = "CAUSAL_LM",
)

# Wrap the loaded model with the adapter config. `model` is rebound here,
# so this cell is not meant to be executed twice on the same object.
model = get_peft_model(model, peft_cfg)

# Training-time memory settings applied to the wrapped model, in this order:
# gradient checkpointing on, inputs set to require grads, generation cache off.
# NOTE(review): presumably the cache is disabled because it is not used while
# training with checkpointing — confirm.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.config.use_cache = False

In [None]:
# Training arguments, grouped by concern
sft_config = SFTConfig(
    # reproducibility
    seed=42,
    # schedule and optimisation
    num_train_epochs=3.0,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.03,
    weight_decay=0.01,
    fp16=True,
    # data loading
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    # evaluation every 50 steps
    do_eval=True,
    eval_strategy="steps",
    eval_steps=50,
    per_device_eval_batch_size=8,
    # logging
    logging_steps=10,
    report_to="none",
    # checkpoints once per epoch, pushed to the Hub
    output_dir="gemma-sentiment-qlora",
    overwrite_output_dir=True,
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id="AManzoni/gemma-sentiment-qlora",
)

In [None]:
# Build the trainer from the adapted model, the config, and the formatted splits
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
)

In [None]:
# GPU memory baseline before training (values in GB, rounded to 3 decimals)
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Note: attn_implementation="sdpa" is an alternative to the "eager" setting used when the model is loaded above.

In [None]:
# Launch the fine-tuning run with the trainer configured above
trainer.train()

In [None]:
# Final memory and time stats (memory in GB, rounded to 3 decimals)
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# The trainer object has no `metrics` attribute. The end-of-training summary
# (which includes `train_runtime`) is recorded in `trainer.state.log_history`,
# so take the most recent entry that carries it.
train_runtime = next(
    (
        entry["train_runtime"]
        for entry in reversed(trainer.state.log_history)
        if "train_runtime" in entry
    ),
    None,
)

if train_runtime is not None:
    print(f"{train_runtime} seconds used for training.")
    print(f"{round(train_runtime / 60, 2)} minutes used for training.")
else:
    print("Training runtime not found in trainer.state.log_history.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
# test, inference, saving methods

In [None]:


# `math` is needed for the perplexity below and is not imported anywhere earlier
import math

# Built-in evaluation on the validation set
metrics = trainer.evaluate()
print("\n=== Validation Metrics ===")
print(f"  * eval_loss      = {metrics['eval_loss']:.4f}")
# perplexity is the exponential of the evaluation loss
print(f"  * perplexity     = {math.exp(metrics['eval_loss']):.2f}")

# Quick accuracy check by greedy-decoding the sentiment for each prompt
def greedy_sentiment_accuracy(ds, n=200, max_new_tokens=8):
    """Estimate classification accuracy by greedy decoding from the prompt only.

    For each example the prompt is recovered as the ``input_ids`` that precede
    the first supervised position (first ``labels`` entry different from
    ``-100``), so neither the reference completion nor the padding is fed to
    the model. The reference label is recovered by decoding the supervised
    tokens, and both reference and prediction are matched against
    ``label_names`` case-insensitively.

    Args:
        ds: formatted split whose rows carry ``input_ids`` and ``labels``.
        n: maximum number of leading examples to evaluate.
        max_new_tokens: generation budget per example; it has to cover the
            quoted label, which may span several subword tokens.

    Returns:
        Fraction of evaluated examples whose generated label equals the
        reference label; ``0.0`` when no example could be evaluated.
    """
    from tqdm.auto import tqdm

    def _find_label(text):
        """Return the label name occurring earliest in ``text`` (case-insensitive), or None."""
        upper = text.upper()
        hits = [(upper.find(name.upper()), name) for name in label_names if name.upper() in upper]
        return min(hits)[1] if hits else None

    model = trainer.model
    model.eval()
    # place inputs on the device of the model's first parameter
    device = next(model.parameters()).device

    correct = 0
    total = 0
    for ex in tqdm(ds.select(range(min(len(ds), n)))):
        input_ids = [int(tok) for tok in ex["input_ids"]]
        labels = [int(lab) for lab in ex["labels"]]

        supervised_positions = [pos for pos, lab in enumerate(labels) if lab != -100]
        if not supervised_positions:
            # nothing was supervised, so there is no reference to compare with
            continue

        # Reference label: decode the supervised tokens and look for a label name
        reference_text = tokenizer.decode(
            [labels[pos] for pos in supervised_positions], skip_special_tokens=True
        )
        true = _find_label(reference_text)
        if true is None:
            continue

        # Prompt = everything before the first supervised token
        prompt_ids = input_ids[:supervised_positions[0]]
        prompt_tensor = torch.tensor([prompt_ids], device=device)

        with torch.no_grad():
            out = model.generate(
                input_ids=prompt_tensor,
                attention_mask=torch.ones_like(prompt_tensor),
                max_new_tokens=max_new_tokens,
                num_beams=1,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
            )

        # Keep only the newly generated tokens (batch of one, no left padding)
        gen = tokenizer.decode(out[0][len(prompt_ids):], skip_special_tokens=True)
        pred = _find_label(gen)

        if pred == true:
            correct += 1
        total += 1

    return correct / total if total else 0.0

# Score the first 200 validation examples and report the result as a percentage
acc = greedy_sentiment_accuracy(
    val_ds,
    n=200,
)
print(f"\nGreedy decoding accuracy on 200 val examples: {acc*100:.1f}%")