# Fine-tune `google/gemma-3-4b-it` for Financial Sentiment Analysis
---
This notebook fine-tunes **Gemma-3-4B-IT** on a financial sentiment analysis dataset using Amazon SageMaker.

**Prerequisites:**
- Run notebook 1 (`01_data_analysis.ipynb`) first to generate `sentiment_training_data.csv`

**What this notebook does:**
1. Load the sentiment training data from notebook 1
2. Split into 90% training / 10% test sets
3. Convert to messages format (JSONL)
4. Upload training data to S3
5. Launch a SageMaker training job using QLoRA (PEFT)

---

**Model:** [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it)  
**Training Method:** PEFT QLoRA (4-bit quantization with LoRA adapters)  
**Instance:** ml.g5.2xlarge


## 1. Setup and Dependencies


In [None]:
import os
import json
import pandas as pd
import boto3
import sagemaker
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [None]:
# Resolve the region from the default boto3 session and bind the SageMaker
# session to that region explicitly.
region = boto3.Session().region_name

sess = sagemaker.Session(boto3.Session(region_name=region))

# Leave as None to fall back to the session's default bucket, or set a bucket
# name here. NOTE(review): later cells call sess.default_bucket() directly
# rather than reading this variable.
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    # Not running under a SageMaker execution role (e.g. a local notebook).
    # Allow the role ARN to be supplied through the environment so the
    # notebook is usable outside the original account; the literal ARN is kept
    # as the default so existing behaviour is unchanged when the variable is
    # unset or empty.
    role = (
        os.environ.get("SAGEMAKER_EXECUTION_ROLE_ARN")
        or "arn:aws:iam::889772146711:role/SageMakerExecutionRole"
    )


In [None]:
# Echo the resolved role, bucket and region so a misconfigured session is
# noticed before any data is uploaded.
print("sagemaker role arn:", role)
print("sagemaker bucket:", sess.default_bucket())
print("sagemaker session region:", sess.boto_region_name)


## 2. Data Preparation

Load the sentiment training data from notebook 1, split into train/test, and convert to messages format.


In [None]:
# Local scratch directory (under the current working directory) that holds
# the JSONL files produced by this notebook. Safe to re-run: an existing
# directory is left untouched.
_cache_dir_name = "tmp_cache_local_dataset"
dataset_parent_path = os.path.join(os.getcwd(), _cache_dir_name)
os.makedirs(dataset_parent_path, exist_ok=True)


In [None]:
# Read the sentiment CSV produced by notebook 1 from the current working
# directory.
csv_path = os.path.join(os.getcwd(), "sentiment_training_data.csv")
df = pd.read_csv(csv_path)

# Quick sanity report: row count, column names, and how many rows carry each
# value of the `assistant` (label) column.
n_samples = len(df)
column_names = list(df.columns)
print(f"Loaded {n_samples} samples from sentiment_training_data.csv")
print(f"\nColumns: {column_names}")
print("\nLabel distribution:")
label_counts = df["assistant"].value_counts()
print(label_counts)


In [None]:
# Hold out 10% of the rows as a test set. The split is stratified on the
# `assistant` column so both parts keep the same label proportions, and the
# fixed seed makes it repeatable.
# Per the original note, sft.py splits the training part again (90/10) into
# train/eval internally.
split_options = dict(
    test_size=0.10,
    random_state=42,
    stratify=df["assistant"],
)
train_df, test_df = train_test_split(df, **split_options)

n_train = len(train_df)
n_test = len(test_df)
print(f"Training samples: {n_train} (90%)")
print(f"Test samples: {n_test} (10%)")
print("\nTraining label distribution:")
print(train_df["assistant"].value_counts())


### Convert to Messages Format

The SFT training script expects data in the `messages` format:
```json
{
  "messages": [
    { "role": "system", "content": "..." },
    { "role": "user", "content": "..." },
    { "role": "assistant", "content": "..." }
  ]
}
```


In [None]:
def convert_to_messages(row):
    """Build a chat-style record from one data row.

    Args:
        row: Mapping with "system", "user" and "assistant" entries.

    Returns:
        A dict with a single "messages" key holding three
        ``{"role": ..., "content": ...}`` dicts in system, user, assistant
        order.

    Raises:
        KeyError: If one of the three entries is missing from ``row``
            (when ``row`` is a plain dict).
    """
    speakers = ("system", "user", "assistant")
    turns = [{"role": speaker, "content": row[speaker]} for speaker in speakers]
    return {"messages": turns}

# Wrap each split in a Dataset, then map every row through
# convert_to_messages. All pre-existing columns are dropped, so the result
# carries only the `messages` column returned by the mapper.
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.map(
    convert_to_messages,
    remove_columns=train_dataset.column_names,
)

test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(
    convert_to_messages,
    remove_columns=test_dataset.column_names,
)

n_train_examples = len(train_dataset)
n_test_examples = len(test_dataset)
print(f"Train dataset: {n_train_examples} samples")
print(f"Test dataset: {n_test_examples} samples")


In [None]:
# Show the first converted training record, pretty-printed as JSON, to
# eyeball the messages structure.
print("Sample training example:")
first_example = train_dataset[0]
print(json.dumps(first_example, indent=2))


In [None]:
# Write both splits as JSON Lines files inside the local dataset directory.
train_filename = os.path.join(dataset_parent_path, "train_data.jsonl")
test_filename = os.path.join(dataset_parent_path, "test_data.jsonl")

for split_dataset, target_path in (
    (train_dataset, train_filename),
    (test_dataset, test_filename),
):
    split_dataset.to_json(target_path, lines=True)

print(f"Saved training data to: {train_filename}")
print(f"Saved test data to: {test_filename}")


## 3. Upload Training Data to S3


In [None]:
from sagemaker.s3 import S3Uploader


In [None]:
# Single source of truth for the dataset location: the existence check
# (`prefix`) and the upload target (`data_s3_uri`) are both derived from the
# same bucket and key prefix, so they cannot drift apart.
bucket = sess.default_bucket()
dataset_key_prefix = "gemma-sentiment-finetune/dataset"
prefix = f"{dataset_key_prefix}/"
data_s3_uri = f"s3://{bucket}/{dataset_key_prefix}"

# Reuse the SageMaker session's boto session so the S3 client shares the
# region and credentials that resolved `bucket`.
s3_client = sess.boto_session.client("s3")

# Refuse to upload on top of existing objects. MaxKeys=1 keeps the probe
# cheap, so the error message lists at most one existing key.
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
if response.get("KeyCount", 0) > 0:
    existing_files = [obj["Key"] for obj in response.get("Contents", [])]
    raise FileExistsError(
        f"S3 path already contains data!\n"
        f"Location: s3://{bucket}/{prefix}\n"
        f"Found: {existing_files}\n\n"
        f"To overwrite, manually delete the existing data first:\n"
        f"  aws s3 rm s3://{bucket}/{prefix} --recursive"
    )

# Only the training split is uploaded; the test split stays on local disk.
uploaded_s3_uri = S3Uploader.upload(
    local_path=train_filename,
    desired_s3_uri=data_s3_uri,
    sagemaker_session=sess,
)
print(f"Uploaded training data to: {uploaded_s3_uri}")


## 4. Configure the Training Job


In [None]:
from sagemaker.modules.configs import (
    CheckpointConfig,
    Compute,
    InputData,
    OutputDataConfig,
    SourceCode,
    StoppingCondition,
)
from sagemaker.modules.train import ModelTrainer
from getpass import getpass


In [None]:
# Base model to fine-tune.
MODEL_ID = "google/gemma-3-4b-it"

# Gemma is a gated model, so a HuggingFace token is required; getpass keeps
# it out of the notebook output.
hf_token = getpass("Enter your HuggingFace token: ")

# Metrics destination. NOTE(review): not referenced by the cells shown here;
# confirm whether the training recipe reads it.
reports_to = "tensorboard"

# Derive the job name from the model id: "/" becomes "--" and "." becomes "-".
_job_name_substitutions = str.maketrans({"/": "--", ".": "-"})
job_name = MODEL_ID.translate(_job_name_substitutions)
print(f"job_name: {job_name}")


In [None]:
# Training environment variables
training_env = {
    "HF_TOKEN": hf_token,
    "FI_EFA_USE_DEVICE_RDMA": "1",
    "NCCL_DEBUG": "INFO",
    "NCCL_SOCKET_IFNAME": "eth0",
    "FI_PROVIDER": "efa",
    "NCCL_PROTO": "simple",
    "NCCL_NET_GDR_LEVEL": "5"
}


In [None]:
%%writefile sagemaker_code/requirements.txt
# Python dependencies for the training code; this cell writes them to
# sagemaker_code/requirements.txt.
# NOTE(review): entries without a version specifier (hf_xet, mlflow, triton,
# py7zr, nvidia-ml-py, wandb, poetry, yq, psutil, pyrsmi) resolve to whatever
# is latest at install time.
transformers==4.55.0
peft==0.17.0
accelerate==1.10.0
bitsandbytes==0.46.1
datasets==4.0.0
deepspeed==0.16.4
evaluate==0.4.5
hf-transfer==0.1.8
hf_xet
liger-kernel==0.6.1
lm-eval[api]==0.4.9
kernels>=0.9.0
mlflow
safetensors>=0.6.2
sagemaker==2.251.1
sagemaker-mlflow==0.1.0
sentencepiece==0.2.0
scikit-learn==1.7.1
tokenizers>=0.21.4
triton
trl==0.21.0
py7zr
nvidia-ml-py
wandb
# NOTE(review): installed from the moving `main` branch, so installs are not
# reproducible; consider pinning a commit or tag.
git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
vllm==0.10.1
poetry
yq
psutil
# NOTE(review): nvidia-ml-py is already listed earlier in this file; this
# repeat is redundant.
nvidia-ml-py
pyrsmi


### Training Configuration


In [None]:
# Training arguments for PEFT QLoRA
args = [
    "--config",
    "hf_recipes/google/gemma-3-4b-it--vanilla-peft-qlora.yaml",
]

# Instance configuration
training_instance_type = "ml.g5.2xlarge"
training_instance_count = 1

print(f"Training instance: {training_instance_type} x {training_instance_count}")


In [None]:
# Get the PyTorch training image
pytorch_image_uri = sagemaker.image_uris.retrieve(
    framework="pytorch",
    region=sess.boto_session.region_name,
    version="2.7.1",
    instance_type=training_instance_type,
    image_scope="training",
)
print(f"Using image: {pytorch_image_uri}")


In [None]:
# Configure the ModelTrainer
source_code = SourceCode(
    source_dir="./sagemaker_code",
    command=f"bash sm_accelerate_train.sh {' '.join(args)}",
)

compute_configs = Compute(
    instance_type=training_instance_type,
    instance_count=training_instance_count,
    keep_alive_period_in_seconds=1800,
    volume_size_in_gb=300
)

base_job_name = f"{job_name}-sentiment-finetune"
output_path = f"s3://{sess.default_bucket()}/{base_job_name}"

model_trainer = ModelTrainer(
    training_image=pytorch_image_uri,
    source_code=source_code,
    base_job_name=base_job_name,
    compute=compute_configs,
    stopping_condition=StoppingCondition(max_runtime_in_seconds=18000),
    output_data_config=OutputDataConfig(
        s3_output_path=output_path,
    ),
    checkpoint_config=CheckpointConfig(
        s3_uri=os.path.join(
            output_path,
            "sentiment-analysis",
            job_name,
            "checkpoints"
        ), 
        local_path="/opt/ml/checkpoints"
    ),
    role=role,
    environment=training_env
)

print(f"base_job_name: {base_job_name}")
print(f"output_path: {output_path}")


## 5. Launch Training Job


In [None]:
# Launch the training job
model_trainer.train(
    input_data_config=[
        InputData(
            channel_name="training",
            data_source=uploaded_s3_uri,  
        )
    ], 
    wait=False  # Set to True to wait for completion
)

print("\nTraining job launched!")


## Next Steps

Once the training job completes:

1. The fine-tuned model will be saved to `{output_path}/model/`
2. Run notebook 3 to deploy the model and run evaluations against the test set (`test_data.jsonl`)

**Test data location:** `tmp_cache_local_dataset/test_data.jsonl` (the 10% held-out split used for evaluation, e.g. 200 samples when the input CSV has 2,000 rows)
