# PEFT Fine-tuning Meta-Llama3.1-8B on Dolphin Dataset using NeMo Megatron-LM

This notebook shows how to perform parameter-efficient fine-tuning (PEFT) of the [Meta-Llama3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) model on the [dolphin](https://huggingface.co/datasets/cognitivecomputations/dolphin) dataset using [NeMo](https://github.com/NVIDIA/NeMo) [Megatron-LM](https://github.com/NVIDIA/Megatron-LM).

## Setup and Imports

In [None]:
! pip install kubernetes
! pip install boto3

In [None]:
import os
import subprocess
import sys

# Work from the repository root: the Helm chart paths used in later cells
# are written relative to it.
repo_root = os.path.expanduser('~/amazon-eks-machine-learning-with-terraform-and-kubeflow')
os.chdir(repo_root)
print(f"Working directory: {os.getcwd()}")

# Names shared by every step of the notebook
hf_model_id = 'meta-llama/Llama-3.1-8B'
namespace = 'kubeflow-user-example-com'
release_name = 'nemo-llama31-8b-peft-dolphin'

# Put the repository's src/ directory first on the import path
src_dir = os.path.join(os.getcwd(), "src")
sys.path.insert(0, src_dir)

from k8s.utils import wait_for_helm_release_pods

# Directory that holds this example's Helm values files
example_parts = ('examples', 'legacy', 'nemo-megatron', 'llama31-8b-peft-dolphin')
notebook_dir = os.path.join(os.getcwd(), *example_parts)
print(f"Notebook directory: {notebook_dir}")

## Step 1: Download Hugging Face Llama-3.1-8B Model Weights

**Note:** Set your Hugging Face token in the cell below before running it.

In [None]:
import json

# Replace with your actual Hugging Face token
HF_TOKEN = None
# Validate with an explicit exception; `assert` is skipped under `python -O`.
if not HF_TOKEN:
    raise ValueError("Please set HF_TOKEN")

# Build the --set-json payload with json.dumps so that quotes or backslashes
# in either value cannot produce malformed JSON. Compact separators keep the
# argument free of spaces.
env_json = json.dumps(
    [
        {"name": "HF_MODEL_ID", "value": hf_model_id},
        {"name": "HF_TOKEN", "value": HF_TOKEN},
    ],
    separators=(',', ':'),
)

# Install the hf-snapshot chart, which receives the model id and token as
# container environment variables.
cmd = [
    'helm', 'install', '--debug', release_name,
    'charts/machine-learning/model-prep/hf-snapshot',
    '--set-json', f'env={env_json}',
    '-n', namespace
]

result = subprocess.run(cmd, capture_output=True, text=True)

# `--debug` makes helm echo the supplied values, token included; mask it so
# the secret is not stored in the notebook's saved output.
token = str(HF_TOKEN)
stdout = result.stdout.replace(token, '***')
stderr = result.stderr.replace(token, '***')

print(stdout)
if stderr:
    print("STDERR:", stderr)

# Stop here on failure instead of continuing to the wait step. Raised
# manually (not via check_returncode()) because CalledProcessError's message
# embeds the full command line, token included.
if result.returncode != 0:
    raise RuntimeError(f"helm install failed with exit code {result.returncode}")

In [None]:
# Wait for the model download to complete before moving on.
# NOTE(review): wait_for_helm_release_pods is imported from k8s.utils and its
# body is not shown here; it presumably blocks until the release's pods
# finish, using its default polling interval and timeout — confirm there.
wait_for_helm_release_pods(release_name, namespace)

In [None]:
# Remove the model download release. Every step installs under the same
# release name, so it must be freed before the next `helm install`.
cmd = [
    'helm', 'uninstall', release_name,
    '-n', namespace,
]
result = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(result.stdout)
# Only show the stderr section when helm actually wrote something to it
if result.stderr != '':
    print("STDERR:", result.stderr)

## Step 2: Preprocess Dolphin Dataset

In [None]:
# Install the data-process chart with this example's preprocess.yaml values
# to run the Dolphin dataset preprocessing job.
cmd = [
    'helm', 'install', '--debug', release_name,
    'charts/machine-learning/data-prep/data-process',
    '-f', f'{notebook_dir}/preprocess.yaml',
    '-n', namespace
]

result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Stop here if helm failed (e.g. the release name is still in use) instead of
# continuing to the wait step for a release that was never created.
if result.returncode != 0:
    raise RuntimeError(f"helm install failed with exit code {result.returncode}")

In [None]:
# Wait for preprocessing to complete before moving on.
# NOTE(review): wait_for_helm_release_pods is imported from k8s.utils and its
# body is not shown here; it presumably blocks until the release's pods
# finish, using its default polling interval and timeout — confirm there.
wait_for_helm_release_pods(release_name, namespace)

In [None]:
# Remove the preprocessing release. Every step installs under the same
# release name, so it must be freed before the next `helm install`.
cmd = [
    'helm', 'uninstall', release_name,
    '-n', namespace,
]
result = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(result.stdout)
# Only show the stderr section when helm actually wrote something to it
if result.stderr != '':
    print("STDERR:", result.stderr)

## Step 3: Convert Hugging Face Checkpoint to NeMo Checkpoint

In [None]:
# Install the data-process chart with this example's hf_to_nemo.yaml values
# to run the checkpoint conversion job.
cmd = [
    'helm', 'install', '--debug', release_name,
    'charts/machine-learning/data-prep/data-process',
    '-f', f'{notebook_dir}/hf_to_nemo.yaml',
    '-n', namespace
]

result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)

# Stop here if helm failed (e.g. the release name is still in use) instead of
# continuing to the wait step for a release that was never created.
if result.returncode != 0:
    raise RuntimeError(f"helm install failed with exit code {result.returncode}")

In [None]:
# Wait for the checkpoint conversion to complete before moving on.
# NOTE(review): wait_for_helm_release_pods is imported from k8s.utils and its
# body is not shown here; it presumably blocks until the release's pods
# finish, using its default polling interval and timeout — confirm there.
wait_for_helm_release_pods(release_name, namespace)

In [None]:
# Remove the conversion release. Every step installs under the same
# release name, so it must be freed before the next `helm install`.
cmd = [
    'helm', 'uninstall', release_name,
    '-n', namespace,
]
result = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(result.stdout)
# Only show the stderr section when helm actually wrote something to it
if result.stderr != '':
    print("STDERR:", result.stderr)

## Step 4: Run PEFT Fine-tuning

In [None]:
# Install the pytorchjob-distributed chart with this example's peft.yaml
# values, plus the Hugging Face token, to run the PEFT fine-tuning job.
cmd = [
    'helm', 'install', '--debug', release_name,
    'charts/machine-learning/training/pytorchjob-distributed',
    '--set', f'hf_token={HF_TOKEN}',
    '-f', f'{notebook_dir}/peft.yaml',
    '-n', namespace
]

result = subprocess.run(cmd, capture_output=True, text=True)

# `--debug` makes helm echo the supplied values, token included; mask it so
# the secret is not stored in the notebook's saved output.
stdout, stderr = result.stdout, result.stderr
if HF_TOKEN:
    token = str(HF_TOKEN)
    stdout = stdout.replace(token, '***')
    stderr = stderr.replace(token, '***')

print(stdout)
if stderr:
    print("STDERR:", stderr)

# Stop here on failure instead of continuing to the wait step. Raised
# manually (not via check_returncode()) because CalledProcessError's message
# embeds the full command line, token included.
if result.returncode != 0:
    raise RuntimeError(f"helm install failed with exit code {result.returncode}")

In [None]:
# Wait for PEFT training to complete before moving on.
# NOTE(review): wait_for_helm_release_pods is imported from k8s.utils and its
# body is not shown here. interval=300 and timeout=3600*4 are presumably in
# seconds (check every 5 minutes, give up after 4 hours) — confirm there.
wait_for_helm_release_pods(release_name, namespace, interval=300, timeout=3600*4)

In [None]:
# Remove the training release. Every step installs under the same
# release name, so it must be freed before the next `helm install`.
cmd = [
    'helm', 'uninstall', release_name,
    '-n', namespace,
]
result = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(result.stdout)
# Only show the stderr section when helm actually wrote something to it
if result.stderr != '':
    print("STDERR:", result.stderr)

## Step 5: Evaluate PEFT Model

In [None]:
# Install the pytorchjob-distributed chart with this example's peft_eval.yaml
# values, plus the Hugging Face token, to run the PEFT evaluation job.
cmd = [
    'helm', 'install', '--debug', release_name,
    'charts/machine-learning/training/pytorchjob-distributed',
    '--set', f'hf_token={HF_TOKEN}',
    '-f', f'{notebook_dir}/peft_eval.yaml',
    '-n', namespace
]

result = subprocess.run(cmd, capture_output=True, text=True)

# `--debug` makes helm echo the supplied values, token included; mask it so
# the secret is not stored in the notebook's saved output.
stdout, stderr = result.stdout, result.stderr
if HF_TOKEN:
    token = str(HF_TOKEN)
    stdout = stdout.replace(token, '***')
    stderr = stderr.replace(token, '***')

print(stdout)
if stderr:
    print("STDERR:", stderr)

# Stop here on failure instead of continuing to the wait step. Raised
# manually (not via check_returncode()) because CalledProcessError's message
# embeds the full command line, token included.
if result.returncode != 0:
    raise RuntimeError(f"helm install failed with exit code {result.returncode}")

In [None]:
# Wait for evaluation to complete before moving on.
# NOTE(review): wait_for_helm_release_pods is imported from k8s.utils and its
# body is not shown here. interval=300 and timeout=3600*4 are presumably in
# seconds (check every 5 minutes, give up after 4 hours) — confirm there.
wait_for_helm_release_pods(release_name, namespace, interval=300, timeout=3600*4)

In [None]:
# Remove the evaluation release. Every step installs under the same
# release name, so it must be freed before the next `helm install`.
cmd = [
    'helm', 'uninstall', release_name,
    '-n', namespace,
]
result = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(result.stdout)
# Only show the stderr section when helm actually wrote something to it
if result.stderr != '':
    print("STDERR:", result.stderr)

## Step 6: Merge PEFT Model to Base Model

In [None]:
# Install the data-process chart with this example's merge_peft.yaml values,
# plus the Hugging Face token, to run the PEFT merge job.
cmd = [
    'helm', 'install', '--debug', release_name,
    'charts/machine-learning/data-prep/data-process',
    '--set', f'hf_token={HF_TOKEN}',
    '-f', f'{notebook_dir}/merge_peft.yaml',
    '-n', namespace
]

result = subprocess.run(cmd, capture_output=True, text=True)

# `--debug` makes helm echo the supplied values, token included; mask it so
# the secret is not stored in the notebook's saved output.
stdout, stderr = result.stdout, result.stderr
if HF_TOKEN:
    token = str(HF_TOKEN)
    stdout = stdout.replace(token, '***')
    stderr = stderr.replace(token, '***')

print(stdout)
if stderr:
    print("STDERR:", stderr)

# Stop here on failure instead of continuing to the wait step. Raised
# manually (not via check_returncode()) because CalledProcessError's message
# embeds the full command line, token included.
if result.returncode != 0:
    raise RuntimeError(f"helm install failed with exit code {result.returncode}")

In [None]:
# Wait for the merge to complete before moving on.
# NOTE(review): wait_for_helm_release_pods is imported from k8s.utils and its
# body is not shown here; it presumably blocks until the release's pods
# finish, using its default polling interval and timeout — confirm there.
wait_for_helm_release_pods(release_name, namespace)

In [None]:
# Remove the merge release. Every step installs under the same
# release name, so it must be freed before the next `helm install`.
cmd = [
    'helm', 'uninstall', release_name,
    '-n', namespace,
]
result = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(result.stdout)
# Only show the stderr section when helm actually wrote something to it
if result.stderr != '':
    print("STDERR:", result.stderr)

## Step 7: Convert NeMo Checkpoint to Hugging Face Checkpoint

In [None]:
# Install the data-process chart with this example's nemo_to_hf.yaml values,
# plus the Hugging Face token, to run the checkpoint conversion job.
cmd = [
    'helm', 'install', '--debug', release_name,
    'charts/machine-learning/data-prep/data-process',
    '--set', f'hf_token={HF_TOKEN}',
    '-f', f'{notebook_dir}/nemo_to_hf.yaml',
    '-n', namespace
]

result = subprocess.run(cmd, capture_output=True, text=True)

# `--debug` makes helm echo the supplied values, token included; mask it so
# the secret is not stored in the notebook's saved output.
stdout, stderr = result.stdout, result.stderr
if HF_TOKEN:
    token = str(HF_TOKEN)
    stdout = stdout.replace(token, '***')
    stderr = stderr.replace(token, '***')

print(stdout)
if stderr:
    print("STDERR:", stderr)

# Stop here on failure instead of continuing to the wait step. Raised
# manually (not via check_returncode()) because CalledProcessError's message
# embeds the full command line, token included.
if result.returncode != 0:
    raise RuntimeError(f"helm install failed with exit code {result.returncode}")

In [None]:
# Wait for the checkpoint conversion to complete before moving on.
# NOTE(review): wait_for_helm_release_pods is imported from k8s.utils and its
# body is not shown here; it presumably blocks until the release's pods
# finish, using its default polling interval and timeout — confirm there.
wait_for_helm_release_pods(release_name, namespace)

In [None]:
# Remove the conversion release now that the final step has finished, so the
# shared release name is free if the notebook is run again.
cmd = [
    'helm', 'uninstall', release_name,
    '-n', namespace,
]
result = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)
print(result.stdout)
# Only show the stderr section when helm actually wrote something to it
if result.stderr != '':
    print("STDERR:", result.stderr)