# Fine-tune Gemma 2b using LoRA

## Setup

In [1]:
import os
from google.colab import userdata, drive

In [2]:
# Notebook-wide switches and identifiers read by the cells below.
COLAB = True  # gates the Colab-specific setup cells (Drive mount, repo clone/pull, Kaggle config)
KAGGLE = True  # NOTE(review): not read by any cell visible in this notebook — confirm it is still needed
DOWNLOAD_DATA = True  # NOTE(review): not read by any cell visible in this notebook — confirm it is still needed
SAVE_TO_GITHUB = True  # gates the final cells that commit and push FILE_NAME
GIT_REPOSITORY = "CS221-project"  # repository name; used to build the clone URL and the project path
FILE_NAME = "colab_tuning_legacy.ipynb"  # file staged and committed by the save-to-GitHub cells


In [3]:
# Colab only: start from /content and mount Google Drive at /content/drive.
if COLAB:
    %cd /content
    # force_remount=True requests a fresh mount even if Drive is already mounted.
    drive.mount('/content/drive', force_remount=True)

In [4]:
# Colab only: choose where the repository lives and move into that directory.
if COLAB:
    # NOTE: this first value is immediately overwritten by the assignment below;
    # remove the Drive assignment to work under /content instead.
    PARENT_DIRECTORY_PATH = "/content"
    # In case you want to clone in your drive:
    PARENT_DIRECTORY_PATH = "/content/drive/MyDrive"
    # Location of the repository checkout (cloned or pulled by a later cell).
    PROJECT_PATH = PARENT_DIRECTORY_PATH + "/" + GIT_REPOSITORY
    %cd "{PARENT_DIRECTORY_PATH}"

In [6]:
# Colab only: read Git credentials from Drive, then clone the repository, or
# pull the latest changes if a checkout already exists.
if COLAB:
    import json
    import os

    # git.json must provide the keys GIT_USER_NAME, GIT_TOKEN and GIT_USER_EMAIL.
    with open(f"{PARENT_DIRECTORY_PATH}/Git/git.json", "r") as f:
        parsed_json = json.load(f)

    GIT_USER_NAME = parsed_json["GIT_USER_NAME"]
    GIT_TOKEN = parsed_json["GIT_TOKEN"]
    GIT_USER_EMAIL = parsed_json["GIT_USER_EMAIL"]

    # The token is embedded in the HTTPS URL that is handed to `git clone`.
    # NOTE(review): git normally records the clone URL in the checkout's config,
    # so the token likely ends up stored on disk — confirm this is acceptable.
    GIT_PATH = (
        f"https://{GIT_TOKEN}@github.com/{GIT_USER_NAME}/{GIT_REPOSITORY}.git"
    )

    %cd "{PARENT_DIRECTORY_PATH}"

    # Same location as PROJECT_PATH (PARENT_DIRECTORY_PATH + "/" + GIT_REPOSITORY).
    if os.path.exists(f"{PARENT_DIRECTORY_PATH}/{GIT_REPOSITORY}"):
        %cd "{PROJECT_PATH}"
        !git pull
    else:
        !git clone "{GIT_PATH}"  # Clone the github repository
        %cd "{PROJECT_PATH}"

In [8]:
# Colab only: point the Kaggle API client at the credentials stored on Drive.
if COLAB:
    import os

    # KAGGLE_CONFIG_DIR must name the directory that contains kaggle.json, not the
    # file itself; the client looks for "$KAGGLE_CONFIG_DIR/kaggle.json".
    os.environ["KAGGLE_CONFIG_DIR"] = f"{PARENT_DIRECTORY_PATH}/Kaggle"

### Set environment variables

In [None]:
# os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')
# os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')

# Read the kaggle.json file
# with open("kaggle.json") as f:
#     kaggle_info = json.load(f)

# Set the environment variables
# os.environ["KAGGLE_USERNAME"] = kaggle_info["username"]
# os.environ["KAGGLE_KEY"] = kaggle_info["key"]

### Install dependencies

In [9]:
# Install Keras 3 last. See https://keras.io/getting_started/ for more details.
!pip install -q -U keras-nlp
!pip install -q -U keras>=3

### Select a backend

In [10]:
# Backend selection for Keras, set in the environment ahead of the
# `import keras` cell that follows. Alternatives: "torch" or "tensorflow".
os.environ.update(
    {
        "KERAS_BACKEND": "jax",
        # Avoid memory fragmentation on JAX backend.
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

### Import packages

In [11]:
import keras
import keras_nlp

## Load Dataset

Preprocess the data.

In [18]:
# Install python-dotenv.
# NOTE(review): no cell visible in this notebook imports it directly; presumably
# it is required by the local `utils` module — confirm.
!pip install python-dotenv

In [15]:
# Install the anthropic package.
# NOTE(review): no cell visible in this notebook imports it directly; presumably
# it is required by the local `utils` module — confirm.
!pip install anthropic

In [1]:
from utils import preprocess_qa_data
# Read the raw Q&A text into `content`. The path is relative, so it is resolved
# against the current working directory (the setup cells `%cd` into the project
# checkout when COLAB is true).
# The encoding is passed explicitly so the read does not depend on the platform's
# default locale encoding. Assumes qa_data.txt is UTF-8 — confirm.
with open("qa_data.txt", encoding="utf-8") as file:
    content = file.read()

In [2]:
# Run the project's `preprocess_qa_data` helper (imported from `utils`) on the raw
# text. `data` is later passed directly to `gemma_lm.fit`.
# NOTE(review): the helper's return type is not visible here; presumably a
# collection of formatted training strings — confirm.
data = preprocess_qa_data(content)

## Load Model

In [None]:
# Build the Gemma causal language model from the "gemma_2b_en" preset and print
# its summary.
# NOTE(review): fetching the preset presumably needs Kaggle credentials to be
# configured beforehand — confirm.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_2b_en")
gemma_lm.summary()

## Inference before fine tuning

### Probability Prompt

In [None]:
# Prompt layout shared by the inference cells: an instruction followed by a
# response slot.
template = "Instruction:\n{instruction}\n\nResponse:\n{response}"

# Compile the model with a seeded top-k sampler (k=5) for generation.
sampler = keras_nlp.samplers.TopKSampler(k=5, seed=2)
gemma_lm.compile(sampler=sampler)

# The response slot is left empty so the model produces the answer.
prompt = template.format(
    response="",
    instruction="What is the difference between permutations and combinations?",
)
print(gemma_lm.generate(prompt, max_length=256))

### Supervised Learning Prompt

In [None]:
# Second question for the base model; reuses `template` and whatever sampler
# `gemma_lm` is currently compiled with.
prompt = template.format(response="", instruction="What is Supervised Learning?")
print(gemma_lm.generate(prompt, max_length=256))

## LoRA Fine-tuning

In [None]:
# Enable LoRA for the model and set the LoRA rank to 4.
# LoRA is switched on for the backbone only; the summary is printed again afterwards.
gemma_lm.backbone.enable_lora(rank=4)
gemma_lm.summary()

In [None]:
# Use AdamW (a common optimizer for transformer models).
optimizer = keras.optimizers.AdamW(weight_decay=0.01, learning_rate=5e-5)
# Exclude layernorm and bias terms from decay.
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# Limit the input sequence length to 512 (to control memory usage).
gemma_lm.preprocessor.sequence_length = 512

# Recompile for training: sparse categorical cross-entropy on logits, with
# accuracy tracked as a weighted metric.
gemma_lm.compile(
    optimizer=optimizer,
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

# A single epoch over `data` with a batch size of 1.
gemma_lm.fit(data, batch_size=1, epochs=1)

In [None]:
# Save the fine-tuned model
# NOTE(review): the destination is a hard-coded Drive path that does not use
# PARENT_DIRECTORY_PATH; it assumes Drive is mounted and presumably that the
# target directory already exists — confirm.
gemma_lm.save("/content/drive/MyDrive/Colab Notebooks/cs221/fine_tuned_model_1.keras")


In [None]:
# Uncomment the line below to enable mixed-precision training on GPUs.
# The global policy only applies to layers created after it is set, so run it before the model is loaded.
# keras.mixed_precision.set_global_policy('mixed_bfloat16')

In [None]:
# Load the fine-tuned model

# loaded_model = keras.models.load_model("/content/drive/MyDrive/Colab Notebooks/cs221/fine_tuned_model_1.keras")

# Use the loaded model for generation
# prompt = template.format(
#     instruction="What is Supervised Learning?",
#     response="",
# )
# sampler = keras_nlp.samplers.TopKSampler(k=5, seed=2)
# loaded_model.compile(sampler=sampler)
# generated_text = loaded_model.generate(prompt, max_length=256)
# print(generated_text)

## Inference after fine-tuning

### Probability Prompt

In [None]:
# Same prompt layout as before fine-tuning, redefined here so this section can
# be read on its own.
template = "Instruction:\n{instruction}\n\nResponse:\n{response}"

# Recompile with the seeded top-k sampler (k=5) for generation.
sampler = keras_nlp.samplers.TopKSampler(k=5, seed=2)
gemma_lm.compile(sampler=sampler)

# The response slot is left empty so the model produces the answer.
prompt = template.format(
    response="",
    instruction="What is the difference between permutations and combinations?",
)
print(gemma_lm.generate(prompt, max_length=256))

### Supervised Learning Prompt

In [None]:
# Second question for the fine-tuned model; reuses `template` and whatever
# sampler `gemma_lm` is currently compiled with.
prompt = template.format(response="", instruction="What is Supervised Learning?")
print(gemma_lm.generate(prompt, max_length=256))

To get better responses from the fine-tuned model, you can experiment with:

1. Increasing the size of the fine-tuning dataset
2. Training for more steps (epochs)
3. Setting a higher LoRA rank
4. Modifying the hyperparameter values such as `learning_rate` and `weight_decay`.

As a starting point, try the hyperparameters from Alpaca's configuration below (listed for LLaMA-7B and LLaMA-13B):

| Hyperparameter | LLaMA-7B | LLaMA-13B |
|----------------|----------|-----------|
| Batch size     | 128      | 128       |
| Learning rate  | 2e-5     | 1e-5      |
| Epochs         | 3        | 5         |
| Max length     | 512      | 512       |
| Weight decay   | 0        | 0         |


In [21]:
if SAVE_TO_GITHUB:
    !git add {FILE_NAME}
    !git config --global user.email {GIT_USER_EMAIL}
    !git config --global user.name {GIT_USER_NAME}
    !git commit -am "update {FILE_NAME}"
    !git push

In [22]:
# Stage and commit the notebook, point `origin` at a URL that carries the GitHub
# credentials, then push.
# GIT_USER_NAME, GIT_USER_EMAIL and GIT_TOKEN come from the COLAB-gated credentials cell.
if SAVE_TO_GITHUB:
    !git add "{FILE_NAME}"
    !git config --global user.email "{GIT_USER_EMAIL}"
    !git config --global user.name "{GIT_USER_NAME}"
    # -a also commits every other modified tracked file, not only FILE_NAME.
    !git commit -am "update {FILE_NAME}"


    # Authenticate using GitHub token
    # NOTE(review): this writes the token into the repository's remote URL —
    # confirm that storing it there is acceptable.
    !git remote set-url origin "https://{GIT_USER_NAME}:{GIT_TOKEN}@github.com/{GIT_USER_NAME}/{GIT_REPOSITORY}.git"

    !git push