# Fine-tune Gemma 2b using LoRA

## Setup

In [1]:
import os
import json
from google.colab import userdata, drive

In [2]:
# Repository name and notebook file name; both are interpolated into the git
# commands used by later cells.
GIT_REPOSITORY = "CS221-project"
FILE_NAME = "colab_tuning.ipynb"

# Run-mode switches.
COLAB = True  # guards the Colab-only setup cells (Drive mount, git, Kaggle config)
SAVE_TO_GITHUB = True  # guards the commit-and-push cells at the end
# The two switches below are not read by any of the cells visible in this notebook.
KAGGLE = True
DOWNLOAD_DATA = True


In [3]:
# When running on Colab, switch to /content and mount Google Drive at
# /content/drive.
if COLAB:
    %cd /content
    # force_remount=True requests a fresh mount even if Drive is already mounted.
    drive.mount('/content/drive', force_remount=True)

In [4]:
if COLAB:
    PARENT_DIRECTORY_PATH = "/content"
    # In case you want to clone in your drive:
    PARENT_DIRECTORY_PATH = "/content/drive/MyDrive"
    PROJECT_PATH = PARENT_DIRECTORY_PATH + "/" + GIT_REPOSITORY
    %cd "{PARENT_DIRECTORY_PATH}"

In [11]:
if COLAB:
    import json
    import os

    with open(f"{PARENT_DIRECTORY_PATH}/Git/git.json", "r") as f:
        parsed_json = json.load(f)

    GIT_USER_NAME = parsed_json["GIT_USER_NAME"]
    GIT_TOKEN = parsed_json["GIT_TOKEN"]
    GIT_USER_EMAIL = parsed_json["GIT_USER_EMAIL"]

    GIT_PATH = (
        f"https://{GIT_TOKEN}@github.com/{GIT_USER_NAME}/{GIT_REPOSITORY}.git"
    )

    %cd "{PARENT_DIRECTORY_PATH}"

    if os.path.exists(f"{PARENT_DIRECTORY_PATH}/{GIT_REPOSITORY}"):
        %cd "{PROJECT_PATH}"
        !git pull
    else:
        !git clone "{GIT_PATH}"  # Clone the github repository
        %cd "{PROJECT_PATH}"

In [5]:
if COLAB:
    import os

    # KAGGLE_CONFIG_DIR names the directory that holds kaggle.json, not the
    # file itself, so point it at the containing folder.
    os.environ["KAGGLE_CONFIG_DIR"] = f"{PARENT_DIRECTORY_PATH}/Kaggle"

### Set environment variables

In [6]:
# Copy the Kaggle credentials from the Colab secrets store into the process
# environment under the same names.
for secret_name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

### Install dependencies

In [7]:
# Install Keras 3 last. See https://keras.io/getting_started/ for more details.
!pip install -q -U keras-nlp
!pip install -q -U keras>=3
!pip install rouge

### Select a backend

In [8]:
# Backend settings are written to the environment in this cell, ahead of the
# cell that imports keras.
os.environ.update(
    {
        "KERAS_BACKEND": "jax",  # Or "torch" or "tensorflow".
        # Avoid memory fragmentation on JAX backend.
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

### Import packages

In [9]:
import keras
import keras_nlp

## Load Model

In [12]:
# Instantiate the Gemma causal language model from the "gemma_2b_en" preset and
# print its summary.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_2b_en")
gemma_lm.summary()

## LoRA Fine-tuning

In [13]:
# from utils import preprocess_qa_data, compute_rouge_l

# Build the fine-tuning corpus: one "Instruction:/Response:" string per QA pair
# found in the training text file.
with open("data/qa_train_data_ft.txt", encoding="utf-8") as file:
    contents = file.read()

# Layout of every training example; defined once rather than per iteration.
template = "Instruction:\n{}\nResponse:\n{}"

# A newline is prepended so that a file whose very first line is "Instruction:"
# splits the same way as one that starts with a blank line. Without it the
# first pair would be discarded together with the (normally empty) preamble.
qa_pairs = ("\n" + contents).split("\nInstruction:\n")[1:]

data = []
for qa_pair in qa_pairs:
    # Split on the first response marker only; later markers stay in the answer.
    question, separator, answer = qa_pair.partition("\nResponse:\n")
    if not separator:
        print(f"Skipping malformed question-answer pair: {qa_pair}")
        continue
    data.append(template.format(question.strip(), answer.strip()))


In [14]:
# Enable LoRA for the model and set the LoRA rank to 4.
gemma_lm.backbone.enable_lora(rank=4)
# Print the model summary again now that LoRA is enabled.
gemma_lm.summary()

In [16]:
# AdamW is a common optimizer choice for transformer models.
optimizer = keras.optimizers.AdamW(learning_rate=5e-5, weight_decay=0.01)
# Layernorm scales and bias terms are left out of weight decay.
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# Cap the input sequence length at 512 to keep memory usage under control.
gemma_lm.preprocessor.sequence_length = 512

gemma_lm.compile(
    optimizer=optimizer,
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
# A single epoch over the formatted QA strings with a batch size of one.
gemma_lm.fit(data, batch_size=1, epochs=1)

## Evaluation after fine-tuning

In [19]:
from src.utils import compute_rouge_l

# Read the evaluation examples from the JSON test file into memory.
with open("data/qa_test_data.json", "r") as file:
    test_data = json.loads(file.read())

In [20]:
# Prompt template. It uses the same layout as the strings the model was
# fine-tuned on ("Instruction:\n...\nResponse:\n..."); an extra blank line before
# "Response:" would prompt the model with a format it never saw in training.
template = "Instruction:\n{instruction}\nResponse:\n{response}"
# Text that separates the prompt from the generated answer.
response_marker = "Response:\n"

# Sample from the 5 most likely tokens with a fixed seed.
sampler = keras_nlp.samplers.TopKSampler(k=5, seed=2)
gemma_lm.compile(sampler=sampler)

results = []
rouge_l_scores = []
# Generate an answer for every test item and score it against the reference.
for i, item in enumerate(test_data, start=1):
    prompt = template.format(instruction=item["instruction"], response="")
    model_response = gemma_lm.generate(prompt, max_length=256)

    # Keep only the text after the first "Response:\n"; when the marker is
    # missing the raw generation is scored unchanged.
    _, marker, continuation = model_response.partition(response_marker)
    if marker:
        model_response = continuation.strip()

    rouge_l_score = compute_rouge_l(item["response"], model_response)
    rouge_l_scores.append(rouge_l_score)

    results.append(
        {
            "instruction": item["instruction"],
            "model_response": model_response,
            "original_response": item["response"],
            "rouge_l_score": rouge_l_score,
        }
    )
    print(f"Completed {i}.")

In [21]:
# Save results to a file, creating the output directory on first use so the
# evaluation run is not lost to a missing folder.
os.makedirs("outputs", exist_ok=True)
with open("outputs/evaluation_ft.json", "w") as outfile:
    json.dump(results, outfile, indent=4)

# Output average ROUGE-L metric. With no scored items the average is reported
# as NaN instead of raising ZeroDivisionError.
if rouge_l_scores:
    average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
else:
    average_rouge_l = float("nan")
print(f"Average ROUGE-L Metric: {average_rouge_l}")

## Save the fine-tuned model

In [32]:
# Output locations for the fine-tuned model: a directory named after the model
# plus the weights and vocabulary files inside it.
MODEL_NAME = "MODEL_ft"
FINETUNED_MODEL_DIR = MODEL_NAME
FINETUNED_WEIGHTS_PATH = FINETUNED_MODEL_DIR + "/model.weights.h5"
FINETUNED_VOCAB_PATH = FINETUNED_MODEL_DIR + "/vocabulary.spm"

In [36]:
# NOTE(review): transformers is imported here but not used by this cell.
import transformers

# Make sure the directory exists. os.makedirs is used instead of a shell
# command so directory names containing spaces are handled correctly.
os.makedirs(FINETUNED_MODEL_DIR, exist_ok=True)

# Write the model weights to the weights file.
gemma_lm.save_weights(FINETUNED_WEIGHTS_PATH)

# Write the tokenizer assets into the same directory.
gemma_lm.preprocessor.tokenizer.save_assets(FINETUNED_MODEL_DIR)

In [18]:
# Remove the notebook's reference to the model object.
# NOTE(review): presumably done to let its memory be reclaimed — confirm.
del gemma_lm

In [None]:
if SAVE_TO_GITHUB:
    !git add {FILE_NAME}
    !git config --global user.email {GIT_USER_EMAIL}
    !git config --global user.name {GIT_USER_NAME}
    !git commit -am "update {FILE_NAME}"
    !git push

In [None]:
# Commit this notebook and push it after pointing "origin" at a URL that
# carries the GitHub user name and token.
if SAVE_TO_GITHUB:
    !git add "{FILE_NAME}"
    !git config --global user.email "{GIT_USER_EMAIL}"
    !git config --global user.name "{GIT_USER_NAME}"
    !git commit -am "update {FILE_NAME}"
    # Authenticate using GitHub token
    # NOTE(review): set-url stores the token in the repository's remote
    # configuration; confirm that is acceptable where the clone is kept.
    !git remote set-url origin "https://{GIT_USER_NAME}:{GIT_TOKEN}@github.com/{GIT_USER_NAME}/{GIT_REPOSITORY}.git"

    !git push