# Finetune Llama-3 with LLaMA Factory

Please use a **free** Tesla T4 Colab GPU to run this!

Project homepage: https://github.com/hiyouga/LLaMA-Factory

In [None]:
# Work from /content so the checkout below always lands in the same place.
%cd /content/
# Delete any earlier checkout so re-running this cell starts from a clean clone.
%rm -rf LLaMA-Factory
# Shallow clone: only the latest commit is fetched.
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd LLaMA-Factory
%ls
# Install explicitly pinned versions of the PyTorch packages.
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1
# jax is removed before the project install -- presumably to avoid a
# dependency conflict with the pinned packages on Colab; TODO confirm.
!pip uninstall -y jax
# Editable install of the cloned project with the torch, bitsandbytes and
# liger-kernel extras.
!pip install -e .[torch,bitsandbytes,liger-kernel]

In [None]:
# Cell 2: Data Preparation, Augmentation, and Splitting

import pandas as pd
import json
import re
import os
from sklearn.model_selection import train_test_split  # CHANGED: Imported train_test_split
import nltk  # ADDED: Importing nltk for data augmentation
from nltk.corpus import wordnet
import random

# ADDED: Download NLTK data files
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')  # FIXED: Downloading 'punkt' instead of 'punkt_tab'
nltk.download('punkt_tab')

%cd /content

def remove_links(text):
    """Return ``text`` with every http:// or https:// URL deleted.

    A URL is taken to run from its scheme up to the next whitespace
    character. Only the URL itself is removed; surrounding whitespace is
    left untouched.
    """
    url_pattern = re.compile(r'http[s]?://\S+')
    without_urls = url_pattern.sub('', text)
    return without_urls

def synonym_replacement(text, n=2):
    """Return a copy of ``text`` with up to ``n`` distinct words swapped for WordNet synonyms.

    Args:
        text: The sentence or paragraph to paraphrase. Values that are not
            non-blank strings (e.g. NaN from pandas) are returned unchanged.
        n: Maximum number of distinct words to replace. Every occurrence of a
            chosen word is replaced. ``n <= 0`` performs no replacement.

    Returns:
        The paraphrased text, re-assembled from tokens with NLTK's Treebank
        detokenizer so punctuation is re-attached to the preceding word.

    Notes:
        Words are picked with the global ``random`` module, so call
        ``random.seed(...)`` beforehand for reproducible output.
        NOTE(review): common function words are not filtered out, so they can
        still be replaced when WordNet lists them -- consider a stop-word
        filter if augmentation quality matters.
    """
    # Local import keeps this fix self-contained; nltk is already a dependency.
    from nltk.tokenize.treebank import TreebankWordDetokenizer

    if not isinstance(text, str) or not text.strip():
        return text

    words = nltk.word_tokenize(text)

    def _first_real_synonym(word):
        """Return the first lemma of the word's first synset that differs from it, else None."""
        synsets = wordnet.synsets(word)
        if not synsets:
            return None
        for lemma in synsets[0].lemmas():
            # WordNet joins multi-word lemmas with underscores ("tin_can").
            candidate = lemma.name().replace('_', ' ')
            # Case-insensitive so "Design" -> "design" is not counted as a synonym.
            if candidate.lower() != word.lower():
                return candidate
        return None

    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the shuffle below irreproducible because string hashing is
    # randomised per process. Only alphabetic tokens are candidates so that
    # numbers and punctuation are never rewritten.
    candidates = list(dict.fromkeys(word for word in words if word.isalpha()))
    random.shuffle(candidates)

    replacements = {}
    for word in candidates:
        # Checked before replacing so that n=0 really means "no replacement".
        if len(replacements) >= n:
            break
        synonym = _first_real_synonym(word)
        if synonym is not None:
            replacements[word] = synonym

    new_words = [replacements.get(word, word) for word in words]
    return TreebankWordDetokenizer().detokenize(new_words)

# Path to your CSV file
file_path = 'COS-ECE-470-fa-2024-Finalized-Dataset.csv'

# Column headers exactly as they appear in the CSV (including the embedded
# newline and trailing space of the solution column).
TASK_COL = 'Describe your task.'
SOLUTION_COL = 'Desired solution to your task:  Provide a solution that clearly explains the rationale and logic of solving the problem. The solution should be instructional, with reasoning steps for audience comprehension.\n(Any format is acceptable, e.g. a link, attached file, or text). '

# One seed shared by the split, the augmentation and the final shuffle.
SEED = 42

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Select the relevant columns for task description and desired solution
task_columns = df[[TASK_COL, SOLUTION_COL]]

# Fill in missing values and strip links once, before anything else.
cleaned_df = pd.DataFrame({
    TASK_COL: [
        task if pd.notna(task) else "No task description provided."
        for task in task_columns[TASK_COL]
    ],
    SOLUTION_COL: [
        remove_links(solution) if pd.notna(solution) else "No solution provided."
        for solution in task_columns[SOLUTION_COL]
    ],
})

# Split BEFORE augmenting (80% train, 20% val). Augmenting first would put a
# paraphrase of a training example into the validation set (or vice versa),
# which leaks training data into evaluation and inflates the eval metrics.
source_train_df, val_df = train_test_split(cleaned_df, test_size=0.2, random_state=SEED)
val_df = val_df.reset_index(drop=True)

# synonym_replacement draws from the global `random` module; seed it so a
# fresh Restart & Run All produces the same augmented set.
random.seed(SEED)

# Augment the training portion only: each original row is kept and followed by
# one synonym-replaced copy. The validation set stays un-augmented.
augmented_tasks = []
augmented_solutions = []

for task, solution in zip(source_train_df[TASK_COL], source_train_df[SOLUTION_COL]):
    # Original data
    augmented_tasks.append(task)
    augmented_solutions.append(solution)

    # Augmented data
    augmented_tasks.append(synonym_replacement(task))
    augmented_solutions.append(synonym_replacement(solution))

augmented_df = pd.DataFrame({
    TASK_COL: augmented_tasks,
    SOLUTION_COL: augmented_solutions,
})

# Shuffle so an original row and its paraphrase are not always adjacent.
train_df = augmented_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

def prepare_sharegpt_data(
    dataframe,
    task_col='Describe your task.',
    solution_col='Desired solution to your task:  Provide a solution that clearly explains the rationale and logic of solving the problem. The solution should be instructional, with reasoning steps for audience comprehension.\n(Any format is acceptable, e.g. a link, attached file, or text). ',
):
    """Convert task/solution rows into a list of ShareGPT-style conversations.

    Args:
        dataframe: DataFrame holding one task and one solution per row.
        task_col: Name of the column with the task description (user turn).
        solution_col: Name of the column with the desired solution (model turn).

    Returns:
        A list of dicts shaped ``{"conversations": [{"from": "human", ...},
        {"from": "gpt", ...}]}``, one per row whose two turns are non-empty.

    A missing task becomes "No task description provided."; a missing,
    blank, or link-only solution becomes "No solution provided.". Links are
    stripped from the solution with ``remove_links``.
    """
    sharegpt_data = []
    for _, row in dataframe.iterrows():
        # Handle missing task description
        raw_task = row[task_col]
        user_content = str(raw_task) if pd.notna(raw_task) else "No task description provided."

        # Guard against NaN before cleaning: remove_links runs a regex and
        # would raise TypeError on a float NaN.
        raw_solution = row[solution_col]
        gpt_content = remove_links(str(raw_solution)) if pd.notna(raw_solution) else ""

        # Provide default message if solution is empty
        if not gpt_content.strip():
            gpt_content = "No solution provided."

        # Create entry for ShareGPT format
        entry = {
            "conversations": [
                {
                    "from": "human",
                    "value": user_content
                },
                {
                    "from": "gpt",
                    "value": gpt_content
                }
            ]
        }

        # Append only if both contents are valid
        if user_content and gpt_content:
            sharegpt_data.append(entry)
    return sharegpt_data

# CHANGED: Prepare training data
train_sharegpt_data = prepare_sharegpt_data(train_df)

# CHANGED: Prepare validation data
val_sharegpt_data = prepare_sharegpt_data(val_df)

# Change directory to LLaMA-Factory data folder
%cd /content/LLaMA-Factory/data/

# Save the prepared training data
with open('ECE470_train_data.json', 'w', encoding='utf-8') as f:
    json.dump(train_sharegpt_data, f, indent=2, ensure_ascii=False)

# Save the prepared validation data
with open('ECE470_val_data.json', 'w', encoding='utf-8') as f:
    json.dump(val_sharegpt_data, f, indent=2, ensure_ascii=False)

print("Saved the training dataset to 'ECE470_train_data.json' and validation dataset to 'ECE470_val_data.json'.")

In [None]:
import torch
# Warn (without raising) when PyTorch cannot see a CUDA device.
gpu_ready = torch.cuda.is_available()
if gpu_ready is not True:
  print("Please set up a GPU before using LLaMA Factory: https://medium.com/mlearning-ai/training-yolov4-on-google-colab-316f8fff99c6")

In [None]:
import json

args = dict(
  stage="sft",                        # do supervised fine-tuning
  do_train=True,
  do_eval=True,                        # Enable evaluation on validation set
  model_name_or_path="unsloth/llama-3-8b-Instruct-bnb-4bit", # use bnb-4bit-quantized Llama-3-8B-Instruct model
  dataset="ECE470_train_data",     # REPLACED: 'train_file' with 'dataset'
  eval_dataset="ECE470_val_data",  # Kept 'eval_dataset' as is
  template="llama3",                   # use llama3 prompt template
  finetuning_type="lora",              # use LoRA adapters to save memory
  lora_target="all",                   # attach LoRA adapters to all linear layers
  output_dir="llama3_lora",            # the path to save LoRA adapters
  per_device_train_batch_size=2,       # the batch size
  gradient_accumulation_steps=4,       # the gradient accumulation steps
  lr_scheduler_type="cosine",          # use cosine learning rate scheduler
  logging_steps=10,                    # log every 10 steps
  warmup_ratio=0.1,                    # use warmup scheduler
  save_steps=1000,                     # save checkpoint every 1000 steps
  learning_rate=5e-5,                  # the learning rate
  num_train_epochs=3.0,                # Increased epochs for better training
  max_samples=500,                     # use 500 examples in each dataset
  max_grad_norm=1.0,                   # clip gradient norm to 1.0
  loraplus_lr_ratio=16.0,              # use LoRA+ algorithm with lambda=16.0
  fp16=True,                           # use float16 mixed precision training
  #use_liger_kernel=True,              # use liger kernel for efficient training
  report_to="wandb",                   # Report to Weights & Biases for visualization
)

# Optionally, specify dataset_dir if your datasets are in a different directory
# args["dataset_dir"] = "data"

json.dump(args, open("train_llama3.json", "w", encoding="utf-8"), indent=2)

%cd /content/LLaMA-Factory/

!llamafactory-cli train train_llama3.json