# Fine-tune meta-llama/Llama-3.2-1B-Instruct using mlx framework and llm-datasets

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
# set model name etc.

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_SHORT_NAME = MODEL_NAME.split('/')[-1]
SUFFIX = "FinGreyLit-MLX"
#SLICE = 1
print(MODEL_NAME)
print(MODEL_SHORT_NAME)

meta-llama/Llama-3.2-1B-Instruct
Llama-3.2-1B-Instruct


In [2]:
# Load and prepare fine-tuning dataset

import json
import glob
import random

random.seed(42)  # for deterministic sampling of test set

train_files = glob.glob("../../llm-dataset/*-train.jsonl") + glob.glob("../../llm-dataset/norrep.jsonl")
test_files = glob.glob("../../llm-dataset/*-test.jsonl")

EVAL_SIZE = 6  # how many documents to evaluate (i.e. calculate loss) on during fine-tuning
SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous cataloguing of digital documents."
INSTRUCTION = "Extract metadata from this document. Return as JSON."

def preprocess_sample(sample):
    output = json.dumps(sample["ground_truth"])
    input_ = json.dumps(sample["content"])
    # ShareGPT format
    conversations = [
        {'from': 'system', 'value': SYSTEM_PROMPT},
        {'from': 'user', 'value': INSTRUCTION + "\n\n" + input_},
        {'from': 'gpt', 'value': output}
    ]
    return {"conversations": conversations}

def dataset_to_records(files):
    records = []
    for filename in files:
        with open(filename) as infile:
            for line in infile:
                sample = json.loads(line)
                records.append(preprocess_sample(sample))
    return records

def write_jsonl(records, filename):
    with open(filename, "w") as outfile:
        for record in records:
            json.dump(record, outfile)
            outfile.write("\n")

train_recs = dataset_to_records(train_files)
random.shuffle(train_recs)
write_jsonl(train_recs, "./data/mlx-train.jsonl")
print(f"Wrote {len(train_recs)} train records")

test_recs = dataset_to_records(test_files)
write_jsonl(test_recs, "./data/mlx-test.jsonl")
print(f"Wrote {len(test_recs)} test records")

eval_recs = random.sample(test_recs, EVAL_SIZE)
write_jsonl(eval_recs, "./data/mlx-eval.jsonl")
print(f"Wrote {len(eval_recs)} eval records")

Wrote 64 train records
Wrote 15 test records
Wrote 6 eval records


In [3]:
!python convert-datasets-to-llama.py

/Users/aamanlamba/Documents/Code/FinGreyLit/experiments/mlx-finetune-llm
./data
Traceback (most recent call last):
  File "/Users/aamanlamba/Documents/Code/FinGreyLit/experiments/mlx-finetune-llm/convert-datasets-to-llama.py", line 16, in <module>
    convs = entry['conversations']
            ~~~~~^^^^^^^^^^^^^^^^^
KeyError: 'conversations'


In [4]:
# Load and finetune LLM
from mlx_lm import generate,load
from huggingface_hub import login

import os
from dotenv import load_dotenv
load_dotenv()

# Initialize the client with your API key
hf_accesstoken = os.environ.get("mlx_finetuning_api_key")
print(hf_accesstoken)
login(hf_accesstoken)

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


In [5]:
#load model
model, tokenizer = load("meta-llama/Llama-3.2-1B-Instruct") 
#load("google/gemma-2-2b-it")
#generate prompt and response
prompt = "You are a skilled librarian \
        specialized in meticulous cataloguing of digital documents.\
        Extract metadata from this document. Return as JSON.\
        \{\"pdfinfo\": {\"author\": \"Johanna Elisabet Glader\", \
        \"creationDate\": \"D:20200226153952+02'00'\", \"modDate\": \
        \"D:20200226154128+02'00'\"}, \"pages\": [{\"page\": 1, \"text\": \
        \"This is an electronic reprint of the original article. \
        This reprint may differ from the original in pagination and typographic \
        detail.\\nPlease cite the original version:\\nCynthia S\\u00f6derbacka (2019). \
            Energy storage demo environment in Technobothnia. Vaasa\\nInsider, 11.12.2019.\"}, \
                {\"page\": 2, \"text\": \"Energy Storage Demo Environment in\\nTechnobothnia\\nWhen \
                discourses about the benefits that come with renewable energy sources come up, \
                    one cannot avoid the discussion going into the irregular tendencies of these \
                        sources especially with reference to solar and wind energy and the challenges \
                            that come with that.\\nUnfortunately, the future does not have much room for \
                                fossil fuels, the era of renewable energy sources is here and will continue to grow. \
                                    According to the International Renewable\\nNovia University of Applied Sciences \
                                        (Novia UAS), \\u00c5bo Akademi University (\\u00c5A), and\\nVaasa University \
                                            of Applied Sciences (VAMK) have partnered up under \
                                                the \\u201cEnergy\\nEducation & Research Laboratory.\"},\
                                                      {\"page\": 4, \"text\": \"A thesis worker from \\u00c5A is \
                                                      focusing on delivering a Phase Changing Materials \
                                                        (PCM)\"}, {\"page\": 5, \"text\": \"The Lab Engineer at \
                                                            Novia UAS is in charge of this.\\nAuthor: \
                                                                Cynthia S\\u00f6derbacka- Project Manager \
                                                                    & Lecturer at Novia UAS\"}]}\"}, \
                                                                        {\"from\": \"gpt\", \"value\": \"{\"language\": \
                                                                        \"en\", \"title\": \"Energy storage demo environment in \
                                                                        Technobothnia\", \"creator\": [\"S\\u00f6derbacka, Cynthia\"],\
                                                                              \"year\": \"2019\", \"publisher\": [\"Vaasa Insider\"],\
                                                                                \"type_coar\": \"newspaper article\"}\"}]}    "
print(prompt)
messages = [{"role": "user", "content": prompt}]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

response = generate(model, tokenizer, prompt=prompt, verbose=True, max_tokens = 100)
print(response)

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

You are a skilled librarian         specialized in meticulous cataloguing of digital documents.        Extract metadata from this document. Return as JSON.        \{"pdfinfo": {"author": "Johanna Elisabet Glader",         "creationDate": "D:20200226153952+02'00'", "modDate":         "D:20200226154128+02'00'"}, "pages": [{"page": 1, "text":         "This is an electronic reprint of the original article.         This reprint may differ from the original in pagination and typographic         detail.\nPlease cite the original version:\nCynthia S\u00f6derbacka (2019).             Energy storage demo environment in Technobothnia. Vaasa\nInsider, 11.12.2019."},                 {"page": 2, "text": "Energy Storage Demo Environment in\nTechnobothnia\nWhen                 discourses about the benefits that come with renewable energy sources come up,                     one cannot avoid the discussion going into the irregular tendencies of these                         sources especially with refe

In [6]:
# Load datasets
from datasets import load_dataset
ds = load_dataset("json", 
                  data_files={"train": "./data/output_mlx-train.jsonl", 
                              "test": "./data/output_mlx-test.jsonl", 
                              "eval": "./data/output_mlx-eval.jsonl"})

# convert dataset to dataframe
import pandas as pd
df_train = pd.DataFrame(ds['train'])
df_test = pd.DataFrame(ds['test'])
df_eval = pd.DataFrame(ds['eval'])
print(df_train.head())
print(df_test.head())
print(df_eval.head())
# convert dataframe to list for mlx
#convert dataset to list for mlx
train_set = df_train['text'].tolist()
dev_set = df_eval['text'].tolist()
test_set = df_test['text'].tolist()

def preprocess(dataset):
    return dataset["text"].tolist()
#train_list = df_train.to_dict(orient='records')
#test_list = df_test.to_dict(orient='records')
#eval_list = df_eval.to_dict(orient='records')
#train_set, dev_set, test_set = map(preprocess, (df_train,df_eval,df_test))

print(train_set[0])
print(dev_set[0])
print(test_set[0])



Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

                                                text
0  <|begin_of_text|><|start_header_id|>system<|en...
1  <|begin_of_text|><|start_header_id|>system<|en...
2  <|begin_of_text|><|start_header_id|>system<|en...
3  <|begin_of_text|><|start_header_id|>system<|en...
4  <|begin_of_text|><|start_header_id|>system<|en...
                                                text
0  <|begin_of_text|><|start_header_id|>system<|en...
1  <|begin_of_text|><|start_header_id|>system<|en...
2  <|begin_of_text|><|start_header_id|>system<|en...
3  <|begin_of_text|><|start_header_id|>system<|en...
4  <|begin_of_text|><|start_header_id|>system<|en...
                                                text
0  <|begin_of_text|><|start_header_id|>system<|en...
1  <|begin_of_text|><|start_header_id|>system<|en...
2  <|begin_of_text|><|start_header_id|>system<|en...
3  <|begin_of_text|><|start_header_id|>system<|en...
4  <|begin_of_text|><|start_header_id|>system<|en...
<|begin_of_text|><|start_header_id|>system<|en

In [7]:
# Model finetuning setup
import matplotlib.pyplot as plt
import mlx.optimizers as optim
from mlx.utils import tree_flatten
from mlx_lm import load, generate
from mlx_lm.tuner import train, TrainingArgs 
from mlx_lm.tuner import linear_to_lora_layers
from pathlib import Path
import json, time
adapter_path = Path("./adapters")
adapter_path.mkdir(parents=True, exist_ok=True)
#set LORA parameters
lora_config = {
 "lora_layers": 8,
 "num_layers": 8,
 "lora_parameters": {
    "rank": 8,
    "scale": 20.0,
    "dropout": 0.0,
}}
with open(adapter_path / "adapter_config.json", "w") as fid:
    json.dump(lora_config, fid, indent=4)    
# Set training parameters
training_args = TrainingArgs(
    adapter_file=adapter_path / "adapters.safetensors",
    iters=200,
    steps_per_eval=50
)
#Freeze base model
model.freeze()

linear_to_lora_layers(model, lora_config["lora_layers"], lora_config["lora_parameters"])
num_train_params = (
    sum(v.size for _, v in tree_flatten(model.trainable_parameters()))
)
print(f"Number of trainable parameters: {num_train_params}")



Number of trainable parameters: 851968


In [8]:
# setup training model and optimizer - Adam
model.train()
opt = optim.Adam(learning_rate=1e-5)
#create metrics class to measure fine-tuning progress
class Metrics:
    train_losses = []
    val_losses = []
    def on_train_loss_report(self, info):
        self.train_losses.append((info["iteration"], info["train_loss"]))
    def on_val_loss_report(self, info):
        self.val_losses.append((info["iteration"], info["val_loss"]))
        
metrics = Metrics()


In [None]:
# start fine-tuning

# Start fine-tuning
start_time = time.time()

train(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    optimizer = opt,
    train_dataset = train_set,
    val_dataset = dev_set,
    training_callback = metrics
)
end_time = time.time()
duration = end_time - start_time
print(f"Completed finetuning Training in {duration/60:.2f} minutes")

# plot graph of fine-tuning
train_its, train_losses = zip(*metrics.train_losses)
val_its, val_losses = zip(*metrics.val_losses)
plt.plot(train_its, train_losses, '-o')
plt.plot(val_its, val_losses, '-o')
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.legend(['Train', "Valid"])
plt.show() 


Starting training..., iters: 200
Iter 1: Val loss 3.180, Val took 8.159s
