# Fine-tune meta-llama/Llama-3.2-1B-Instruct using mlx framework and llm-datasets

In [4]:
%pip install -r requirements.txt

Collecting python-dotenv (from -r requirements.txt (line 11))
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
# set model name etc.

MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_SHORT_NAME = MODEL_NAME.split('/')[-1]
SUFFIX = "FinGreyLit-MLX"
#SLICE = 1
print(MODEL_NAME)
print(MODEL_SHORT_NAME)

meta-llama/Llama-3.2-1B-Instruct
Llama-3.2-1B-Instruct


In [1]:
# Load and prepare fine-tuning dataset

import json
import glob
import random

random.seed(42)  # for deterministic sampling of test set

train_files = glob.glob("../../llm-dataset/*-train.jsonl") + glob.glob("../../llm-dataset/norrep.jsonl")
test_files = glob.glob("../../llm-dataset/*-test.jsonl")

EVAL_SIZE = 6  # how many documents to evaluate (i.e. calculate loss) on during fine-tuning
SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous cataloguing of digital documents."
INSTRUCTION = "Extract metadata from this document. Return as JSON."

def preprocess_sample(sample):
    output = json.dumps(sample["ground_truth"])
    input_ = json.dumps(sample["content"])
    # ShareGPT format
    conversations = [
        {'from': 'system', 'value': SYSTEM_PROMPT},
        {'from': 'user', 'value': INSTRUCTION + "\n\n" + input_},
        {'from': 'gpt', 'value': output}
    ]
    return {"conversations": conversations}

def dataset_to_records(files):
    records = []
    for filename in files:
        with open(filename) as infile:
            for line in infile:
                sample = json.loads(line)
                records.append(preprocess_sample(sample))
    return records

def write_jsonl(records, filename):
    with open(filename, "w") as outfile:
        for record in records:
            json.dump(record, outfile)
            outfile.write("\n")

train_recs = dataset_to_records(train_files)
random.shuffle(train_recs)
write_jsonl(train_recs, "./data/mlx-train.jsonl")
print(f"Wrote {len(train_recs)} train records")

test_recs = dataset_to_records(test_files)
write_jsonl(test_recs, "./data/mlx-test.jsonl")
print(f"Wrote {len(test_recs)} test records")

eval_recs = random.sample(test_recs, EVAL_SIZE)
write_jsonl(eval_recs, "./data/mlx-eval.jsonl")
print(f"Wrote {len(eval_recs)} eval records")

Wrote 64 train records
Wrote 15 test records
Wrote 6 eval records


In [2]:
# Load and finetune LLM
from mlx_lm import generate,load
from huggingface_hub import login

import os
from dotenv import load_dotenv
load_dotenv()

# Initialize the client with your API key
hf_accesstoken = os.environ.get("mlx_finetuning_api_key")
print(hf_accesstoken)
login(hf_accesstoken)

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


In [4]:
#load model
model, tokenizer = load("meta-llama/Llama-3.2-1B-Instruct") 
#load("google/gemma-2-2b-it")
#generate prompt and response
prompt = "You are a skilled librarian \
        specialized in meticulous cataloguing of digital documents.\
        Extract metadata from this document. Return as JSON.\
        \{\"pdfinfo\": {\"author\": \"Johanna Elisabet Glader\", \
        \"creationDate\": \"D:20200226153952+02'00'\", \"modDate\": \
        \"D:20200226154128+02'00'\"}, \"pages\": [{\"page\": 1, \"text\": \
        \"This is an electronic reprint of the original article. \
        This reprint may differ from the original in pagination and typographic \
        detail.\\nPlease cite the original version:\\nCynthia S\\u00f6derbacka (2019). \
            Energy storage demo environment in Technobothnia. Vaasa\\nInsider, 11.12.2019.\"}, \
                {\"page\": 2, \"text\": \"Energy Storage Demo Environment in\\nTechnobothnia\\nWhen \
                discourses about the benefits that come with renewable energy sources come up, \
                    one cannot avoid the discussion going into the irregular tendencies of these \
                        sources especially with reference to solar and wind energy and the challenges \
                            that come with that.\\nUnfortunately, the future does not have much room for \
                                fossil fuels, the era of renewable energy sources is here and will continue to grow. \
                                    According to the International Renewable\\nNovia University of Applied Sciences \
                                        (Novia UAS), \\u00c5bo Akademi University (\\u00c5A), and\\nVaasa University \
                                            of Applied Sciences (VAMK) have partnered up under \
                                                the \\u201cEnergy\\nEducation & Research Laboratory.\"},\
                                                      {\"page\": 4, \"text\": \"A thesis worker from \\u00c5A is \
                                                      focusing on delivering a Phase Changing Materials \
                                                        (PCM)\"}, {\"page\": 5, \"text\": \"The Lab Engineer at \
                                                            Novia UAS is in charge of this.\\nAuthor: \
                                                                Cynthia S\\u00f6derbacka- Project Manager \
                                                                    & Lecturer at Novia UAS\"}]}\"}, \
                                                                        {\"from\": \"gpt\", \"value\": \"{\"language\": \
                                                                        \"en\", \"title\": \"Energy storage demo environment in \
                                                                        Technobothnia\", \"creator\": [\"S\\u00f6derbacka, Cynthia\"],\
                                                                              \"year\": \"2019\", \"publisher\": [\"Vaasa Insider\"],\
                                                                                \"type_coar\": \"newspaper article\"}\"}]}    "
print(prompt)
messages = [{"role": "user", "content": prompt}]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

response = generate(model, tokenizer, prompt=prompt, verbose=True, max_tokens = 100)
print(response)

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

You are a skilled librarian         specialized in meticulous cataloguing of digital documents.        Extract metadata from this document. Return as JSON.        \{"pdfinfo": {"author": "Johanna Elisabet Glader",         "creationDate": "D:20200226153952+02'00'", "modDate":         "D:20200226154128+02'00'"}, "pages": [{"page": 1, "text":         "This is an electronic reprint of the original article.         This reprint may differ from the original in pagination and typographic         detail.\nPlease cite the original version:\nCynthia S\u00f6derbacka (2019).             Energy storage demo environment in Technobothnia. Vaasa\nInsider, 11.12.2019."},                 {"page": 2, "text": "Energy Storage Demo Environment in\nTechnobothnia\nWhen                 discourses about the benefits that come with renewable energy sources come up,                     one cannot avoid the discussion going into the irregular tendencies of these                         sources especially with refe

In [2]:
# Load datasets
from datasets import load_dataset
ds = load_dataset("json", data_files={"train": "./data/mlx-train.jsonl", "test": "./data/mlx-test.jsonl", "eval": "./data/mlx-eval.jsonl"})

# convert dataset to dataframe
import pandas as pd
df_train = pd.DataFrame(ds['train'])
df_test = pd.DataFrame(ds['test'])
df_eval = pd.DataFrame(ds['eval'])

# convert dataframe to list for mlx
train_list = df_train.to_dict(orient='records')
test_list = df_test.to_dict(orient='records')
eval_list = df_eval.to_dict(orient='records')
print(train_list[0])
print(test_list[0])
print(eval_list[0])



Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

{'conversations': [{'from': 'system', 'value': 'You are a skilled librarian specialized in meticulous cataloguing of digital documents.'}, {'from': 'user', 'value': 'Extract metadata from this document. Return as JSON.\n\n{"pdfinfo": {"creationDate": "D:20130109151801+02\'00\'", "modDate": "D:20130109151805+02\'00\'"}, "pages": [{"page": 1, "text": "Fiscal sustainability projections for Finland 7 December 2012 nen \\u2013 Petri M\\u00e4ki-Fr\\u00e4nti \\u2013 ola Finland has substantial problems with private sector indebtedness. \\u00ad fiscal sustainability. The weak economic A further assumption is that, over the situation in the next few years, combined same period, no new decisions will be with economic growth that will remain made that would improve fiscal sustain\\u00ad subdued also in the long term and rising ability. Hence, the calculations illustrate expenditure pressures denote a consider\\u00ad the pressures to strengthen public able need to strengthen the financial finances

#### * To-Do: Change conversion/convert-to-LLM-dataset to LLama Question-Answer format
