## Fine-tuning Gemma 2B (`google/gemma-2b-it`) for lease agreement data extraction

### Install dependencies

In [1]:
# Install (or upgrade, via -U) the libraries this notebook uses; -q keeps pip's output short.
# NOTE(review): no versions are pinned, so a fresh run may pull newer releases than the
# ones this notebook was last executed with — consider pinning for reproducibility.
%pip install pandas peft scikit-learn transformers datasets torch trl accelerate bitsandbytes huggingface-hub -q -U

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

def messages_from_df(dataframe: pd.DataFrame) -> dict[str, list[list[dict[str, str]]]]:
    """Convert lease-agreement rows into chat-style conversations.

    Every row becomes a two-turn conversation: a ``"user"`` turn containing the
    extraction prompt followed by the row's ``extracted_text``, and an
    ``"assistant"`` turn containing the row's ``extracted_fields``.

    Args:
        dataframe: Frame with ``extracted_text`` and ``extracted_fields``
            columns. ``extracted_text`` must hold strings, because it is
            concatenated onto the prompt.

    Returns:
        ``{"messages": conversations}`` where ``conversations`` holds one
        entry per row (in row order), each entry being a list of two
        ``{"role": ..., "content": ...}`` dicts.

    Raises:
        KeyError: If a non-empty frame lacks one of the two required columns.
    """
    prompt = "Extract lease agreement data from the following text:\n"

    # A frame without rows produces no conversations; return before touching
    # the columns so that a completely empty frame is still accepted.
    if len(dataframe) == 0:
        return {"messages": []}

    # Walk the two needed columns in lockstep instead of materialising a
    # Series per row with iterrows().
    conversations = [
        [
            {"role": "user", "content": prompt + text},
            {"role": "assistant", "content": fields},
        ]
        for text, fields in zip(dataframe["extracted_text"], dataframe["extracted_fields"])
    ]
    return {"messages": conversations}

### Build the dataframe
Build the dataframe from the collected data

In [3]:

import sqlite3

# Path to the SQLite database
db_path = "../output/extracted_lease_agreements.db"

# Query to select all data from the extracted_data table
query = "SELECT * FROM extracted_data"

# Connect to the SQLite database
conn = sqlite3.connect(db_path)
try:
    # Read the whole table into a DataFrame, using the "id" column as the index
    df = pd.read_sql_query(query, conn, index_col="id")
finally:
    # Close the connection even when the query fails (e.g. missing table),
    # so a failed cell run does not leave the database handle open.
    conn.close()

### Split the dataset

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the remaining 80% is the training set
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)

# Halve the held-out rows: one half for evaluation, the other for the final
# test (each roughly 10% of the original data)
eval_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Report how many rows ended up in each split
print(
    f"Training set size: {len(train_df)}",
    f"Evaluation set size: {len(eval_df)}",
    f"Test set size: {len(test_df)}",
    sep="\n",
)


Training set size: 205
Evaluation set size: 26
Test set size: 26


### Load the base model

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import notebook_login
import torch

# Authenticate with the Hugging Face Hub; the checkpoint below is gated
notebook_login()

# Hub identifier of the base checkpoint to fine-tune
model_id = "google/gemma-2b-it"

# Tokenizer for the checkpoint, configured to pad on the right
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"

# Quantization settings: 4-bit NF4 weights with float16 compute
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the causal language model with the quantization settings applied
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

`low_cpu_mem_usage` was None, now set to True since model is quantized.
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Next, let's set up the LoRA config

In [6]:
from peft import LoraConfig, PeftModelForCausalLM

# LoRA adapter settings for causal language modelling:
# rank 16, alpha 16, dropout 0.1
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
)

### Build the datasets

In [7]:
from datasets import Dataset

# Turn each split into a chat-formatted Dataset (one "messages" conversation
# per row), keeping the train / eval / test order
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_dict(messages_from_df(split_df))
    for split_df in (train_df, eval_df, test_df)
)


### Train (fine-tune) the base model

In [8]:
from transformers import TrainingArguments
from trl import SFTConfig, SFTTrainer

# SFTConfig is TRL's TrainingArguments subclass; the SFT-specific options
# (max_seq_length, neftune_noise_alpha) belong here. Passing them to
# SFTTrainer directly triggers the "please use the SFTConfig" deprecation
# warning.
training_args = SFTConfig(
    output_dir="./results",
    # Log, evaluate and checkpoint every 20 optimizer steps
    eval_strategy="steps",
    logging_strategy="steps",
    logging_steps=20,
    eval_steps=20,
    save_steps=20,
    lr_scheduler_type="constant",
    # Effective train batch size per device: 2 * 4 accumulated steps = 8 examples
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=4,
    fp16=True,
    group_by_length=True,
    optim="paged_adamw_32bit",
    # Training length is governed by max_steps alone; the previous
    # num_train_epochs=1 was overridden by it and has been dropped.
    max_steps=100,
    # NOTE(review): sequences are cut at 500 tokens. If the lease text alone is
    # longer than that, the assistant answer may be truncated away — confirm
    # against the token lengths of the dataset.
    max_seq_length=500,
    neftune_noise_alpha=5,
)

trainer = SFTTrainer(
    model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    args=training_args,
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/205 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


### Save the model

In [None]:
# Write the model held by the trainer to the ./finetuned_gemma2 directory.
# NOTE(review): the trainer was built with a peft_config, so this presumably stores
# only the LoRA adapter weights, and the tokenizer is not saved here — confirm
# before loading the result elsewhere.
trainer.model.save_pretrained("./finetuned_gemma2")

### Check model accuracy

In [None]:
# Evaluate on the test set
test_results = trainer.evaluate(test_dataset)

# Print the evaluation results
print("Test results:")
print(f"Loss: {test_results['eval_loss']}")
print(f"Accuracy: {test_results['eval_accuracy']}")