## Fine-tuning Gemma 2 for lease agreement data extraction

### Install dependencies

In [1]:
%pip install -U pandas scikit-learn transformers datasets torch --quiet

Note: you may need to restart the kernel to use updated packages.


### Build the dataframe
Load the extracted lease-agreement text and its extracted fields from the SQLite database into a pandas DataFrame.

In [2]:
import sqlite3
from contextlib import closing

import pandas as pd

# Path to the SQLite database
db_path = "../output/extracted_lease_agreements.db"

# Query to select all data from the extracted_data table
query = "SELECT * FROM extracted_data"

# Read the data into a DataFrame indexed by the table's "id" column.
# closing() releases the connection even if the query raises; sqlite3's own
# context manager only commits/rolls back and does not close the connection.
with closing(sqlite3.connect(db_path)) as conn:
    df = pd.read_sql_query(query, conn, index_col="id")

# Display the DataFrame
df.head()

Unnamed: 0_level_0,extracted_text,extracted_fields
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Name, address and phone number of managing age...","{'tenant_name': 'Yolanda Strobert', 'unit_addr..."
2,dupusit: or 2) return the remaining portion (i...,"{'tenant_name': 'Comunque Bolas', 'unit_addres..."
3,"You'll pay for all other utilities, related de...","{'tenant_name': None, 'unit_address': None, 'u..."
4,APARTMENT LEASE CONTRACT\nNAA NATIONAL APARTME...,"{'tenant_name': 'Dominique Boles', 'unit_addre..."
5,Disposition or Sale. Except for animals and pr...,"{'tenant_name': None, 'unit_address': None, 'u..."


### Split the dataset

In [3]:
from sklearn.model_selection import train_test_split

# Carve off a 20% hold-out first; the remaining 80% becomes the training set.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)

# Halve the hold-out into evaluation and test sets (10% of the original data each).
eval_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Report how many rows ended up in each split.
for split_name, split_df in (
    ("Training", train_df),
    ("Evaluation", eval_df),
    ("Test", test_df),
):
    print(f"{split_name} set size: {len(split_df)}")


Training set size: 205
Evaluation set size: 26
Test set size: 26


### Load the base model

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub id of the base checkpoint to fine-tune
model_name = "google/gemma-2-2b"  # Adjust this if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)

# num_labels configures classification heads and has no meaning for a causal
# language model, so it is deliberately not passed here.
model = AutoModelForCausalLM.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

### Tokenize the datasets

In [6]:
import datasets

# Text placed between the lease text and the expected fields so the model can
# tell where the prompt ends and the answer begins.
PROMPT_SEPARATOR = "\n\nExtracted fields:\n"


# Tokenize the data
def tokenize_function(examples, tok=None, max_length=512):
    """Turn a batch of (lease text, extracted fields) pairs into causal-LM training rows.

    A causal language model is trained on a single token sequence, so each row is
    built as ``[BOS] + text + separator + fields + [EOS]`` and padded on the right
    to ``max_length``. ``labels`` mirrors ``input_ids`` on the ``fields + [EOS]``
    positions only; every prompt and padding position is set to -100, the label
    value the loss ignores, so the model is only trained to produce the fields.

    When a row is too long, the lease text is shortened first so that the target
    fields are kept whenever they fit.

    Args:
        examples: Batch mapping with the list-valued columns ``"extracted_text"``
            and ``"extracted_fields"`` (as supplied by ``Dataset.map(batched=True)``).
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.
        max_length: Fixed length of every returned sequence.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each a
        list containing one ``max_length``-long list of ints per example.

    Raises:
        ValueError: If the tokenizer has neither a pad token nor an EOS token.
    """
    tok = tokenizer if tok is None else tok
    ignore_index = -100

    pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
    if pad_id is None:
        raise ValueError("Tokenizer defines neither a pad token nor an EOS token to pad with.")

    # Special tokens are added by hand so the prompt/target boundary is known exactly.
    prefix = [] if tok.bos_token_id is None else [tok.bos_token_id]
    suffix = [] if tok.eos_token_id is None else [tok.eos_token_id]

    texts = ["" if value is None else str(value) for value in examples["extracted_text"]]
    targets = ["" if value is None else str(value) for value in examples["extracted_fields"]]

    text_ids_batch = tok(texts, add_special_tokens=False)["input_ids"]
    target_ids_batch = tok(targets, add_special_tokens=False)["input_ids"]
    separator_ids = list(tok([PROMPT_SEPARATOR], add_special_tokens=False)["input_ids"][0])

    # Tokens present in every row regardless of the example's content.
    fixed_length = len(prefix) + len(separator_ids) + len(suffix)

    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for text_ids, target_ids in zip(text_ids_batch, target_ids_batch):
        # Give the target priority over the lease text when space is short.
        target_ids = list(target_ids)[: max(max_length - fixed_length, 0)]
        text_budget = max(max_length - fixed_length - len(target_ids), 0)

        prompt_ids = prefix + list(text_ids)[:text_budget] + separator_ids
        completion_ids = target_ids + suffix

        input_ids = (prompt_ids + completion_ids)[:max_length]
        labels = ([ignore_index] * len(prompt_ids) + completion_ids)[:max_length]

        pad_length = max_length - len(input_ids)
        batch["attention_mask"].append([1] * len(input_ids) + [0] * pad_length)
        batch["input_ids"].append(input_ids + [pad_id] * pad_length)
        batch["labels"].append(labels + [ignore_index] * pad_length)

    return batch

# Columns copied from each pandas split, and columns exposed to the trainer afterwards.
source_columns = ['extracted_text', 'extracted_fields']
model_columns = ['input_ids', 'attention_mask', 'labels']

# Wrap each pandas split (text + labels only) in a Hugging Face Dataset.
train_dataset, eval_dataset, test_dataset = (
    datasets.Dataset.from_pandas(frame[source_columns])
    for frame in (train_df, eval_df, test_df)
)

# Run the tokenizer over whole batches of each split (test, then train, then eval).
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Have every tokenized split return torch tensors restricted to the model columns.
for tokenized_split in (
    tokenized_train_dataset,
    tokenized_eval_dataset,
    tokenized_test_dataset,
):
    tokenized_split.set_format(type='torch', columns=model_columns)


Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

### Train (fine-tune) the base model

In [None]:
import torch
from transformers import Trainer, TrainingArguments

# Cap the number of CPU threads PyTorch uses for intra-op parallelism.
num_cores = 4  # Replace with the number of cores you want to use
torch.set_num_threads(num_cores)

# The model is loaded with AutoModelForCausalLM, so the generic Trainer /
# TrainingArguments are used; the Seq2Seq variants are meant for
# encoder-decoder models.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluate on eval_dataset at the end of every epoch
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # effective train batch = 4 * 2 per device
    gradient_checkpointing=True,  # trade extra compute for lower activation memory
    num_train_epochs=3,
    weight_decay=0.01,
    learning_rate=5e-5,  # Start with a learning rate of 5e-5
    warmup_steps=0,  # Adjust this based on your total training steps
    lr_scheduler_type="linear",  # Use a linear scheduler
)

# Both datasets expose input_ids, attention_mask and labels as torch tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

trainer.train()

  return fn(*args, **kwargs)


ValueError: Expected input batch_size (4) to match target batch_size (2048).

### Check model accuracy

In [None]:
# Evaluate on the test set
test_results = trainer.evaluate(tokenized_test_dataset)

# Print the evaluation results
print("Test results:")
print(f"Loss: {test_results['eval_loss']}")

# 'eval_accuracy' only exists when the trainer was given a compute_metrics
# function that returns an "accuracy" entry; indexing it unconditionally
# raises KeyError otherwise.
if "eval_accuracy" in test_results:
    print(f"Accuracy: {test_results['eval_accuracy']}")
else:
    print("Accuracy: not reported (pass a compute_metrics function to the trainer to compute it)")