## Finetuning Gemma2 for lease agreement data extraction

### Install dependencies

In [24]:
%pip install pandas peft scikit-learn transformers datasets torch trl accelerate bitsandbytes huggingface-hub -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


### Build the dataframe
Build the dataframe from the collected data

In [25]:

import sqlite3

# Path to the SQLite database
db_path = "../output/extracted_lease_agreements.db"

# Connect to the SQLite database
conn = sqlite3.connect(db_path)

# Query to select all data from the extracted_data table
query = "SELECT * FROM extracted_data"

# Read the data into a DataFrame
df = pd.read_sql_query(query, conn, index_col="id")

# Close the database connection
conn.close()

df.head()

Unnamed: 0_level_0,extracted_text,extracted_fields
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Name, address and phone number of managing age...","{'tenant_name': 'Yolanda Strobert', 'unit_addr..."
2,dupusit: or 2) return the remaining portion (i...,"{'tenant_name': 'Comunque Bolas', 'unit_addres..."
3,"You'll pay for all other utilities, related de...","{'tenant_name': None, 'unit_address': None, 'u..."
4,APARTMENT LEASE CONTRACT\nNAA NATIONAL APARTME...,"{'tenant_name': 'Dominique Boles', 'unit_addre..."
5,Disposition or Sale. Except for animals and pr...,"{'tenant_name': None, 'unit_address': None, 'u..."


### Split the dataset

In [26]:
from sklearn.model_selection import train_test_split

# First split: 80% for training, 20% for temp (which will be split into eval and test)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Second split: 50% of temp for eval and 50% for test (10% of the original data each)
eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Check the sizes of the splits
print(f"Training set size: {len(train_df)}")
print(f"Evaluation set size: {len(eval_df)}")
print(f"Test set size: {len(test_df)}")


Training set size: 205
Evaluation set size: 26
Test set size: 26


### Load the base model

In [30]:

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import notebook_login
import torch

# login to access gated model
notebook_login()

# Load the tokenizer and model
model_id = "google/gemma-2b-it"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# empty GPU memory
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)  # Change num_labels based on your task


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Next, let's setup LORA config

In [31]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha = 8 * 2, # increased alpha
    lora_dropout=0.1, # decreased dropout
    r=8, # increased rank
    task_type='CAUSAL_LM',
    bias="none",
    target_modules=['k_proj', 'q_proj', 'gate_proj', 'o_proj', 'v_proj', 'down_proj', 'up_proj'], # target as many layers as possible
)

### Build the datasets

In [32]:
from datasets import Dataset

# Create datasets with the extracted text and labels
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset


Dataset({
    features: ['extracted_text', 'extracted_fields', 'id'],
    num_rows: 205
})

### Train (fine tune) the base model

In [36]:
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer

def formatting_func(example):
    output_texts = []
    for i in range(len(example['extracted_text'])):
        text= f"### USER:\n Extract lease agreement data in JSON format from the following OCR text: {example['extracted_text'][i]}\n### ASSISTANT:\n{example['extracted_fields'][i]}"
        output_texts.append(text)
    return output_texts

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Changed to evaluate based on steps
    eval_steps=1,  # Evaluate every 50 steps
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    logging_steps=2,
    warmup_steps = 2,
    max_steps = 10,
    use_cpu=True,
    save_strategy="steps",
    save_steps=1,  # Save a checkpoint every 1 step
    optim="paged_adamw_32bit",
)

trainer = SFTTrainer(
    model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    neftune_noise_alpha=5,
    max_seq_length=512,
    args=training_args,
    formatting_func=formatting_func,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # No masked language modeling, suitable for CausalLM
        return_tensors="pt"  # Return PyTorch tensors
    )
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/205 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


### Save the model

In [None]:
trainer.model.save_pretrained("./saved_models/finetuned_phi3_lease_data_extraction")

### Check model accuracy

In [None]:
from peft import PeftModelForCausalLM
from datasets import load_metric

finetuned_model = PeftModelForCausalLM.from_pretrained(model=model, model_id="./saved_models/finetuned_phi3_lease_data_extraction")

# Set the model to evaluation mode
finetuned_model.eval()

# Initialize BLEU metric
bleu_metric = load_metric("bleu", trust_remote_code=True)

# Prepare lists to store outputs and true labels
finetuned_outputs = []
normal_outputs = []
true_labels = []

def format_prompt(message):
    return f"### USER:\n Extract lease agreement data in JSON format from the following OCR text: {message['extracted_text']}\n### ASSISTANT:"

# Disable gradient calculation for evaluation
with torch.no_grad():
    for message in test_dataset:
        # Prepare input using apply_chat_template
        input_ids = tokenizer.apply_chat_template(format_prompt(message), add_generation_prompt=True, tokenize=True, return_tensors="pt").to("cuda")
        print("Input IDs generated")
        # Get the true output
        true_output = message['extracted_fields']
        print("True output: ", true_output)
        true_labels.append(true_output)

        # Generate output from the fine-tuned model
        outputs_finetuned = finetuned_model.generate(input_ids=input_ids, max_new_tokens=1024, do_sample=False)
        finetuned_output = tokenizer.decode(outputs_finetuned[0]).split('<start_of_turn>model\n')[-1]
        print("Finetuned output: ", finetuned_output)
        finetuned_outputs.append(finetuned_output)

        # Generate output from the normal model
        outputs_normal = model.generate(input_ids=input_ids, max_new_tokens=1024, do_sample=False)
        normal_output = tokenizer.decode(outputs_normal[0]).split('<start_of_turn>model\n')[-1]
        print("Normal output: ", normal_output)
        normal_outputs.append(normal_output)

# Calculate BLEU scores
bleu_score_finetuned = bleu_metric.compute(predictions=finetuned_outputs, references=[[true] for true in true_labels])
bleu_score_normal = bleu_metric.compute(predictions=normal_outputs, references=[[true] for true in true_labels])

# Print BLEU scores
print(f"Finetuned Model BLEU Score: {bleu_score_finetuned['bleu']:.4f}")
print(f"Normal Model BLEU Score: {bleu_score_normal['bleu']:.4f}")

# Print generated outputs for comparison
for finetuned_output, normal_output, true_output in zip(finetuned_outputs, normal_outputs, true_labels):
    print("Finetuned: " + finetuned_output)
    print("Normal   : " + normal_output)
    print("True     : " + true_output)
    print("-" * 40)

  bleu_metric = load_metric("bleu", trust_remote_code=True)


Input IDs generated
True output:  {'tenant_name': None, 'unit_address': '333 H Street, Ste. 5000, Chula Vista, CA 91910', 'unit_number': None, 'unit_type': None, 'agreement_date': None, 'lease_start': None, 'lease_end': '2024-03-31', 'lease_auto_renew': None, 'hourly_rate': None, 'monthly_rent': 3350.0, 'prorated_rent': 2121.67, 'security_deposit': 2500.0, 'lease_rent': None, 'monthly_payment_breakdown': {'Liability to Landlord Insurance': 9.5, 'Admin Fee - Liability to Landlord Insurance': 3.0, 'Rent Income': 3350.0, 'Total': 3362.5}, 'utility_charges': None}
Finetuned output:  This context does not provide any lease agreement data, so I cannot extract the requested information from the context.<eos>
Normal output:  This context does not provide any lease agreement data, so I cannot extract the requested information from the context.<eos>
Input IDs generated
True output:  {'tenant_name': 'Saray Ramos Gutierrez', 'unit_address': 'Crystal Terrace', 'unit_number': None, 'unit_type': None

KeyboardInterrupt: 