## Fine-tuning Flan-T5 XL for lease agreement data extraction

### Install dependencies

In [1]:
# Install the notebook's third-party dependencies into the active kernel's environment (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so a fresh run may pick up newer releases than the ones
# this notebook was developed against — consider pinning once a working set is known.
%pip install pandas numpy peft scikit-learn transformers datasets torch accelerate bitsandbytes huggingface-hub -q

Note: you may need to restart the kernel to use updated packages.


### Build the dataframe
Load the collected lease-agreement data (the OCR text and the fields extracted from it) from the SQLite database into a pandas DataFrame.

In [2]:
import pandas as pd
import sqlite3

# Location of the SQLite database holding the collected samples (relative to this notebook)
db_path = "../output/extracted_lease_agreements.db"

# Every row of `extracted_data` becomes one training example
query = "SELECT * FROM extracted_data"

# Open the connection and make sure it is closed even if the query fails
# (e.g. missing table), so a failed cell does not leave the database handle open.
conn = sqlite3.connect(db_path)
try:
    # The table's `id` column becomes the DataFrame index
    df = pd.read_sql_query(query, conn, index_col="id")
finally:
    conn.close()

df.head()

Unnamed: 0_level_0,extracted_text,extracted_fields
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Name, address and phone number of managing age...","{'tenant_name': 'Yolanda Strobert', 'unit_addr..."
2,dupusit: or 2) return the remaining portion (i...,"{'tenant_name': 'Comunque Bolas', 'unit_addres..."
3,"You'll pay for all other utilities, related de...","{'tenant_name': None, 'unit_address': None, 'u..."
4,APARTMENT LEASE CONTRACT\nNAA NATIONAL APARTME...,"{'tenant_name': 'Dominique Boles', 'unit_addre..."
5,Disposition or Sale. Except for animals and pr...,"{'tenant_name': None, 'unit_address': None, 'u..."


### Split the dataset

In [3]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 80% of the rows for training and set the remaining 20% aside
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)

# Stage 2: cut the held-out rows in half -> evaluation and test (each ~10% of the original data)
eval_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Report how many rows ended up in each split
print(
    f"Training set size: {len(train_df)}",
    f"Evaluation set size: {len(eval_df)}",
    f"Test set size: {len(test_df)}",
    sep="\n",
)


Training set size: 205
Evaluation set size: 26
Test set size: 26


### Load the base model

In [4]:
from peft import LoraConfig, TaskType

# LoRA adapter configuration for the Flan-T5 encoder-decoder model.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # T5 is a sequence-to-sequence (encoder-decoder) model
    r=8,  # Rank of the low-rank update matrices
    lora_alpha=16,  # LoRA scaling numerator (the update is scaled by lora_alpha / r)
    lora_dropout=0.1,  # Dropout applied on the LoRA branch
    bias="none",  # Bias terms are left frozen
    # Module-name suffixes that receive an adapter. Only names that exist in the model are
    # listed: the previous list also named 'wq'/'wv'/'wk' (not T5 module names), repeated
    # 'wo', and included 'wi', which matches nothing in Flan-T5. The set below is the one
    # that was effectively being adapted, consistent with the 12,189,696 trainable
    # parameters reported when the adapter is attached.
    # NOTE(review): Flan-T5 uses a gated feed-forward block whose input projections are
    # named 'wi_0' and 'wi_1'; add those if the feed-forward input side should be adapted
    # as well — verify the names with model.named_modules().
    target_modules=[
        'q', 'k', 'v', 'o',  # Attention projections (query, key, value, output)
        'wo',  # Feed-forward output projection
    ],
)


In [5]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from huggingface_hub import notebook_login
from peft import get_peft_model, prepare_model_for_kbit_training
import torch

# Authenticate against the Hugging Face Hub.
# NOTE(review): google/flan-t5-xl appears to be a public (non-gated) checkpoint, so this
# login is presumably only needed for pushing artifacts or private repos — confirm.
notebook_login()

# Checkpoint to fine-tune
model_id = "google/flan-t5-xl"

# Load the frozen base weights in 4-bit NF4 and run the matmuls in float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Release cached GPU memory left over from earlier runs in this kernel
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_id)
# The T5 tokenizer ships with its own dedicated pad token, so it is used as-is.
# (Re-pointing pad_token at eos_token is a workaround for decoder-only models that have no
# pad token; on T5 it only makes padding indistinguishable from end-of-sequence.)
tokenizer.padding_side = 'right'

# Quantized seq2seq base model
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, quantization_config=bnb_config)

# Prepare the quantized model for training, then attach the LoRA adapters from `peft_config`
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 12,189,696 || all params: 2,861,946,880 || trainable%: 0.4259


### Build the datasets

In [6]:
# JSON Schema describing the fields the model is asked to extract from a lease agreement.
# It is interpolated into the prompt, so every description below is text the model sees.
# All fields are nullable and none is required: a page of OCR text may contain none of them.
extraction_json_schema = {
  "title": "ExtractFieldsResult",
  "type": "object",
  "properties": {
    "tenant_name": {
      "type": ["string", "null"],
      "description": "The name of the tenant, found in the OCR text."
    },
    "unit_address": {
      "type": ["string", "null"],
      "description": "The unit address found in the OCR text."
    },
    "unit_number": {
      "type": ["string", "null"],
      "description": "The unit number found in the OCR text."
    },
    "unit_type": {
      "type": ["string", "null"],
      "description": "The unit type found in the OCR text."
    },
    "agreement_date": {
      "type": ["string", "null"],
      "format": "date",
      # Description added for parity with the other date fields
      "description": "The date of the lease agreement, found in the OCR text."
    },
    "lease_start": {
      "type": ["string", "null"],
      "format": "date",
      "description": "The date when the lease starts, found in the OCR text."
    },
    "lease_end": {
      "type": ["string", "null"],
      "format": "date",
      "description": "The date when the lease ends, found in the OCR text."
    },
    "lease_auto_renew": {
      "type": ["string", "null"],
      "description": "The type of lease auto renewal, found in the OCR text."
    },
    "hourly_rate": {
      "type": ["number", "null"],
      "description": "The hourly rate found in the OCR text."
    },
    "monthly_rent": {
      "type": ["number", "null"],
      "description": "The monthly rent found in the OCR text."
    },
    "prorated_rent": {
      "type": ["number", "null"],
      "description": "The prorated rent found in the OCR text."
    },
    "security_deposit": {
      "type": ["number", "null"],
      "description": "The security deposit found in the OCR text."
    },
    "lease_rent": {
      "type": ["number", "null"],
      # Previously a copy of the security_deposit description, which told the model to
      # look for the wrong value.
      "description": "The lease rent found in the OCR text."
    },
    "monthly_payment_breakdown": {
      "type": ["object", "null"],
      "description": "The monthly payment breakdown data found in the OCR text.",
      "additionalProperties": {}
    },
    "utility_charges": {
      "type": ["object", "null"],
      "description": "The utility charges found in the OCR text. This is a dictionary with utility charges as the key, and their price as the value.",
      "additionalProperties": {
        "type": ["number", "null"]
      }
    }
  },
  "required": []
}


In [7]:
from datasets import Dataset

# Prompt sent to the model for every example. It has two str.format placeholders:
#   {extracted_text}          - the OCR text of one lease-agreement sample
#   {extraction_json_schema}  - the schema describing the expected output fields
# The OCR text is placed before the schema and the "### RESPONSE:" marker, so whatever
# truncation is applied at tokenization time cuts the end of the prompt first.
prompt_template = """
### TASK:
You are a data extraction model designed to parse and extract specific information from text. Your task is to extract relevant data fields from the following lease agreement document provided as OCR text.

### INPUT TEXT:
Below is the OCR text from a lease agreement. Use this text to extract the required data fields.

OCR Text:
{extracted_text}

### RESPONSE FORMAT:
The extracted data should be in JSON, adhering to the JSON schema below:

```json
{extraction_json_schema}
```

### REQUIRED OUTPUT:
Provide the extracted data in JSON format, strictly following the schema above.

### RESPONSE:        
"""

def preprocess_function(examples, max_input_length=512, max_target_length=512):
    """Turn a batch of raw rows into tokenized model inputs and labels.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``; the
            ``"extracted_text"`` and ``"extracted_fields"`` columns are read.
        max_input_length: Maximum number of prompt tokens kept (default 512, as before).
        max_target_length: Maximum number of label tokens kept (default 512, as before).

    Returns:
        The tokenizer output for the prompts, with the tokenized targets stored under
        ``"labels"``.
    """
    import json  # local import so this cell does not depend on another cell's imports

    # Render the schema once per batch (it is identical for every example) and as real
    # JSON: formatting the dict directly would place a Python repr (single quotes) inside
    # the prompt's ```json fence.
    schema_json = json.dumps(extraction_json_schema)

    inputs = [
        prompt_template.format(extracted_text=extracted_text, extraction_json_schema=schema_json)
        for extracted_text in examples["extracted_text"]
    ]
    # NOTE(review): with the tokenizer's default right-side truncation only the first
    # `max_input_length` tokens survive. The template puts the OCR text before the schema
    # and the "### RESPONSE:" marker, so long documents lose that tail — consider a larger
    # limit or shortening the OCR text before formatting.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # The "labels" are the tokenized target strings.
    # NOTE(review): the T5 SentencePiece vocabulary is reported to lack '{' and '}' pieces,
    # which would map the braces in these targets to <unk> — confirm with tokenizer.tokenize.
    labels = tokenizer(
        text_target=examples["extracted_fields"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Wrap each pandas split in a Hugging Face Dataset (train, eval, test — in that order)
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, eval_df, test_df)
)

# Tokenize every split in batches; the original columns are kept alongside the new ones
tokenized_train_dataset, tokenized_eval_dataset, tokenized_test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, eval_dataset, test_dataset)
)

train_dataset


Map:   0%|          | 0/205 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Dataset({
    features: ['extracted_text', 'extracted_fields', 'id'],
    num_rows: 205
})

### Train (fine tune) the base model

In [8]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer

# Hyper-parameters for a very short run: 10 optimizer steps, evaluated and checkpointed
# after every step.
training_args = Seq2SeqTrainingArguments(
    # --- where artifacts go / how often they are written ---
    output_dir="./results",
    save_strategy="steps",
    save_steps=1,  # checkpoint after every optimizer step
    # --- run length and learning-rate schedule ---
    max_steps=10,  # takes precedence over num_train_epochs
    warmup_steps=2,
    learning_rate=1e-3,
    lr_scheduler_type="linear",
    optim="paged_adamw_32bit",
    # --- batching: 4 examples x 8 accumulation steps = 32 examples per optimizer step ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,  # recompute activations in the backward pass to save memory
    # --- evaluation and logging cadence ---
    eval_strategy="steps",
    eval_steps=1,  # evaluate after every optimizer step
    logging_strategy="steps",
    logging_steps=2,
    disable_tqdm=False,  # keep the progress bar
    # predict_with_generate=True would be needed for generation-based metrics (ROUGE / BLEU)
    # NOTE(review): use_cpu=True is combined with a 4-bit bitsandbytes model and a paged
    # bitsandbytes optimizer, both of which normally target CUDA — confirm this is intended.
    use_cpu=True,
)

# Pads inputs and labels per batch and returns PyTorch tensors
seq2seq_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="pt",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=seq2seq_collator,
)

# The decoder cache is switched off during training to silence warnings; re-enable it for inference.
model.config.use_cache = False
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


### Save the model

In [None]:
# Write the trained model held by the trainer to disk.
# NOTE(review): trainer.model is the PEFT-wrapped model, so this presumably stores the LoRA
# adapter rather than a full copy of the base weights — confirm before relying on it for deployment.
finetuned_model_dir = "./saved_models/finetuned_flan_t5_xl_lease_data_extraction"
trainer.model.save_pretrained(finetuned_model_dir)

### Evaluate the model

In [None]:
from transformers import Seq2SeqTrainer
import numpy as np
from sklearn.metrics import f1_score

# Take the fine-tuned model and its tokenizer straight from the trainer
model, tokenizer = trainer.model, trainer.tokenizer

# Define evaluation function
def evaluate_model(test_dataset, generation_max_length=512, batch_size=4):
    """Generate predictions for a dataset and return them next to the reference strings.

    Args:
        test_dataset: A ``datasets.Dataset`` with an ``"extracted_fields"`` column. It may be
            raw (it is then tokenized with ``preprocess_function``) or already tokenized.
        generation_max_length: Maximum length of each generated sequence. Without it the
            model's default generation length applies, which is far too short for a full
            set of extracted fields.
        batch_size: Per-device batch size used while predicting.

    Returns:
        ``(decoded_preds, decoded_labels)``: the generated strings and the reference
        ``"extracted_fields"`` strings, in dataset order.
    """
    from transformers import Seq2SeqTrainingArguments

    # The trainer needs token ids; tokenize here if the caller passed the raw dataset.
    if "input_ids" not in test_dataset.column_names:
        test_dataset = test_dataset.map(preprocess_function, batched=True)

    # predict_with_generate makes predict() return generated token ids instead of logits,
    # which is what batch_decode below expects.
    eval_args = Seq2SeqTrainingArguments(
        output_dir="./results",
        per_device_eval_batch_size=batch_size,
        predict_with_generate=True,
        generation_max_length=generation_max_length,
    )
    eval_trainer = Seq2SeqTrainer(
        model=model,
        args=eval_args,
        tokenizer=tokenizer,
        data_collator=trainer.data_collator,
    )

    # Generate predictions
    raw_predictions = eval_trainer.predict(test_dataset)
    predictions = raw_predictions.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Batches of different lengths are joined with -100 padding, which is not a valid
    # token id; swap it for the pad token so decoding does not fail.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Reference strings, untouched by tokenization
    decoded_labels = list(test_dataset["extracted_fields"])

    return decoded_preds, decoded_labels

# Helper function to compute F1 score
def compute_f1_scores(preds, labels):
    """Average per-example field agreement between predictions and references.

    Each prediction/label is a serialized mapping of field name -> value, written either
    as JSON or as a Python dict literal. For every example the fields present in both
    mappings are compared, and the example's score is the fraction of those fields whose
    values are equal. That fraction is what micro-averaged F1 reduces to when each field
    has exactly one value, and computing it directly also works for ``None``, mixed
    string/number values and nested dicts.

    Args:
        preds: Iterable of predicted mappings (strings, or already-parsed dicts).
        labels: Iterable of reference mappings, aligned with ``preds``.

    Returns:
        Mean score over the scored examples as a float, or 0.0 if nothing could be scored.
        A prediction that cannot be parsed scores 0.0; an example whose parsed prediction
        shares no field with its label is skipped.

    Raises:
        ValueError: If a label cannot be parsed into a mapping.
    """
    import ast
    import json

    def _parse_fields(text):
        """Parse a serialized mapping without executing it; return None if it is not one."""
        if isinstance(text, dict):
            return text
        if not isinstance(text, str):
            return None
        # Model output is untrusted text: use data-only parsers, never eval().
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, dict):
                return parsed
        return None

    f1_scores = []
    for pred, label in zip(preds, labels):
        label_dict = _parse_fields(label)
        if label_dict is None:
            raise ValueError(f"Label is not a parseable field mapping: {label!r}")

        pred_dict = _parse_fields(pred)
        if pred_dict is None:
            # Unparseable output extracted nothing usable; it must not be silently dropped
            # from the average.
            f1_scores.append(0.0)
            continue

        # Only fields present on both sides can be compared
        common_keys = pred_dict.keys() & label_dict.keys()
        if common_keys:
            matches = sum(pred_dict[key] == label_dict[key] for key in common_keys)
            f1_scores.append(matches / len(common_keys))

    # Return the average score across all scored examples
    return float(np.mean(f1_scores)) if f1_scores else 0.0

# Define compute_metrics function
def compute_metrics(eval_pred):
    """Summarize prediction quality as exact-match rate and field-level F1.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of aligned sequences.

    Returns:
        Dict with ``"exact_match"`` (share of predictions equal to their label) and
        ``"f1_score"`` (the value returned by ``compute_f1_scores``).
    """
    predictions, labels = eval_pred

    # One boolean per example: does the prediction equal its label exactly?
    is_exact = [pred == label for pred, label in zip(predictions, labels)]

    return {
        "exact_match": np.mean(is_exact),
        "f1_score": compute_f1_scores(predictions, labels),
    }

# Perform evaluation on the tokenized test split: the trainer needs the token ids, and the
# tokenized dataset still carries the original "extracted_fields" column used as reference.
decoded_preds, decoded_labels = evaluate_model(tokenized_test_dataset)

# Print a few examples of predictions vs actual.
# Slicing (rather than range(5)) keeps this safe when fewer than 5 examples are available;
# change the slice bound to see more/less examples.
for i, (pred, actual) in enumerate(zip(decoded_preds[:5], decoded_labels[:5])):
    print(f"Prediction {i}: {pred}")
    print(f"Actual {i}: {actual}")
    print("-----")

# Calculate exact match and field-level F1 over the whole test split
metrics = compute_metrics((decoded_preds, decoded_labels))
print(metrics)


Input IDs generated
True output:  {'tenant_name': None, 'unit_address': '333 H Street, Ste. 5000, Chula Vista, CA 91910', 'unit_number': None, 'unit_type': None, 'agreement_date': None, 'lease_start': None, 'lease_end': '2024-03-31', 'lease_auto_renew': None, 'hourly_rate': None, 'monthly_rent': 3350.0, 'prorated_rent': 2121.67, 'security_deposit': 2500.0, 'lease_rent': None, 'monthly_payment_breakdown': {'Liability to Landlord Insurance': 9.5, 'Admin Fee - Liability to Landlord Insurance': 3.0, 'Rent Income': 3350.0, 'Total': 3362.5}, 'utility_charges': None}
Finetuned output:  ### USER:
 Extract lease agreement data in JSON format from the following OCR text: amount of 1/30th of the monthly rent per day.
MONTHLY CHARGES:
Liability to Landlord Insurance
$9.50
Admin Fee - Liability to Landlord Insurance
$3.00
Rent Income
$3,350.00
Total:
$3,362.50
PRORATED RENT: (If Applicable) :selected: (If checked) The tenancy did not start on the 1st of the month therefore the resident is to pay th

ValueError: Got a string but expected a list instead: '### USER:
 Extract lease agreement data in JSON format from the following OCR text: amount of 1/30th of the monthly rent per day.
MONTHLY CHARGES:
Liability to Landlord Insurance
$9.50
Admin Fee - Liability to Landlord Insurance
$3.00
Rent Income
$3,350.00
Total:
$3,362.50
PRORATED RENT: (If Applicable) :selected: (If checked) The tenancy did not start on the 1st of the month therefore the resident is to pay the following prorated rent at time of move-in:
PRORATED CHARGES:
Liability to Landlord Insurance
$9.50
Admin Fee - Liability to Landlord Insurance
$3.00
Rent Income
$2,121.67
Total:
$2,134.17
C :unselected: (If checked) in addition to the prorated rent listed above, rent for the following month is due at time of move-in. This amount equals the RENT plus ADDITIONAL CHARGES as listed above.
LATE CHARGE (Applied if payments have not been received within 3 days of their due date): $125.00 SECURITY DEPOSIT: $ $2,500.00
PAYMENT INSTRUCTIONS: All amounts due Landlord are payable to HeetWave Properties | 333 H Street, Ste. 5000, Chula Vista, CA 91910 | (888)557-HEET (4338). Payment must be made by: Money Order, Cashiers Check, Electronic Payment via Resident Portal, or Personal Check- No personal checks will be accepted after the 10th day of the month or in response to a notice to pay rent or quit or a notice to perform covenant or quit requiring payment. The normal hours available to make payments in person are from 8:30am to 5:00pm, on all non-holiday Weekdays, n/a Saturdays, and n/a Sundays. NOTE: There is no drop by available at this address above.
Landlord may, but is not required, to accept payments electronically or by credit card, either directly or through a third party payment service system. Residents interested in these payment methods should request information about Landlord's current electronic and credit card payment acceptance policy from the management office. See the Payment Detail section below.
1.5 RENT CONCESSIONS
(If checked) RENT CONCESSIONS: Resident is granted a $3,350 one-time concession Move in Special of :selected: One Month Free Rent as incentive for signing a 13 months lease. Concession to be posted to your account at move-in. The monthly Rent identified above is the amount due before application of the rent concession.
Rent Concession is subject to fulfillment of lease term in section 1.2. If lease is terminated prior to lease end date of 03/31/2024 , including if early termination option is chosen, concession amount of $3,350 will immediately become due back to Landlord.
2
### ASSISTANT:
The lease agreement data is not provided in the context, therefore I cannot extract the requested data from the context.'