## Finetuning Flan T5 XL for lease agreement data extraction

### Install dependencies

In [1]:
%pip install pandas numpy peft scikit-learn transformers datasets torch accelerate bitsandbytes huggingface-hub -q

Note: you may need to restart the kernel to use updated packages.


### Define the extraction JSON schema

In [6]:
import json

extraction_json_schema = {
  "title": "ExtractedLeaseData",
  "type": "object",
  "properties": {
    "tenant_name": {
      "type": ["string", "null"],
      "description": "The name of the tenant, found in the OCR text."
    },
    "unit_address": {
      "type": ["string", "null"],
      "description": "The unit address found in the OCR text."
    },
    "unit_number": {
      "type": ["string", "null"],
      "description": "The unit number found in the OCR text."
    },
    "unit_type": {
      "type": ["string", "null"],
      "description": "The unit type found in the OCR text."
    },
    "agreement_date": {
      "type": ["string", "null"],
      "format": "date"
    },
    "lease_start": {
      "type": ["string", "null"],
      "format": "date",
      "description": "The date when the lease starts, found in the OCR text."
    },
    "lease_end": {
      "type": ["string", "null"],
      "format": "date",
      "description": "The date when the lease ends, found in the OCR text."
    },
    "lease_auto_renew": {
      "type": ["string", "null"],
      "description": "The type of lease auto renewal, found in the OCR text."
    },
    "hourly_rate": {
      "type": ["number", "null"],
      "description": "The hourly rate found in the OCR text."
    },
    "monthly_rent": {
      "type": ["number", "null"],
      "description": "The monthly rent found in the OCR text."
    },
    "prorated_rent": {
      "type": ["number", "null"],
      "description": "The prorated rent found in the OCR text."
    },
    "security_deposit": {
      "type": ["number", "null"],
      "description": "The security deposit found in the OCR text."
    },
    "lease_rent": {
      "type": ["number", "null"],
      "description": "The security deposit found in the OCR text."
    },
    "monthly_payment_breakdown": {
      "type": ["object", "null"],
      "description": "The monthly payment breakdown data found in the OCR text.",
      "additionalProperties": {}
    },
    "utility_charges": {
      "type": ["object", "null"],
      "description": "The utility charges found in the OCR text. This is a dictionary with utility charges as the key, and their price as the value.",
      "additionalProperties": {
        "type": ["number", "null"]
      }
    }
  },
  "required": ["tenant_name", "unit_address", "unit_number", "unit_type", "agreement_date", "lease_start", "lease_end", "lease_auto_renew", "hourly_rate", "monthly_rent", "prorated_rent", "security_deposit", "lease_rent", "monthly_payment_breakdown", "utility_charges"]
}

extraction_json_schema_str = json.dumps(extraction_json_schema, indent=2)

### Build the Dataset
Build the dataset from the collected data in the Sqlite3 database

In [1]:
import pandas as pd
import sqlite3

# Path to the SQLite database
db_path = "../output/extracted_lease_agreements.db"

# Connect to the SQLite database
conn = sqlite3.connect(db_path)

# Query to select all data from the extracted_data table
query = "SELECT * FROM extracted_data"

# Read the data into a DataFrame
df = pd.read_sql_query(query, conn, index_col="id")

# Close the database connection
conn.close()

df

Unnamed: 0_level_0,extracted_text,extracted_fields
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"22.PARKING. We may regulate the time, manner, ...","{""tenant_name"": null, ""unit_address"": null, ""u..."
2,38.MISCELLANEOUS.\nA. Exercising one remedy wo...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
3,"You'll pay for all other utilities, related de...","{""tenant_name"": null, ""unit_address"": null, ""u..."
4,Disposition or Sale. Except for animals and pr...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
5,dupusit: or 2) return the remaining portion (i...,"{""tenant_name"": ""Comunque Bolas"", ""unit_addres..."
...,...,...
362,36. OTHER CHARGES\nResident agrees to pay Mana...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
363,Resident shall not deliberately or negligently...,"{""tenant_name"": ""Taylor Whittle"", ""unit_addres..."
364,21. WAIVER OF VENUE\nIn the event that Managem...,"{""tenant_name"": ""Taylor Whittle"", ""unit_addres..."
365,41. DRUG FREE HOUSING / CRIMINAL ACTIVITY\nRes...,"{""tenant_name"": null, ""unit_address"": null, ""u..."


#### Preprocess the dataset
We need to add in missing fields in the dataset

In [None]:
# Function to ensure all required fields are present in the extracted_fields column
def ensure_all_fields(extracted_fields, required_fields):
    # Convert the extracted_fields string to a dictionary
    extracted_data = json.loads(extracted_fields)
    
    # Check and add any missing fields with a value of None
    for field in required_fields:
        if field not in extracted_data:
            extracted_data[field] = None
    
    # Convert the dictionary back to a JSON string
    return json.dumps(extracted_data)

# List of required fields from the extraction_json_schema
extraction_fields = list(extraction_json_schema['properties'].keys())

# Apply the function to each row in the extracted_fields column
df['extracted_fields'] = df['extracted_fields'].apply(ensure_all_fields, required_fields=extraction_fields)

# Display the updated DataFrame
df

Unnamed: 0_level_0,extracted_text,extracted_fields
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"22.PARKING. We may regulate the time, manner, ...","{""tenant_name"": null, ""unit_address"": null, ""u..."
2,38.MISCELLANEOUS.\nA. Exercising one remedy wo...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
3,"You'll pay for all other utilities, related de...","{""tenant_name"": null, ""unit_address"": null, ""u..."
4,Disposition or Sale. Except for animals and pr...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
5,dupusit: or 2) return the remaining portion (i...,"{""tenant_name"": ""Comunque Bolas"", ""unit_addres..."
...,...,...
362,36. OTHER CHARGES\nResident agrees to pay Mana...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
363,Resident shall not deliberately or negligently...,"{""tenant_name"": ""Taylor Whittle"", ""unit_addres..."
364,21. WAIVER OF VENUE\nIn the event that Managem...,"{""tenant_name"": ""Taylor Whittle"", ""unit_addres..."
365,41. DRUG FREE HOUSING / CRIMINAL ACTIVITY\nRes...,"{""tenant_name"": null, ""unit_address"": null, ""u..."


### Split the dataset

In [2]:
from sklearn.model_selection import train_test_split

# First split: 80% for training, 20% for temp (which will be split into eval and test)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Second split: 50% of temp for eval and 50% for test (10% of the original data each)
eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Check the sizes of the splits
print(f"Training set size: {len(train_df)}")
print(f"Evaluation set size: {len(eval_df)}")
print(f"Test set size: {len(test_df)}")


ImportError: initialization failed

### Load the base model

In [3]:
from peft import LoraConfig, TaskType

peft_config = LoraConfig(
    lora_alpha=16, # Higher alpha to match larger model capacity
    lora_dropout=0.1, # Consistent dropout rate to prevent overfitting
    r=8, # Rank, kept the same for balance between performance and efficiency
    task_type=TaskType.SEQ_2_SEQ_LM, # Change to SEQ_2_SEQ_LM for seq2seq models
    bias="none", # Keeping bias as none, similar to your original setup
    target_modules=[
        'q', 'v', 'k', 'o', # Attention layers (query, value, key, output projections)
        'wi', 'wo', # Feedforward layers (input, output projections)
        'wq', 'wv', 'wk', 'wo', # Additional T5-specific projection layers
    ], # Target modules relevant to T5 architecture
)


In [4]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from huggingface_hub import notebook_login
from peft import get_peft_model, prepare_model_for_kbit_training
import torch

# login to access gated model
notebook_login()

# Load the tokenizer and model
model_id = "google/flan-t5-xl"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# empty GPU memory
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, quantization_config=bnb_config)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 12,189,696 || all params: 2,861,946,880 || trainable%: 0.4259


### Build the datasets

In [14]:
prompt_template = """
### TASK:
You are a specialized model for extracting specific information from lease agreement text. Your goal is to accurately extract data fields from the provided OCR text of a lease agreement. Additionally, correct any obvious OCR errors you encounter during extraction.

### INPUT TEXT:
Below is the OCR text extracted from a lease agreement. Carefully analyze this text, and extract the relevant data fields.

OCR Text:
```
{extracted_text}
```

### RESPONSE FORMAT:
Return the extracted data as a JSON object, adhering strictly to the following JSON schema:

```json
{extraction_json_schema_str}
```
"""

In [6]:
from datasets import Dataset

def preprocess_function(examples):
    inputs = [prompt_template.format(extracted_text=extracted_text, extraction_json_schema_str=extraction_json_schema_str) for extracted_text in examples["extracted_text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # The "labels" are the tokenized outputs:
    labels = tokenizer(
        text_target=examples["extracted_fields"], 
        max_length=512,         
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Create datasets with the extracted text and labels
train_dataset = Dataset.from_pandas(train_df)
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = Dataset.from_pandas(eval_df)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)
test_dataset = Dataset.from_pandas(test_df)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

train_dataset


Map:   0%|          | 0/205 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Dataset({
    features: ['extracted_text', 'extracted_fields', 'id'],
    num_rows: 205
})

### Train (fine tune) the base model

In [8]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Changed to evaluate based on steps
    eval_steps=10,  # Evaluate every 10 steps
    learning_rate=1e-3,
    lr_scheduler_type="linear",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    logging_strategy="steps",
    logging_steps=10,
    disable_tqdm=False,  # Ensure tqdm progress bar is enabled
    max_steps = 2000, # increase max steps
    save_strategy="steps",
    save_steps=50,  # Save a checkpoint every 10 steps
    # predict_with_generate=True, # need this for ROUGE/ BLEU metrics
    optim="paged_adamw_32bit",
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        return_tensors="pt",  # Return PyTorch tensors
    )
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
1,No log,1.282624
2,1.433800,1.145782
3,1.433800,1.035312
4,1.246500,0.93874
5,1.246500,0.851025
6,1.069000,0.777541
7,1.069000,0.720084
8,0.934500,0.678166
9,0.934500,0.649756
10,0.787200,0.635124


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

TrainOutput(global_step=10, training_loss=1.0941948771476746, metrics={'train_runtime': 19618.0545, 'train_samples_per_second': 0.016, 'train_steps_per_second': 0.001, 'total_flos': 2722954752294912.0, 'train_loss': 1.0941948771476746, 'epoch': 1.5384615384615383})

### Save the model

In [9]:
peft_model_repo_id = "aryaniyaps/finetuned_flan_t5_xl_lease_data_extraction_40_deals"

In [13]:
# Push the PEFT model (LoRA adapters) to Hugging Face Hub
trainer.model.save_pretrained("./saved_models/finetuned_flan_t5_xl_lease_data_extraction_200")

trainer.model.push_to_hub(peft_model_repo_id)

### Evaluate the model

In [11]:
from peft import PeftModel
 
# Load the Lora model
finetuned_model = PeftModel.from_pretrained(model, peft_model_repo_id, device_map={"":0})
finetuned_model.eval()
 
print("PEFT model loaded")

PEFT model loaded


Let's try data extraction with a random sample

In [None]:
from random import randrange
from tabulate import tabulate

sample = test_dataset[randrange(len(test_dataset))]
 
input_ids = tokenizer(prompt_template.format(extracted_text=sample["extracted_text"], extraction_json_schema_str=extraction_json_schema_str), return_tensors="pt", truncation=True).input_ids.cuda()
with torch.no_grad():
    outputs = finetuned_model.generate(input_ids=input_ids, max_new_tokens=512)


In [None]:
prediction = tokenizer.decode(outputs[0].detach().cpu().numpy())

table_data = [[sample['extracted_text'], prediction]]

# Define table headers
headers = ["OCR text", "Extracted data (Finetuned model)"]

# Display the table
tabulate(table_data, headers=headers, tablefmt="html", showindex=False)

OCR text,Extracted data (Finetuned model)
"POURL HOUSIMO OPPORTUNITY ADDITIONAL SPECIAL PROVISIONS NAA NATIONAL APARTMENT ASSOCIATION We Lead the Way Home DWELLING UNIT DESCRIPTION. Unit No. 2557E 2557 Mountain Lodge Circle (street address) in Vestavia Hills (city), Alabama, 35216 (zip code). LEASE CONTRACT DESCRIPTION. Lease Contract date: April 27, 2023 Owner's Name: Mountain BAL LLC Residents (list all residents): Gary Smith LEASE END DATE: If the resident decides to vacate at the end of the lease agreement, the term of his lease will end at 12:00 noon on the date stipulated in clause 3, not midnight. GUEST: Guest must be registered with the management office and are permitted for no more than 14 consecutive days or a cumulative total of 30 days for the lease term. The undersigned lessee agrees that the lessor shall have the right to prohibit with a 24 hour written notice to lessee, any and all guests, visitors and invites of lessee from entering into either lessee's apartment, property grounds or any common area which lessee's apartment forms part of. Lessee understands and agrees that the granting of access or any right of visitation or entry to lessee's apartment, grounds or common area, after having received a 24 hour notice shall be grounds for termination of lessee's tenancy. LEASE ABANDONMENT: In the event, the apartment is abandoned or otherwise vacated before the lease expiration date and a lease contract buyout agreement has not been approved or executed; resident will be assessed a lease abandonment fee equal to two months' rent. Resident's failure to provide a required written notice to vacate shall result in one but not both of the following: an assessed insufficient notice fee to cover the notice period for an early move out re letting charge as may be permitted by the lease contract. HOLDOVER: Residents who hold over after the termination of this lease, the tenancy shall thereafter be from month to month in the absence of any written agreements to the contrary. During any such holdover period, the terms and conditions of this lease shall remain in full force and effect. Month to month rent shall be based on current Yieldstar month to month pricing. CREDIT REPORTING: Resident is hereby notified that we reserve the right to furnish residents housing and payment history to 3rd party consumer or credit reporting agencies during and after the term of residency. LEASE ADMINISTRATION: Resident hereby agrees to pay a one-time non-refundable lease administration fee of $200. ATTORNEY FEES: Resident hereby agrees to pay attorney fees that result from a breach of the lease agreement as provided by the statute. All transfers will require the resident to be in good standing and with no eviction filings for the previous year. Additionally, all transfers will require an inspection of the current apartment, and any damages found must be paid prior to transfer. Prior approval from the Regional Manager is required for all transfers. A $400 transfer fee will be required and paid prior to transfer. CISTO Resident(s) (All residents must sign) Date of Signing Addendum 04/27/2023 Owner or Owner's Representative Yvette Kidd Date of Signing Addendum 04/28/2023 Alabama/National Apartment Association Official Form, July 2018 2018, National Apartment Association, Inc. :selected: Blue Moon eSignature Services Document ID: 371007951","DWELLING_UNIT_DESCRIPTION, GUEST, LEASE_END_DATE, GUEST, LEASE_ABANDONMENT, GUEST, GUEST_PERMITTED, LEASE_ABANDONMENT, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED"


### Evaluate the model
(Using F1 score and exact matches)

In [None]:
from tqdm import tqdm

def evaluate_peft_model(sample, max_target_length=512):
    # Generate extracted data
    input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).cuda()
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, max_new_tokens=max_target_length)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy())
    
    # Decode eval sample
    labels = tokenizer.decode(sample['labels'])
 
    return prediction, labels

# Run predictions
predictions, references = [], []
for sample in tqdm(tokenized_test_dataset):
    prediction, reference = evaluate_peft_model(sample)
    predictions.append(prediction)
    references.append(reference)


  0%|                                                                                                                             | 0/37 [00:00<?, ?it/s]


  3%|███▏                                                                                                                 | 1/37 [00:33<19:51, 33.10s/it]


  5%|██████▎                                                                                                              | 2/37 [00:37<09:31, 16.33s/it]


  8%|█████████▍                                                                                                           | 3/37 [00:41<06:00, 10.60s/it]


 11%|████████████▋                                                                                                        | 4/37 [01:07<09:10, 16.69s/it]


 14%|███████████████▊                                                                                                     | 5/37 [01:40<12:00, 22.51s/it]


 16%|██████████████████▉                                                                                                  | 6/37 [02:13<13:25, 25.99s/it]


 19%|██████████████████████▏                                                                                              | 7/37 [02:15<09:04, 18.15s/it]


 22%|█████████████████████████▎                                                                                           | 8/37 [02:47<11:01, 22.81s/it]


 24%|████████████████████████████▍                                                                                        | 9/37 [03:20<12:06, 25.93s/it]


 27%|███████████████████████████████▎                                                                                    | 10/37 [03:33<09:54, 22.03s/it]


 30%|██████████████████████████████████▍                                                                                 | 11/37 [03:34<06:44, 15.57s/it]


 32%|█████████████████████████████████████▌                                                                              | 12/37 [04:07<08:40, 20.82s/it]


 35%|████████████████████████████████████████▊                                                                           | 13/37 [04:40<09:46, 24.45s/it]


 38%|███████████████████████████████████████████▉                                                                        | 14/37 [05:13<10:20, 26.98s/it]


 41%|███████████████████████████████████████████████                                                                     | 15/37 [05:15<07:09, 19.51s/it]


 43%|██████████████████████████████████████████████████▏                                                                 | 16/37 [05:48<08:13, 23.49s/it]


 46%|█████████████████████████████████████████████████████▎                                                              | 17/37 [05:49<05:34, 16.72s/it]


 49%|████████████████████████████████████████████████████████▍                                                           | 18/37 [05:51<03:53, 12.30s/it]


 51%|███████████████████████████████████████████████████████████▌                                                        | 19/37 [06:24<05:32, 18.45s/it]


 54%|██████████████████████████████████████████████████████████████▋                                                     | 20/37 [06:56<06:27, 22.77s/it]


 57%|█████████████████████████████████████████████████████████████████▊                                                  | 21/37 [07:29<06:52, 25.77s/it]


 59%|████████████████████████████████████████████████████████████████████▉                                               | 22/37 [08:02<06:57, 27.86s/it]


 62%|████████████████████████████████████████████████████████████████████████                                            | 23/37 [08:03<04:36, 19.72s/it]


 65%|███████████████████████████████████████████████████████████████████████████▏                                        | 24/37 [08:11<03:32, 16.38s/it]


 68%|██████████████████████████████████████████████████████████████████████████████▍                                     | 25/37 [08:30<03:26, 17.20s/it]


 70%|█████████████████████████████████████████████████████████████████████████████████▌                                  | 26/37 [08:52<03:24, 18.59s/it]


 73%|████████████████████████████████████████████████████████████████████████████████████▋                               | 27/37 [09:19<03:29, 20.93s/it]


 76%|███████████████████████████████████████████████████████████████████████████████████████▊                            | 28/37 [09:30<02:41, 17.97s/it]


 78%|██████████████████████████████████████████████████████████████████████████████████████████▉                         | 29/37 [10:02<02:58, 22.34s/it]


 81%|██████████████████████████████████████████████████████████████████████████████████████████████                      | 30/37 [10:35<02:57, 25.38s/it]


 84%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                  | 31/37 [10:56<02:25, 24.19s/it]


 86%|████████████████████████████████████████████████████████████████████████████████████████████████████▎               | 32/37 [11:28<02:13, 26.66s/it]


 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 33/37 [12:01<01:53, 28.39s/it]


 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 34/37 [12:10<01:07, 22.65s/it]


 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋      | 35/37 [12:43<00:51, 25.59s/it]


 97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 36/37 [12:46<00:18, 18.90s/it]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37/37 [12:48<00:00, 13.94s/it]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37/37 [12:48<00:00, 20.78s/it]




In [None]:
from tabulate import tabulate

# Prepare data for tabulation
table_data = []
for i, (pred, ref) in enumerate(zip(predictions, references), 1):
    table_data.append([i, ref, pred])

# Define table headers
headers = ["#", "Reference data", "Extracted data (Finetuned model)"]

# Display the table
tabulate(table_data, headers=headers, tablefmt="html", showindex=False)


#,Reference data,Extracted data (Finetuned model)
1,"""tenant_name"": ""Gary Smith"", ""unit_address"": ""2557 Mountain Lodge Circle, Vestavia Hills, Alabama, 35216"", ""unit_number"": ""2557E"", ""unit_type"": null, ""agreement_date"": ""2023-04-27"", ""lease_start"": ""2023-04-27"", ""lease_end"": null, ""lease_auto_renew"": ""Month to month"", ""hourly_rate"": null, ""monthly_rent"": null, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": null, ""utility_charges"": null","DWELLING_UNIT_DESCRIPTION, GUEST, LEASE_END_DATE, GUEST, LEASE_ABANDONMENT, GUEST, GUEST_PERMITTED, LEASE_ABANDONMENT, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED, GUEST_PERMITTED"
2,"""tenant_name"": null, ""unit_address"": null, ""unit_number"": null, ""unit_type"": null, ""agreement_date"": ""2023-02-25"", ""lease_start"": null, ""lease_end"": null, ""lease_auto_renew"": null, ""hourly_rate"": null, ""monthly_rent"": null, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": null, ""utility_charges"": null","E. Abandoned Property, i, has given Tenant written notice of termination as required by this Lease or the VRLTA including a notice that any items of personal property left in the Dwelling Unit or the Premises would be disposed of within twenty-four hours after termination;"
3,"""tenant_name"": ""Peter Mendez"", ""unit_address"": null, ""unit_number"": null, ""unit_type"": null, ""agreement_date"": null, ""lease_start"": null, ""lease_end"": null, ""lease_auto_renew"": null, ""hourly_rate"": null, ""monthly_rent"": 1928.0, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": ""Rent"": 1758.0, ""Water"": 65.0, ""Trash"": 25.0, ""Pest Control Fee"": 5.0, ""Cable & Internet"": 75.0, ""Total Monthly Rent"": 1928.0, ""utility_charges"": ""Water"": 65.0, ""Trash"": 25.0, ""Pest Control Fee"": 5.0, ""Cable & Internet"": 75.0","-Rent, -Water, -Trash, -Pest Control, -Cable & Internet, -TotalMonthlyRent, -EARLYMOVEOUT, -If you:"
4,"""tenant_name"": ""ESTO"", ""unit_address"": ""2501-1 Mountain Lodge Circle, Vestavia Hills, AL 35216"", ""unit_number"": null, ""unit_type"": null, ""agreement_date"": ""2023-04-27"", ""lease_start"": null, ""lease_end"": null, ""lease_auto_renew"": null, ""hourly_rate"": null, ""monthly_rent"": 939.0, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": ""rent"": 939.0, ""pest control"": 4.0, ""trash"": 4.0, ""valet trash"": 25.0, ""cable"": 26.0, ""internet"": 24.0, ""water/sewer"": 90.0, ""total monthly due"": 1112.0, ""utility_charges"": ""pest control"": 4.0, ""trash"": 4.0, ""valet trash"": 25.0, ""cable"": 26.0, ""internet"": 24.0, ""water/sewer"": 90.0","Severability, Originals and Attachments, and Signatures 51.SEVERABILITY. If any provision of this Lease Contract is invalid or unenforceable under applicable law, such provision shall be ineffective to the extent of such invalidity or unenforceability only without invalidating or otherwise affecting the remainder of this Lease Contract. The court shall interpret the lease and provisions herein in a manner such as to uphold the valid portions of this Lease Contract while preserving the intent of the parties. 52.ORIGINALS AND ATTACHMENTS. This Lease Contract has been executed in multiple originals, with original signatures. We will provide you with a copy of the Lease Contract. Your copy of the Lease Contract may be in paper format, in an electronic format at your request, or sent via e-mail if we have communicated by e-mail about this Lease. Our rules and community policies, if any, will be attached to the Lease Contract and provided to you at signing. When an Inventory and Condition form is completed, you should retain a copy, and we should retain a copy. Any addenda or amendments you sign as a part of executing this Lease Contract are binding and hereby incorporated into and made part of the Lease Contract between you and us. This lease is the entire agreement between you and us. You acknowledge that you are NOT relying on any oral representations. A copy or scan of this Lease Contract and related addenda, amendments, and agreements may be used for any purpose and shall be treated as an original. You affirmatively state that you are not a criminal sex offender. Resident or Residents (all sign below) ESTO Date Signed 04/27/2023 Owner or Owner's Authorized Representative and Property Manager (signing on behalf of owner) Date Signed Yvette Kidd 04/2"
5,"""tenant_name"": ""Peter Mendes Tony Diag"", ""unit_address"": null, ""unit_number"": null, ""unit_type"": null, ""agreement_date"": null, ""lease_start"": null, ""lease_end"": null, ""lease_auto_renew"": null, ""hourly_rate"": null, ""monthly_rent"": null, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": null, ""utility_charges"": null",",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"
6,"""tenant_name"": null, ""unit_address"": null, ""unit_number"": null, ""unit_type"": null, ""agreement_date"": null, ""lease_start"": null, ""lease_end"": null, ""lease_auto_renew"": null, ""hourly_rate"": null, ""monthly_rent"": null, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": null, ""utility_charges"": null","St, Term,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"
7,"""tenant_name"": null, ""unit_address"": null, ""unit_number"": null, ""unit_type"": null, ""agreement_date"": null, ""lease_start"": null, ""lease_end"": null, ""lease_auto_renew"": ""month-to-month"", ""hourly_rate"": null, ""monthly_rent"": null, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": null, ""utility_charges"": null","1. Definitions., 2. Apartment., 3. Rent., 3.1. Payments., 3.2."
8,"""tenant_name"": null, ""unit_address"": null, ""unit_number"": null, ""unit_type"": null, ""agreement_date"": null, ""lease_start"": null, ""lease_end"": null, ""lease_auto_renew"": null, ""hourly_rate"": null, ""monthly_rent"": null, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": null, ""utility_charges"": null",
9,"""tenant_name"": ""Lucas Poodfuturere"", ""unit_address"": null, ""unit_number"": null, ""unit_type"": null, ""agreement_date"": null, ""lease_start"": null, ""lease_end"": null, ""lease_auto_renew"": null, ""hourly_rate"": null, ""monthly_rent"": null, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": null, ""utility_charges"": null","Default, Cause, Eviction, Default, Default, Cause, Eviction, Default, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction, Cause, Eviction,"
10,"""tenant_name"": null, ""unit_address"": null, ""unit_number"": null, ""unit_type"": null, ""agreement_date"": null, ""lease_start"": null, ""lease_end"": null, ""lease_auto_renew"": null, ""hourly_rate"": null, ""monthly_rent"": null, ""prorated_rent"": null, ""security_deposit"": null, ""lease_rent"": null, ""monthly_payment_breakdown"": null, ""utility_charges"": null","RESIDENT LIFE 10. Community Policies. Community Policies become part of the Lease and must be followed. We may make changes, including addi- tions, to our written Community Policies, and those changes can be- come effective immediately if the Community Policies are distributed --- and applicable to all units in the apartment community and do not change the dollar amounts in Lease Details. 10.1. Photo/Video Release. You give us permission to use any photograph, likeness, image or video taken of you while you are using property common areas or participating in any event sponsored by us. 10.2. Disclosure'of Information. At our sole option, we may, but are not obligated to, share and use information related to this Lease for law-enforcement, governmental, or business purposes. At our request, you authorize any utility provider to give us information about pending or actual connections or disconnections'of utility service to your apartment. 10.3. Guests. We may exclude"


In [None]:
# TODO: calculate accuracy using metrics