## Finetuning Flan T5 XL for lease agreement data extraction

### Install dependencies

In [1]:
%pip install pandas numpy peft scikit-learn transformers datasets torch accelerate bitsandbytes huggingface-hub -q

Note: you may need to restart the kernel to use updated packages.


### Build the dataframe
Build the dataframe from the collected data

In [1]:
import pandas as pd
import sqlite3

# Path to the SQLite database
db_path = "../output/extracted_lease_agreements.db"

# Connect to the SQLite database
conn = sqlite3.connect(db_path)

# Query to select all data from the extracted_data table
query = "SELECT * FROM extracted_data"

# Read the data into a DataFrame
df = pd.read_sql_query(query, conn, index_col="id")

# Close the database connection
conn.close()

df.head()

Unnamed: 0_level_0,extracted_text,extracted_fields
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Name, address and phone number of managing age...","{'tenant_name': 'Yolanda Strobert', 'unit_addr..."
2,dupusit: or 2) return the remaining portion (i...,"{'tenant_name': 'Comunque Bolas', 'unit_addres..."
3,"You'll pay for all other utilities, related de...","{'tenant_name': None, 'unit_address': None, 'u..."
4,APARTMENT LEASE CONTRACT\nNAA NATIONAL APARTME...,"{'tenant_name': 'Dominique Boles', 'unit_addre..."
5,Disposition or Sale. Except for animals and pr...,"{'tenant_name': None, 'unit_address': None, 'u..."


### Split the dataset

In [2]:
from sklearn.model_selection import train_test_split

# First split: 80% for training, 20% for temp (which will be split into eval and test)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Second split: 50% of temp for eval and 50% for test (10% of the original data each)
eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Check the sizes of the splits
print(f"Training set size: {len(train_df)}")
print(f"Evaluation set size: {len(eval_df)}")
print(f"Test set size: {len(test_df)}")


Training set size: 205
Evaluation set size: 26
Test set size: 26


### Load the base model

In [3]:
from peft import LoraConfig, TaskType

peft_config = LoraConfig(
    lora_alpha=16, # Higher alpha to match larger model capacity
    lora_dropout=0.1, # Consistent dropout rate to prevent overfitting
    r=8, # Rank, kept the same for balance between performance and efficiency
    task_type=TaskType.SEQ_2_SEQ_LM, # Change to SEQ_2_SEQ_LM for seq2seq models
    bias="none", # Keeping bias as none, similar to your original setup
    target_modules=[
        'q', 'v', 'k', 'o', # Attention layers (query, value, key, output projections)
        'wi', 'wo', # Feedforward layers (input, output projections)
        'wq', 'wv', 'wk', 'wo', # Additional T5-specific projection layers
    ], # Target modules relevant to T5 architecture
)


In [4]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from huggingface_hub import notebook_login
from peft import get_peft_model, prepare_model_for_kbit_training
import torch

# login to access gated model
notebook_login()

# Load the tokenizer and model
model_id = "google/flan-t5-xl"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# empty GPU memory
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, quantization_config=bnb_config)  # Change num_labels based on your task

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 12,189,696 || all params: 2,861,946,880 || trainable%: 0.4259


### Build the datasets

In [5]:
extraction_json_schema = {
  "title": "ExtractFieldsResult",
  "type": "object",
  "properties": {
    "tenant_name": {
      "type": ["string", "null"],
      "description": "The name of the tenant, found in the OCR text."
    },
    "unit_address": {
      "type": ["string", "null"],
      "description": "The unit address found in the OCR text."
    },
    "unit_number": {
      "type": ["string", "null"],
      "description": "The unit number found in the OCR text."
    },
    "unit_type": {
      "type": ["string", "null"],
      "description": "The unit type found in the OCR text."
    },
    "agreement_date": {
      "type": ["string", "null"],
      "format": "date"
    },
    "lease_start": {
      "type": ["string", "null"],
      "format": "date",
      "description": "The date when the lease starts, found in the OCR text."
    },
    "lease_end": {
      "type": ["string", "null"],
      "format": "date",
      "description": "The date when the lease ends, found in the OCR text."
    },
    "lease_auto_renew": {
      "type": ["string", "null"],
      "description": "The type of lease auto renewal, found in the OCR text."
    },
    "hourly_rate": {
      "type": ["number", "null"],
      "description": "The hourly rate found in the OCR text."
    },
    "monthly_rent": {
      "type": ["number", "null"],
      "description": "The monthly rent found in the OCR text."
    },
    "prorated_rent": {
      "type": ["number", "null"],
      "description": "The prorated rent found in the OCR text."
    },
    "security_deposit": {
      "type": ["number", "null"],
      "description": "The security deposit found in the OCR text."
    },
    "lease_rent": {
      "type": ["number", "null"],
      "description": "The security deposit found in the OCR text."
    },
    "monthly_payment_breakdown": {
      "type": ["object", "null"],
      "description": "The monthly payment breakdown data found in the OCR text.",
      "additionalProperties": {}
    },
    "utility_charges": {
      "type": ["object", "null"],
      "description": "The utility charges found in the OCR text. This is a dictionary with utility charges as the key, and their price as the value.",
      "additionalProperties": {
        "type": ["number", "null"]
      }
    }
  },
  "required": []
}


In [14]:
prompt_template = """
### TASK:
You are a specialized model for extracting specific information from lease agreement text. Your goal is to accurately identify and extract the required data fields from the provided OCR text of a lease agreement.

### INPUT TEXT:
Below is the OCR text extracted from a lease agreement. Carefully analyze this text to identify the relevant data fields.

OCR Text:
{extracted_text}

### REQUIRED DATA FIELDS:
Extract the following data fields from the OCR text:

1. Tenant Name: The name of the tenant(s) in the lease agreement.
2. Unit Address: The full address of the leased unit.
3. Unit Number: The specific number or identifier of the unit.
4. Unit Type: The type of the unit (e.g., apartment, office, etc.).
5. Agreement Date: The date on which the lease agreement was signed.
6. Lease Start Date: The starting date of the lease term.
7. Lease End Date: The ending date of the lease term.
8. Lease Auto-Renew: Whether the lease has an auto-renewal clause (Yes/No).
9. Hourly Rate: The hourly rate specified in the agreement, if any.
10. Monthly Rent: The amount of rent to be paid monthly.
11. Prorated Rent: The prorated rent amount for any partial month, if applicable.
12. Security Deposit: The amount of the security deposit required.
13. Lease Rent: The total rent amount for the lease term.
14. Monthly Payment Breakdown: A breakdown of the monthly payment details.
15. Utility Charges: Any charges for utilities specified in the agreement.

### RESPONSE FORMAT:
Return the extracted data as a JSON object, strictly adhering to the following JSON schema:

### IMPORTANT:
- Ensure all extracted values match exactly as they appear in the OCR text.
- If a field is not present in the OCR text, return null for that field.

```json
{extraction_json_schema}
```

### RESPONSE:        
"""

In [6]:
from datasets import Dataset

def preprocess_function(examples):
    inputs = [prompt_template.format(extracted_text=extracted_text, extraction_json_schema=extraction_json_schema) for extracted_text in examples["extracted_text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # The "labels" are the tokenized outputs:
    labels = tokenizer(
        text_target=examples["extracted_fields"], 
        max_length=512,         
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Create datasets with the extracted text and labels
train_dataset = Dataset.from_pandas(train_df)
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = Dataset.from_pandas(eval_df)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)
test_dataset = Dataset.from_pandas(test_df)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

train_dataset


Map:   0%|          | 0/205 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Dataset({
    features: ['extracted_text', 'extracted_fields', 'id'],
    num_rows: 205
})

### Train (fine tune) the base model

In [8]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Changed to evaluate based on steps
    eval_steps=1,  # Evaluate every 1 step
    learning_rate=1e-3,
    lr_scheduler_type="linear",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    logging_strategy="steps",
    logging_steps=2,
    disable_tqdm=False,  # Ensure tqdm progress bar is enabled
    max_steps = 10,
    use_cpu=True,
    save_strategy="steps",
    save_steps=1,  # Save a checkpoint every 1 step
    # predict_with_generate=True, # need this for ROUGE/ BLEU metrics
    optim="paged_adamw_32bit",
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        return_tensors="pt",  # Return PyTorch tensors
    )
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
1,No log,1.282624
2,1.433800,1.145782
3,1.433800,1.035312
4,1.246500,0.93874
5,1.246500,0.851025
6,1.069000,0.777541
7,1.069000,0.720084
8,0.934500,0.678166
9,0.934500,0.649756
10,0.787200,0.635124


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

TrainOutput(global_step=10, training_loss=1.0941948771476746, metrics={'train_runtime': 19618.0545, 'train_samples_per_second': 0.016, 'train_steps_per_second': 0.001, 'total_flos': 2722954752294912.0, 'train_loss': 1.0941948771476746, 'epoch': 1.5384615384615383})

### Save the model

In [9]:
peft_model_repo_id = "aryaniyaps/finetuned_flan_t5_xl_lease_data_extraction"

In [13]:
# Push the PEFT model (LoRA adapters) to Hugging Face Hub
trainer.model.save_pretrained("./saved_models/finetuned_flan_t5_xl_lease_data_extraction", push_to_hub=True, repo_name=peft_model_repo_id)

### Evaluate the model

In [11]:
from peft import PeftModel
 
# Load the Lora model
finetuned_model = PeftModel.from_pretrained(model, peft_model_repo_id, device_map={"":0})
finetuned_model.eval()
 
print("PEFT model loaded")

PEFT model loaded


Let's try data extraction with a random sample

In [23]:
from random import randrange
from tabulate import tabulate

sample = test_dataset[randrange(len(test_dataset))]
 
input_ids = tokenizer(prompt_template.format(extracted_text=sample["extracted_text"], extraction_json_schema=extraction_json_schema), return_tensors="pt", truncation=True).input_ids.cuda()
with torch.inference_mode():
    outputs = finetuned_model.generate(input_ids=input_ids, max_new_tokens=512)


In [27]:
prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)

table_data = [[sample['extracted_text'], prediction]]

# Define table headers
headers = ["OCR text", "Extracted data (Finetuned model)"]

# Display the table
tabulate(table_data, headers=headers, tablefmt="html", showindex=False)

OCR text,Extracted data (Finetuned model)
"OWNER/MANAGEMENT INFORMATION DISCLOSURE ADDENDUM: This Owner/Management Information Disclosure Addendum is made for the purpose of complying with Alabama law and is attached to the Lease Contract between Owner and Resident, incorporated with the terms of the Lease Contract herein by reference, and is considered to be a term of the Lease Contract itself. The person authorized to manage the premises contemplated by the Lease Contract is identified as follows and is considered to be Owner's Agent for purposes of the Lease Contract: Name of property manager: Elizabeth Gendron Business Address: The owner of the premises or a person authorized to act for and on behalf of the owner for the purpose of service of process in receiving and receipting the notices and demands called for or contemplated by the Lease Contract, is identified as follows and is considered to be Owner's Agent for purposes of this lease: Name of owner of premises: Madison BAL LLC Business Address: 850 Shoal Run Trail Birmingham, AL Owner's Disclosure (Initial) Eng Owner Resident's Acknowledgement (Initial) CM Resident TM Resident Mac Resident Resident Resident Resident Alabama/National Apartment Association Official Form, July 2018 :selected: Blue Moon eSignature Services Document ID: 361479112 , National Apartment Association, Inc.","', 'unit_address': 'null', 'tenant_name': 'null', 'unit_address': 'null', 'lease_term':'monthly', 'lease_start':'monthly', 'lease_end':'monthly', 'lease_end':'monthly', 'lease_start':'monthly', 'lease_end':'monthly', 'lease_end':'monthly', 'lease_start':'monthly', 'lease_end':'monthly', 'lease_end':'monthly', 'lease_start':'monthly', 'lease_end':'monthly', 'lease_start':'monthly', 'lease_end':'monthly']"


### Evaluate the model
(Using F1 score and exact matches)

#### Install dependencies

In [33]:
%pip install evaluate tabulate -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [13]:
from tqdm import tqdm

def evaluate_peft_model(sample, max_target_length=512):
    # Generate extracted data
    input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).cuda()
    with torch.inference_mode():
        outputs = model.generate(input_ids=input_ids, max_new_tokens=max_target_length)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
    
    # Decode eval sample
    labels = tokenizer.decode(sample['labels'], skip_special_tokens=True)
 
    return prediction, labels

# Run predictions
predictions, references = [], []
for sample in tqdm(tokenized_test_dataset):
    prediction, reference = evaluate_peft_model(sample)
    predictions.append(prediction)
    references.append(reference)

100%|██████████| 26/26 [28:22<00:00, 65.48s/it]


In [17]:
from tabulate import tabulate

# Prepare data for tabulation
table_data = []
for i, (pred, ref) in enumerate(zip(predictions, references), 1):
    table_data.append([i, ref, pred])

# Define table headers
headers = ["#", "Reference data", "Extracted data (Finetuned model)"]

# Display the table
tabulate(table_data, headers=headers, tablefmt="html", showindex=False)


#,Reference data,Extracted data (Finetuned model)
1,"'tenant_name': None, 'unit_address': '333 H Street, Ste. 5000, Chula Vista, CA 91910', 'unit_number': None, 'unit_type': None, 'agreement_date': None, 'lease_start': None, 'lease_end': '2024-03-31', 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': 3350.0, 'prorated_rent': 2121.67,'security_deposit': 2500.0, 'lease_rent': None,'monthly_payment_breakdown': 'Liability to Landlord Insurance': 9.5, 'Admin Fee - Liability to Landlord Insurance': 3.0, 'Rent Income': 3350.0, 'Total': 3362.5, 'utility_charges': None","'LeaseTerm':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':'monthly', 'LeaseRent':"
2,"'tenant_name': 'Saray Ramos Gutierrez', 'unit_address': 'Crystal Terrace', 'unit_number': None, 'unit_type': None, 'agreement_date': '2021-11-01', 'lease_start': None, 'lease_end': None, 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': None, 'prorated_rent': None,'security_deposit': None, 'lease_rent': None,'monthly_payment_breakdown': None, 'utility_charges': None","'Lessee': Saray Ramos Gutierrez, 'Lessor': avant garde, 'LeaseTerm': '2011', 'LeaseUnit': 'Crystal Terrace A', 'LeaseAmount': '$', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseAmount': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011', 'LeaseTerm': '2011'"
3,"'tenant_name': 'Claudia Spooney', 'unit_address': '5604 Villas Cir Apt E, Montgomery, AL', 'unit_number': '5604E', 'unit_type': None, 'agreement_date': '2023-02-21', 'lease_start': '2023-02-28', 'lease_end': '2024-02-29', 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': None, 'prorated_rent': None,'security_deposit': 0.0, 'lease_rent': None,'monthly_payment_breakdown': None, 'utility_charges': None","'Apartment': '5604E', 'LeaseTerm': '02/28/2023', 'LeaseTerm': '02/29/2024', 'ApartmentNumber': '5604E', 'LeaseTerm': '02/28/2023', 'LeaseTerm': '02/29/2024'"
4,"'tenant_name': None, 'unit_address': None, 'unit_number': None, 'unit_type': None, 'agreement_date': None, 'lease_start': None, 'lease_end': None, 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': 1046.0, 'prorated_rent': None,'security_deposit': None, 'lease_rent': None,'monthly_payment_breakdown': None, 'utility_charges': 'electric_service': None, 'water_and_sewer': None, 'gas': None, 'trash': None, 'common_area_electric': None,'monthly_billing_fee': 4.99","'LeaseTerm':'monthly', 'LeaseTerm': 'year', 'LeaseTerm':'month', 'LeaseTerm': 'year', 'LeaseTerm':'month', 'LeaseTerm': 'year', 'LeaseTerm':'month', 'LeaseTerm': 'year', 'LeaseTerm':'month', 'LeaseTerm': 'year', 'LeaseTerm':'month', 'LeaseTerm': 'year', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm': 'year', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm': 'year', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm': 'year', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm':'month', 'LeaseTerm'"
5,"'tenant_name': None, 'unit_address': None, 'unit_number': None, 'unit_type': None, 'agreement_date': None, 'lease_start': None, 'lease_end': None, 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': None, 'prorated_rent': None,'security_deposit': None, 'lease_rent': None,'monthly_payment_breakdown': None, 'utility_charges': None","'Apartment': None, 'Landlord': None, 'Apartment_Number': None, 'Lease_Term': None, 'Agreement_Term': None, 'Lease_Term': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Term': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Term': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number': None, 'Agreement_Number'"
6,"'tenant_name': None, 'unit_address': None, 'unit_number': None, 'unit_type': None, 'agreement_date': None, 'lease_start': None, 'lease_end': None, 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': None, 'prorated_rent': None,'security_deposit': None, 'lease_rent': None,'monthly_payment_breakdown': None, 'utility_charges': None","'apartment': 'a', 'unit': 'a', 'lease': 'a', 'lease': 'a', 'agreement': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement', 'lease': 'agreement"
7,"'tenant_name': 'Benjamin Breier', 'unit_address': '1722 Greene Ave, Queens, New York 11385', 'unit_number': '1-L', 'unit_type': 'Apartment', 'agreement_date': '2022-08-08', 'lease_start': '2022-10-01', 'lease_end': '2023-09-30', 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': 2581.25, 'prorated_rent': None,'security_deposit': 2581.25, 'lease_rent': 30975,'monthly_payment_breakdown': None, 'utility_charges': None","'Landlord': 100A Broadway Avenue, 'LeaseTerm': '2020-2023', 'Apartment': 1722 Greene Ave 1-L, 'LeaseRent': $30,975, 'SecurityDeposit': $2,581.25, 'Utilities': None, 'Agreement': None, 'Lease': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None, 'Agreement': None,"
8,"'tenant_name': 'Taylor Whittle', 'unit_address': None, 'unit_number': None, 'unit_type': None, 'agreement_date': '2023-02-27', 'lease_start': None, 'lease_end': None, 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': None, 'prorated_rent': None,'security_deposit': None, 'lease_rent': None,'monthly_payment_breakdown': None, 'utility_charges': None","'LeaseTerm':'monthly', 'LeaseAgreement':'monthly', 'LeaseTerm':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseAgreement':'monthly', 'LeaseA"
9,"'tenant_name': 'Zoey Medina', 'unit_address': 'SPRINGHILL APARTMENTS', 'unit_number': '01102', 'unit_type': None, 'agreement_date': '2023-03-06', 'lease_start': '2023-03-06', 'lease_end': None, 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': None, 'prorated_rent': None,'security_deposit': None, 'lease_rent': None,'monthly_payment_breakdown': None, 'utility_charges': None","'Address': '01102', 'LeaseTerm': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseTerm': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement': '3 Months', 'LeaseAgreement"
10,"'tenant_name': None, 'unit_address': None, 'unit_number': None, 'unit_type': None, 'agreement_date': None, 'lease_start': None, 'lease_end': None, 'lease_auto_renew': None, 'hourly_rate': None,'monthly_rent': None, 'prorated_rent': None,'security_deposit': None, 'lease_rent': None,'monthly_payment_breakdown': None, 'utility_charges': None","LESSEE_ID, LESSOR_ID, RENTAL_AMOUNT, RENTAL_QUANTITY, RENTAL_TERM, RENTAL_TERM_PAYMENT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, RENTAL_QUANTITY_PAYMENT_DEPOSIT, REN"


In [None]:
# TODO: calculate accuracy using metrics