In [1]:
# Install or upgrade (-U) the third-party packages used by this notebook, quietly (-q).
# Versions are not pinned here, so each run may pull newer releases.
%pip install pandas numpy torch transformers datasets scikit-learn accelerate bitsandbytes huggingface-hub -q -U

Note: you may need to restart the kernel to use updated packages.


## Build dataset

In [2]:
import pandas as pd
import sqlite3

# Location of the SQLite database and the query that pulls every extracted row
db_path = "../output/extracted_lease_agreements.db"
query = "SELECT * FROM extracted_data"

# Open the database and read the whole table into a DataFrame indexed by `id`.
# The try/finally guarantees the connection is closed even when the query fails
# (e.g. the table is missing), so a failed run does not leak the file handle.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(query, conn, index_col="id")
finally:
    conn.close()

df

Unnamed: 0_level_0,extracted_text,extracted_fields
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Severability, Originals and Attachments, and S...","{""tenant_name"": ""Lazarus Hennesy Michael Debro..."
2,9. FLOOD DISCLOSURE. In compliance with the Ok...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
3,APARTMENT LEASE CONTRACT\nNATIONAL APARTMENT A...,"{""tenant_name"": ""Lazarus Hennesy JR"", ""unit_ad..."
4,FORCE MAJEURE: If we are prevented from comple...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
5,26.CONDITION OF THE PREMISES AND ALTERATIONS. ...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
...,...,...
285,26.CONDITION OF THE PREMISES AND ALTERATIONS. ...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
286,NAA NATIONAL APARTMENT ASSOCIATION We Lead the...,"{""tenant_name"": ""Novy Jezkova"", ""unit_address""..."
287,We :unselected: require :unselected: do not re...,"{""tenant_name"": null, ""unit_address"": null, ""u..."
288,L HOUSING OPPORTUNITY\nANIMAL ADDENDUM Becomes...,"{""tenant_name"": ""Novy Jezkova"", ""unit_address""..."


In [13]:
import json
# Function to convert snake_case to human-readable text
def humanize_field_name(field_name):
    """Turn a snake_case field name into a readable phrase.

    Underscores become spaces; only the first character of the result is
    upper-cased and every other character is lower-cased (``str.capitalize``),
    e.g. ``"lease_start_date"`` -> ``"Lease start date"``.
    """
    words = field_name.split("_")
    return " ".join(words).capitalize()

# Optional natural-language descriptions, keyed by extracted field name.
# Fields without an entry fall back to their humanized name in the prompt.
field_descriptions = dict(
    lease_start_date="the start date of the lease agreement",
    tenant_name="the name of the tenant",
    unit_address="the address of the rental unit",
    unit_number="the number of the rental unit",
    unit_type="the type of the rental unit (e.g., apartment, studio, etc.)",
    agreement_date="the date the lease agreement was signed",
    lease_start="the date when the lease officially begins",
    lease_end="the date when the lease officially ends",
    lease_auto_renew="whether the lease will automatically renew or not",
    hourly_rate="the hourly rate if applicable",
    monthly_rent="the monthly rental amount to be paid",
    prorated_rent="the rent calculated based on a portion of the month",
    security_deposit="the security deposit amount required for the lease",
    lease_rent="the total rent amount specified in the lease",
    # Add more fields and their descriptions as needed
)


# Fields left out of the QA set because they are too complex for now
restricted_fields = [
    "monthly_payment_breakdown",
    "utility_charges",
]

# Accumulates one (prompt, answer) pair per usable field, for testing Flan-T5
qa_data = []

# Walk the source text and its JSON-encoded fields side by side
for extracted_text, fields_json in zip(df['extracted_text'], df['extracted_fields']):
    # Each row stores its extracted fields as a JSON object
    extracted_fields = json.loads(fields_json)

    for field_name, field_value in extracted_fields.items():
        # Skip null values and the fields that are deliberately excluded
        if field_value is None or field_name in restricted_fields:
            continue

        # Use the curated description when there is one; otherwise fall back
        # to "the <humanized field name>"
        field_description = field_descriptions.get(
            field_name, f"the {humanize_field_name(field_name)}"
        )

        # Instruction-style prompt in the spirit of the Flan-T5 datasets,
        # with the source text quoted after the instruction
        prompt = (
            f"Please extract {field_description} from the following text: "
            f"'{extracted_text}'."
        )

        # The extracted value itself is the expected answer
        qa_data.append((prompt, field_value))

# Tabulate the pairs so they are easy to manipulate or export
qa_df = pd.DataFrame(qa_data, columns=['Prompt', 'Answer'])

# Persist the QA pairs to CSV (without the index) for the evaluation cells
qa_df.to_csv("lease_agreement_qa_data.csv", index=False)

In [2]:
import pandas as pd

# Reload the QA pairs from the CSV file.
# dtype=str together with keep_default_na=False keeps every prompt and answer as the
# literal text stored in the file. With pandas' defaults, answers such as "N/A",
# "NA", "null" or an empty string are parsed as float NaN, which later breaks the
# string-based answer normalization (it calls .lower() on each answer).
qa_df = pd.read_csv("lease_agreement_qa_data.csv", dtype=str, keep_default_na=False)
qa_df

Unnamed: 0,Prompt,Answer
0,Please extract the name of the tenant from the...,Lazarus Hennesy Michael Debrow
1,Please extract the address of the rental unit ...,"12401 N MacArthur Blvd, Oklahoma City, OK 73142"
2,Please extract the date the lease agreement wa...,2023-04-04
3,Please extract the name of the tenant from the...,Lazarus Hennesy JR
4,Please extract the address of the rental unit ...,12301 N MacArthur Blvd
...,...,...
636,Please extract the date the lease agreement wa...,2023-05-25
637,Please extract the name of the tenant from the...,Novy Jezkova
638,Please extract the address of the rental unit ...,"2501-1 Mountain Lodge Circle, Vestavia Hills, ..."
639,Please extract the date the lease agreement wa...,2023-05-25


In [1]:
# Install or upgrade (-U) sentencepiece and tqdm quietly (-q); tqdm provides the
# progress bar used in the evaluation loop below.
%pip install sentencepiece tqdm -q -U

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import re
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm  # Import tqdm for progress bar

# Checkpoint shared by the tokenizer and the model
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Run on the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Flan-T5 large weights and place them on the selected device
# (Module.to() returns the module itself, so the chained call keeps `model` bound to it)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Normalize function to strip and lowercase
def normalize_answer(s):
    """Lower text and remove punctuation, articles, and extra whitespace.

    Steps run in this order: lowercase, drop every character that is neither
    a word character nor whitespace, blank out the standalone articles
    "a", "an" and "the", then collapse runs of whitespace to single spaces.
    """
    lowered = s.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())

# Exact Match Calculation
def exact_match_score(prediction, ground_truth):
    """Return True when both answers are identical after `normalize_answer`."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth

# F1 Score Calculation
def f1_score_qa(prediction, ground_truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are normalized with `normalize_answer` and split on
    whitespace. The overlap is a multiset intersection, so a token repeated
    in both answers is counted as many times as it is shared. Counting the
    overlap with sets while dividing by list lengths would score identical
    answers containing a repeated token below 1.0.

    Args:
        prediction: Answer produced by the model.
        ground_truth: Reference answer.

    Returns:
        float in [0.0, 1.0]. When either side has no tokens after
        normalization, the score is 1.0 if both are empty (matching what
        exact match reports) and 0.0 otherwise.
    """
    from collections import Counter  # function-scope import keeps the block self-contained

    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()

    # Precision/recall are undefined without tokens; agree with exact match instead
    if not pred_tokens or not gt_tokens:
        return float(pred_tokens == gt_tokens)

    # Counter & Counter keeps the minimum count of each shared token
    overlap = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(gt_tokens)
    return 2 * (precision * recall) / (precision + recall)

# Custom Dataset for QA prompts
class QADataset(Dataset):
    """Expose a DataFrame with 'Prompt' and 'Answer' columns as a torch Dataset.

    Items are looked up by position and returned as (prompt, answer) pairs.
    """

    def __init__(self, qa_df):
        # Keep a reference to the frame; rows are read lazily in __getitem__
        self.qa_df = qa_df

    def __len__(self):
        # One sample per row
        return len(self.qa_df.index)

    def __getitem__(self, idx):
        # Fetch the row once by position, then pick the two columns
        record = self.qa_df.iloc[idx]
        return record['Prompt'], record['Answer']

# Function to evaluate Flan-T5 in batches
def evaluate_flan_t5(qa_df, batch_size=8, max_length=512, generation_max_length=128, num_beams=4):
    """Generate an answer for every prompt in `qa_df` and score it.

    Uses the module-level `model`, `tokenizer` and `device`.

    Args:
        qa_df: DataFrame with 'Prompt' and 'Answer' columns.
        batch_size: Number of prompts tokenized and generated per step.
        max_length: Maximum number of input tokens; longer prompts are
            truncated by the tokenizer, so text past the cut-off is not
            seen by the model.
        generation_max_length: `max_length` passed to `model.generate`
            (previously hard-coded to 128).
        num_beams: Beam-search width passed to `model.generate`
            (previously hard-coded to 4).

    Returns:
        DataFrame with one row per example and the columns
        'Actual Answer', 'Predicted Answer', 'Exact Match' and 'F1 Score'.
    """
    results = []
    dataloader = DataLoader(QADataset(qa_df), batch_size=batch_size)

    # Inference only: switch to eval mode and disable gradient tracking
    model.eval()

    with torch.no_grad():
        # Use tqdm to show progress
        for prompts, actual_answers in tqdm(dataloader, desc="Evaluating", unit="batch"):
            # Tokenize the batch; list() hands the tokenizer a plain list of
            # strings whatever sequence type the collate step produced
            inputs = tokenizer(
                list(prompts),
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            ).to(device)

            # Generate the answers from the model and decode them to text
            outputs = model.generate(
                **inputs,
                max_length=generation_max_length,
                num_beams=num_beams,
            )
            predicted_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # Score each example in the batch against its reference answer
            for actual_answer, predicted_answer in zip(actual_answers, predicted_answers):
                results.append({
                    'Actual Answer': actual_answer,
                    'Predicted Answer': predicted_answer,
                    'Exact Match': exact_match_score(predicted_answer, actual_answer),
                    'F1 Score': f1_score_qa(predicted_answer, actual_answer),
                })

    return pd.DataFrame(results)

# Run the batched Flan-T5 evaluation over the whole QA dataset
results_df = evaluate_flan_t5(qa_df)

# Average the per-example metrics; the boolean 'Exact Match' column averages
# to the fraction of exact matches
exact_match_avg, f1_avg = (
    results_df[metric].mean() for metric in ('Exact Match', 'F1 Score')
)

# Report the aggregate scores
print(f"Exact Match Accuracy: {exact_match_avg:.2f}")
print(f"Average F1 Score: {f1_avg:.2f}")

# Show a handful of individual predictions next to their references
print(results_df.head())


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Evaluating:   0%|          | 0/81 [00:00<?, ?batch/s]

: 