In [21]:
import os
import pandas as pd
from typing import List, Dict, Any
from langchain_core.documents import Document
from langchain.text_splitter import (RecursiveCharacterTextSplitter,
                                     CharacterTextSplitter,
                                     TokenTextSplitter)
from langchain_community.document_loaders import TextLoader, DirectoryLoader

# Create a Simple Document


* Metadata is useful for:

1. Filtering search results
2. Tracking document sources
3. Adding context in responses
4. Debugging and auditing

In [10]:
doc = Document(page_content="This is the main text content that will be embedded and searched.",
               metadata={"source": "example.txt",
                         "page": 1,
                         "author": "self",
                         "created_at":"2024-09-01",
                         "custom_field":"any_value"})

print(f"""Content: {doc.page_content} \n\nMetadata: {doc.metadata}""")

Content: This is the main text content that will be embedded and searched. 

Metadata: {'source': 'example.txt', 'page': 1, 'author': 'self', 'created_at': '2024-09-01', 'custom_field': 'any_value'}


# Read Text File

In [17]:
loader = TextLoader("../data/txt/india real estate checklist.txt", encoding='utf-8')
documents = loader.load()

In [19]:
documents[0].page_content

"When buying farmland in India, you must check land title deeds and encumbrance certificates to ensure clear ownership and no financial liabilities, along with revenue records like the Record of Rights (ROR)/, mutation register, and property tax receipts to confirm tax compliance and a clean transaction history. Physical checks of the land, including boundary demarcation, soil quality, and water availability, are crucial. Additionally, be aware of state-specific rules, potential land ceiling limits, the need for conversion permissions, and zoning regulations. \nLegal & Documentation Checks\nTitle Deed & Ownership: Verify the seller's ownership and examine the title deed to ensure it's free from disputes. \nEncumbrance Certificate (EC): This certificate confirms the land is free from any financial or legal liabilities like mortgages. \nRecord of Rights (ROR) / 7-12 Extract: This document provides a history of land ownership and cultivation, verifying past transactions. \nMutation Regist

In [20]:
documents[0].metadata

{'source': '../data/txt/india real estate checklist.txt'}

## Read all text files from a Directory

In [22]:
directory_loader = DirectoryLoader("../data/txt", 
                                   glob="*.txt",
                                   loader_cls=TextLoader,
                                   loader_kwargs={"encoding": "utf-8"},
                                   show_progress=True)

documents = directory_loader.load()
len(documents)

100%|██████████| 2/2 [00:00<00:00, 83.71it/s]


2

In [23]:
[print(i.metadata) for i in documents]

{'source': '..\\data\\txt\\india real estate checklist.txt'}
{'source': '..\\data\\txt\\karnataka document checklist.txt'}


[None, None]

# Text Splitting Techniques

## 1. CharacterTextSplitter

In [34]:
text = documents[0].page_content

character_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=200,
    chunk_overlap=20,
    length_function=len
)

char_chunks = character_splitter.split_text(text=text)
print(char_chunks[0])

Created a chunk of size 565, which is longer than the specified 200


When buying farmland in India, you must check land title deeds and encumbrance certificates to ensure clear ownership and no financial liabilities, along with revenue records like the Record of Rights (ROR)/, mutation register, and property tax receipts to confirm tax compliance and a clean transaction history. Physical checks of the land, including boundary demarcation, soil quality, and water availability, are crucial. Additionally, be aware of state-specific rules, potential land ceiling limits, the need for conversion permissions, and zoning regulations.


In [26]:
len(char_chunks)

15

## 2. Recursive Character Text Splitter

In [36]:
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=200,
    chunk_overlap=20,
    length_function=len
)

recursive_chunks = recursive_splitter.split_text(text=text)
recursive_chunks

['When buying farmland in India, you must check land title deeds and encumbrance certificates to ensure clear ownership and no financial liabilities, along with revenue records like the Record of Rights',
 'Record of Rights (ROR)/, mutation register, and property tax receipts to confirm tax compliance and a clean transaction history. Physical checks of the land, including boundary demarcation, soil',
 'demarcation, soil quality, and water availability, are crucial. Additionally, be aware of state-specific rules, potential land ceiling limits, the need for conversion permissions, and zoning',
 'and zoning regulations.',
 "Legal & Documentation Checks\nTitle Deed & Ownership: Verify the seller's ownership and examine the title deed to ensure it's free from disputes.",
 'Encumbrance Certificate (EC): This certificate confirms the land is free from any financial or legal liabilities like mortgages.',
 'Record of Rights (ROR) / 7-12 Extract: This document provides a history of land ownershi

## TokenTextSplitter

In [38]:
token_splitter = TokenTextSplitter(
    chunk_size=200,
    chunk_overlap=20
)

token_chunks = token_splitter.split_text(text=text)
print(token_chunks)

["When buying farmland in India, you must check land title deeds and encumbrance certificates to ensure clear ownership and no financial liabilities, along with revenue records like the Record of Rights (ROR)/, mutation register, and property tax receipts to confirm tax compliance and a clean transaction history. Physical checks of the land, including boundary demarcation, soil quality, and water availability, are crucial. Additionally, be aware of state-specific rules, potential land ceiling limits, the need for conversion permissions, and zoning regulations. \nLegal & Documentation Checks\nTitle Deed & Ownership: Verify the seller's ownership and examine the title deed to ensure it's free from disputes. \nEncumbrance Certificate (EC): This certificate confirms the land is free from any financial or legal liabilities like mortgages. \nRecord of Rights (ROR) / 7-12 Extract: This document provides a history of land ownership and cultivation, verifying past transactions. \nMutation Regis