In [None]:
import os
import PyPDF2
import pandas as pd
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n\n"
        return text

def process_pdfs(directory):
    """Process all PDFs in the given directory."""
    results = []
    pdf_files = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]
    
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        file_path = os.path.join(directory, pdf_file)
        try:
            text = extract_text_from_pdf(file_path)
            results.append({
                'filename': pdf_file,
                'text': text,
                'size': os.path.getsize(file_path)
            })
            print(f"Successfully processed {pdf_file}")
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")
    
    # Convert to DataFrame
    df = pd.DataFrame(results)
    return df

if __name__ == "__main__":
    data_dir = "data"
    results_df = process_pdfs(data_dir)
    results_df.to_pickle("extracted_pdf_texts.pkl")
    print(f"Processed {len(results_df)} PDF files and saved to extracted_pdf_texts.pkl")

Processing PDFs:   4%|▍         | 1/23 [00:19<07:15, 19.82s/it]

Successfully processed 4000.1hsghhdbk103123.pdf


Processing PDFs:   9%|▊         | 2/23 [00:20<02:55,  8.38s/it]

Successfully processed CCL_BuyersGuide.pdf
Successfully processed renting-vs-owning.pdf


Processing PDFs:  22%|██▏       | 5/23 [00:20<00:41,  2.30s/it]

Successfully processed RS20530.pdf
Successfully processed FHA-Reference-Guide-2023.pdf
Successfully processed ort-ss-realestatedictionary.pdf


Processing PDFs:  30%|███       | 7/23 [00:20<00:20,  1.26s/it]

Successfully processed TJC_ebook_fha-homeloan.pdf


Processing PDFs:  35%|███▍      | 8/23 [00:21<00:17,  1.14s/it]

Successfully processed GeneralGlossary.pdf


Processing PDFs:  39%|███▉      | 9/23 [00:22<00:12,  1.10it/s]

Successfully processed GLOSSARY_OF_REAL_ESTATE_TERMS.pdf


Processing PDFs:  43%|████▎     | 10/23 [00:22<00:11,  1.13it/s]

Successfully processed Home_Buyers_Guide.pdf
Successfully processed consumer-guide-buying-your-first-home-2024-11-05.pdf
Successfully processed FHA_loan_guidelines.pdf


Processing PDFs:  57%|█████▋    | 13/23 [00:23<00:04,  2.21it/s]

Successfully processed 2024_Zillow_Rent-vs-Buy.pdf


Processing PDFs:  61%|██████    | 14/23 [00:23<00:04,  2.08it/s]

Successfully processed guide_firsttimehomebuying-2.pdf


Processing PDFs:  65%|██████▌   | 15/23 [00:24<00:03,  2.07it/s]

Successfully processed 1507.pdf


Processing PDFs:  70%|██████▉   | 16/23 [00:25<00:04,  1.72it/s]

Successfully processed HL_Buyers_Guide_FINAL_March2019.pdf


Processing PDFs:  78%|███████▊  | 18/23 [00:25<00:02,  2.15it/s]

Successfully processed NAHREP-Glossary-of-Real-Estate-Industry-Terms.pdf
Successfully processed naiop-2024-terms-and-definitions.pdf


Processing PDFs:  87%|████████▋ | 20/23 [00:26<00:00,  3.16it/s]

Successfully processed First-TIme-HomeBuyer-Guide.pdf
Successfully processed realestateglossary.pdf


Processing PDFs:  91%|█████████▏| 21/23 [00:26<00:00,  2.81it/s]

Successfully processed home-buyers-guide-1.pdf
Successfully processed renting-vs-buying-study-press-release.pdf


Processing PDFs: 100%|██████████| 23/23 [00:26<00:00,  1.16s/it]

Successfully processed First-TIme-HomeBuyer-Guide-2.pdf
Processed 23 PDF files and saved to extracted_pdf_texts.pkl





In [9]:
import pandas as pd
import random
import os

def check_extracted_data(pkl_path, num_samples=3, sample_length=500):
    """
    Examine the extracted PDF data to check its quality.
    
    Args:
        pkl_path: Path to the pickle file with extracted text
        num_samples: Number of random samples to display
        sample_length: Number of characters to display from each sample
    """
    # Load the data
    if not os.path.exists(pkl_path):
        print(f"Error: {pkl_path} does not exist.")
        return
    
    print(f"Loading data from {pkl_path}...")
    df = pd.read_pickle(pkl_path)
    
    # Print basic information
    print(f"\nDataset contains {len(df)} documents")
    print(f"Columns: {df.columns.tolist()}")
    
    # Check for empty text
    empty_texts = df[df['text'].str.strip() == ''].shape[0]
    print(f"\nDocuments with empty text: {empty_texts}")
    
    # Check text lengths
    df['text_length'] = df['text'].str.len()
    print(f"\nText length statistics:")
    print(df['text_length'].describe())
    
    # Show some random samples
    print(f"\n{num_samples} random samples (first {sample_length} chars):")
    sample_indices = random.sample(range(len(df)), min(num_samples, len(df)))
    
    for i, idx in enumerate(sample_indices):
        doc = df.iloc[idx]
        print(f"\nSample {i+1} from '{doc['filename']}':")
        print("-" * 80)
        print(doc['text'][:sample_length] + "...")
        print("-" * 80)
    
    # Check for common issues
    print("\nChecking for potential issues:")
    
    # Missing spaces between words (possible OCR issue)
    no_spaces = df[~df['text'].str.contains(' ', regex=False)].shape[0]
    print(f"Documents with no spaces (potential OCR issues): {no_spaces}")
    
    # Unusual characters (possible encoding issues)
    unusual_chars = df[df['text'].str.contains('[^\x00-\x7F]', regex=True)].shape[0]
    print(f"Documents with non-ASCII characters: {unusual_chars}")
    
    return df

if __name__ == "__main__":
    check_extracted_data("extracted_pdf_texts.pkl")

Loading data from extracted_pdf_texts.pkl...

Dataset contains 23 documents
Columns: ['filename', 'text', 'size']

Documents with empty text: 0

Text length statistics:
count    2.300000e+01
mean     2.255588e+05
std      8.347313e+05
min      2.083000e+03
25%      2.211300e+04
50%      3.974200e+04
75%      8.795000e+04
max      4.047940e+06
Name: text_length, dtype: float64

3 random samples (first 500 chars):

Sample 1 from 'CCL_BuyersGuide.pdf':
--------------------------------------------------------------------------------
Home 
Buyer’s 
Guide

1

2About Us
Corcoran Classic Living is a top- performing 
residential and commercial real estate firm 
serving greater Athens, GA. Our agents 
are socially-minded and deeply entrenched 
in their communities, backed by a top-
notch support staff, global connections, 
and innovative technologies that ensure a 
seamless experience. Our mission is to treat 
our customers and clients as the lifeblood of 
our business, keep their satisfaction o

In [1]:
import pandas as pd
import re
import unicodedata
import string
from tqdm import tqdm

def clean_text(text):
    """Clean and preprocess text for NLP tasks."""
    # Replace common non-ASCII characters 
    text = text.replace('–', '-').replace('—', '-').replace(''', "'").replace(''', "'")
    text = text.replace('"', '"').replace('"', '"').replace('…', '...')
    
    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove excessive newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    # Clean up page numbers and headers/footers (common in PDFs)
    text = re.sub(r'\n\s*\d+\s*\n', '\n', text)  # Standalone page numbers
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    return text.strip()

def chunk_text(text, chunk_size=1000, overlap=100):
    """Split text into overlapping chunks of approximately chunk_size characters."""
    if len(text) <= chunk_size:
        return [text]
    
    chunks = []
    start = 0
    
    # Add timeout protection
    max_iterations = (len(text) // (chunk_size - overlap)) * 2  # Generous upper bound
    iteration = 0
    
    while start < len(text) and iteration < max_iterations:
        iteration += 1
        end = min(start + chunk_size, len(text))
        
        # Limit the search window for breaking points to improve performance
        search_start = max(start, end - 200)
        
        # Try to find a good breaking point (end of sentence or paragraph)
        if end < len(text):
            # Look for paragraph break first (limit search range)
            paragraph_break = text.rfind('\n\n', search_start, end)
            if paragraph_break != -1:
                end = paragraph_break
            else:
                # Look for sentence break (use a simpler, faster approach)
                for marker in ['. ', '! ', '? ']:
                    sentence_break = text.rfind(marker, search_start, end)
                    if sentence_break != -1:
                        end = sentence_break + 2  # +2 to include the punctuation and space
                        break
        
        # Make sure we're making progress
        if end <= start:
            end = start + chunk_size  # Force progress if no break point found
            
        chunks.append(text[start:end].strip())
        start = end - overlap  # Create overlap between chunks
    
    return chunks

def process_text_data(pkl_path, output_path=None, chunk_size=1500):
    """Clean, preprocess and chunk text data from PDFs."""
    # Load the data
    print(f"Loading data from {pkl_path}...")
    df = pd.read_pickle(pkl_path)
    
    # Clean texts
    print("Cleaning text data...")
    df['cleaned_text'] = df['text'].progress_apply(clean_text)
    
    # Chunk texts
    print("Chunking documents into smaller pieces...")
    all_chunks = []
    
    for i, row in tqdm(df.iterrows(), total=len(df), desc="Chunking documents"):
        try:
            # Skip extremely large docs or process them differently
            if len(row['cleaned_text']) > 1_000_000:  # 1 million chars
                print(f"⚠️ Large document detected: {row['filename']} ({len(row['cleaned_text'])} chars)")
                # Process large documents in a simpler way (just divide by size)
                simple_chunks = [row['cleaned_text'][j:j+chunk_size] 
                               for j in range(0, len(row['cleaned_text']), chunk_size)]
                for j, chunk in enumerate(simple_chunks):
                    all_chunks.append({
                        'source_file': row['filename'],
                        'chunk_id': f"{row['filename']}_simple_{j}",
                        'text': chunk.strip()
                    })
                continue
            
            # Regular chunking for normal sized documents
            chunks = chunk_text(row['cleaned_text'], chunk_size=chunk_size)
            for j, chunk in enumerate(chunks):
                all_chunks.append({
                    'source_file': row['filename'],
                    'chunk_id': f"{row['filename']}_{j}",
                    'text': chunk
                })
        except Exception as e:
            print(f"Error processing {row['filename']}: {e}")
    
    chunks_df = pd.DataFrame(all_chunks)
    print(f"Created {len(chunks_df)} chunks from {len(df)} documents")
    
    # Save the processed data
    if output_path:
        chunks_df.to_pickle(output_path)
        print(f"Saved processed chunks to {output_path}")
    
    return chunks_df

if __name__ == "__main__":
    # Add tqdm to pandas operations
    tqdm.pandas()
    
    # Process the data
    processed_df = process_text_data("extracted_pdf_texts.pkl", "processed_chunks.pkl")
    
    # Display some statistics
    print("\nChunk length statistics:")
    processed_df['text_length'] = processed_df['text'].str.len()
    print(processed_df['text_length'].describe())
    
    # Print a few sample chunks
    print("\nSample chunks:")
    for i in range(min(3, len(processed_df))):
        print(f"\nChunk {i+1} from {processed_df.iloc[i]['source_file']}:")
        print("-" * 80)
        print(processed_df.iloc[i]['text'][:300] + "..." if len(processed_df.iloc[i]['text']) > 300 else processed_df.iloc[i]['text'])
        print("-" * 80)

Loading data from extracted_pdf_texts.pkl...
Cleaning text data...


100%|██████████| 23/23 [00:00<00:00, 116.40it/s]


Chunking documents into smaller pieces...


Chunking documents: 100%|██████████| 23/23 [00:00<00:00, 1913.00it/s]

⚠️ Large document detected: 4000.1hsghhdbk103123.pdf (3893681 chars)
Created 4138 chunks from 23 documents
Saved processed chunks to processed_chunks.pkl

Chunk length statistics:
count    4138.000000
mean     1241.202272
std       527.984130
min        99.000000
25%      1428.000000
50%      1499.000000
75%      1500.000000
max      1500.000000
Name: text_length, dtype: float64

Sample chunks:

Chunk 1 from 4000.1hsghhdbk103123.pdf:
--------------------------------------------------------------------------------
Special Attention of: Transmittal: Handbook 4000.1 All FHA -Approved Mortgagees Issued: October 31, 2023 All Direct Endorsement Underwriters Effective Date: April 29, 2024 All Eligible Submission Sources for Condominium Project Approvals All FHA Roster Appraisers All FHA -Approved 203(k) Consultants...
--------------------------------------------------------------------------------

Chunk 2 from 4000.1hsghhdbk103123.pdf:
--------------------------------------------------------




In [2]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import os

# First, install required packages
# !pip install sentence-transformers faiss-cpu

class RealEstateRAG:
    def __init__(self, chunks_path="processed_chunks.pkl", embedding_model="all-MiniLM-L6-v2"):
        """
        Initialize the RAG system with document chunks and embedding model.
        
        Args:
            chunks_path: Path to the pickle file with processed chunks
            embedding_model: SentenceTransformer model to use for embeddings
        """
        self.chunks_path = chunks_path
        self.embedding_model_name = embedding_model
        self.embedding_model = SentenceTransformer(embedding_model)
        self.index = None
        self.chunks_df = None
        self.embeddings = None
        
        # Load chunks if they exist
        if os.path.exists(chunks_path):
            self.load_chunks()
    
    def load_chunks(self):
        """Load document chunks from pickle file."""
        print(f"Loading chunks from {self.chunks_path}...")
        self.chunks_df = pd.read_pickle(self.chunks_path)
        print(f"Loaded {len(self.chunks_df)} chunks")
    
    def create_embeddings(self, save_path="embeddings.pkl"):
        """Create embeddings for all chunks."""
        if self.chunks_df is None:
            self.load_chunks()
        
        print(f"Creating embeddings using {self.embedding_model_name}...")
        texts = self.chunks_df['text'].tolist()
        
        # Embed in batches to avoid memory issues
        batch_size = 64
        embeddings = []
        
        for i in tqdm(range(0, len(texts), batch_size), desc="Creating embeddings"):
            batch_texts = texts[i:i + batch_size]
            batch_embeddings = self.embedding_model.encode(batch_texts)
            embeddings.append(batch_embeddings)
        
        self.embeddings = np.vstack(embeddings)
        
        # Save embeddings
        if save_path:
            with open(save_path, 'wb') as f:
                pickle.dump(self.embeddings, f)
            print(f"Saved embeddings to {save_path}")
        
        return self.embeddings
    
    def load_embeddings(self, embeddings_path="embeddings.pkl"):
        """Load pre-computed embeddings."""
        if os.path.exists(embeddings_path):
            print(f"Loading embeddings from {embeddings_path}...")
            with open(embeddings_path, 'rb') as f:
                self.embeddings = pickle.load(f)
            print(f"Loaded embeddings with shape {self.embeddings.shape}")
            return True
        else:
            print(f"Embeddings file {embeddings_path} not found.")
            return False
    
    def build_index(self):
        """Build a FAISS index for fast similarity search."""
        if self.embeddings is None:
            if not self.load_embeddings():
                self.create_embeddings()
        
        print("Building FAISS index...")
        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(self.embeddings.astype('float32'))
        print(f"Built index with {self.index.ntotal} vectors")
    
    def search(self, query, k=5):
        """
        Search for chunks most similar to the query.
        
        Args:
            query: The search query
            k: Number of results to return
        
        Returns:
            List of dictionaries with chunk text and metadata
        """
        if self.index is None:
            self.build_index()
        
        # Embed the query
        query_embedding = self.embedding_model.encode([query])
        
        # Search the index
        distances, indices = self.index.search(query_embedding.astype('float32'), k)
        
        # Get the results
        results = []
        for i, idx in enumerate(indices[0]):
            if idx < len(self.chunks_df):  # Ensure index is valid
                chunk = self.chunks_df.iloc[idx]
                results.append({
                    'chunk_id': chunk['chunk_id'],
                    'source_file': chunk['source_file'],
                    'text': chunk['text'],
                    'distance': distances[0][i]
                })
        
        return results
    
    def answer_question(self, question, k=5):
        """
        Answer a question using RAG.
        
        Args:
            question: The question to answer
            k: Number of chunks to retrieve
        
        Returns:
            Dictionary with retrieved context and sample answer
        """
        # Retrieve relevant chunks
        relevant_chunks = self.search(question, k=k)
        
        # Combine context
        context = "\n\n".join([f"From {chunk['source_file']}:\n{chunk['text']}" 
                              for chunk in relevant_chunks])
        
        return {
            'question': question,
            'retrieved_chunks': relevant_chunks,
            'context': context
        }

# Initialize and run
if __name__ == "__main__":
    rag = RealEstateRAG()
    
    # Check if embeddings exist, if not create them
    if not os.path.exists("embeddings.pkl"):
        rag.create_embeddings()
    else:
        rag.load_embeddings()
    
    # Build search index
    rag.build_index()
    
    # Test the search
    test_queries = [
        "What is an FHA loan?",
        "Should I rent or buy a house?",
        "What are closing costs?",
        "How do I get pre-approved for a mortgage?"
    ]
    
    print("\nTesting retrieval with sample queries:")
    for query in test_queries:
        print(f"\nQuery: {query}")
        results = rag.search(query, k=2)
        for i, result in enumerate(results):
            print(f"Result {i+1} from {result['source_file']}:")
            print(f"Distance: {result['distance']:.4f}")
            print(f"Text snippet: {result['text'][:150]}...")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading chunks from processed_chunks.pkl...
Loaded 4138 chunks
Creating embeddings using all-MiniLM-L6-v2...


Creating embeddings:   0%|          | 0/65 [00:00<?, ?it/s]

Saved embeddings to embeddings.pkl
Building FAISS index...
Built index with 4138 vectors

Testing retrieval with sample queries:

Query: What is an FHA loan?
Result 1 from FHA-Reference-Guide-2023.pdf:
Distance: 0.6199
Text snippet: alify for than conventional mortgages. FHA loans are insured by the Federal Housing Administration. FHA does not lend money or issue cre dit, so the f...
Result 2 from RS20530.pdf:
Distance: 0.6410
Text snippet: ....... ................................ ................................ ......................... 19 FHA-Insured Home Loans: An Overview Congression...

Query: Should I rent or buy a house?
Result 1 from CCL_BuyersGuide.pdf:
Distance: 0.7296
Text snippet: pected to rise 10 to 15 percent over the next decade, creating a continued high demand for housing. EQUITY Money paid for rent is money that youll nev...
Result 2 from HL_Buyers_Guide_FINAL_March2019.pdf:
Distance: 0.7623
Text snippet: heres one reason you feel speaks especially to you, circle it

In [6]:
class RealEstateChatbot:
    def __init__(self, rag_system):
        """
        Initialize the chatbot with a RAG system.
        
        Args:
            rag_system: An initialized RealEstateRAG object
        """
        self.rag = rag_system
        self.conversation_history = []
    
    def answer(self, query, k=5, show_context=False):
        """
        Answer a user question by retrieving relevant context.
        
        Args:
            query: The user's question
            k: Number of contexts to retrieve
            show_context: Whether to display the retrieved context
        """
        # Save the user's query to conversation history
        self.conversation_history.append({"role": "user", "content": query})
        
        # Retrieve relevant information
        retrieval_results = self.rag.answer_question(query, k=k)
        context = retrieval_results['context']
        
        # Show the retrieved context if requested
        if show_context:
            print("RETRIEVED CONTEXT:")
            print("-" * 80)
            print(context)
            print("-" * 80)
            print()
        
        # Get sources for citation
        sources = []
        for chunk in retrieval_results['retrieved_chunks']:
            source = chunk['source_file']
            if source not in sources:
                sources.append(source)
        
        # Format response with retrieved information
        response = "Based on the retrieved documents:\n\n"
        for i, chunk in enumerate(retrieval_results['retrieved_chunks']):
            response += f"From {chunk['source_file']}:\n"
            response += f"{chunk['text'][:300]}...\n\n"
        
        response += "\nSources: " + ", ".join(sources)
        
        # Add the response to conversation history
        self.conversation_history.append({"role": "assistant", "content": response})
        
        return response
    
    def chat(self):
        """Start an interactive chat session."""
        print("Welcome to RealEstateGPT! Ask me anything about real estate.")
        print("Type 'exit' to end the conversation.\n")
        
        while True:
            query = input("You: ")
            if query.lower() in ['exit', 'quit', 'bye']:
                print("RealEstateGPT: Goodbye! Hope I was helpful.")
                break
            
            answer = self.answer(query, show_context=False)
            print(f"\nRealEstateGPT: {answer}\n")

# Function for Jupyter Notebook interface
def create_chatbot_interface():
    try:
        import ipywidgets as widgets
        from IPython.display import display, clear_output, Markdown
        
        # Initialize the chatbot
        rag = RealEstateRAG()
        rag.load_embeddings()
        rag.build_index()
        chatbot = RealEstateChatbot(rag)
        
        # Create widgets
        output = widgets.Output()
        text_input = widgets.Text(
            placeholder='Type your real estate question here...',
            layout=widgets.Layout(width='80%')
        )
        context_checkbox = widgets.Checkbox(
            value=False,
            description='Show retrieved context',
            disabled=False
        )
        send_button = widgets.Button(
            description='Send',
            button_style='primary',
            tooltip='Send your question'
        )
        clear_button = widgets.Button(
            description='Clear',
            tooltip='Clear the conversation'
        )
        
        # Layout
        input_box = widgets.HBox([text_input, send_button, clear_button])
        display(widgets.VBox([context_checkbox, input_box, output]))
        
        def on_send_button_clicked(b):
            with output:
                query = text_input.value
                if query.strip() == "":
                    return
                
                # Display user question
                display(Markdown(f"**You:** {query}"))
                
                # Get and display answer
                answer = chatbot.answer(query, show_context=context_checkbox.value)
                display(Markdown(f"**RealEstateGPT:** {answer}"))
                
                # Clear input field
                text_input.value = ""
        
        def on_clear_button_clicked(b):
            with output:
                clear_output()
                chatbot.conversation_history = []
        
        # Connect events
        send_button.on_click(on_send_button_clicked)
        clear_button.on_click(on_clear_button_clicked)
        
        # Also submit on enter key
        text_input.on_submit(lambda x: on_send_button_clicked(None))
        
        return chatbot
        
    except ImportError:
        print("ipywidgets not available. Use chatbot.chat() instead.")
        return None

# Initialize and run immediately
print("Initializing Real Estate Chatbot...")
rag = RealEstateRAG()
rag.load_embeddings()
rag.build_index()
chatbot = RealEstateChatbot(rag)

# Demo section
print("\n=== Demo: Ask a few sample questions ===\n")

demo_questions = [
    "What are the advantages of FHA loans for first-time homebuyers?",
    "Is it better to rent or buy in 2024?",
    "What should I know about closing costs?"
]

for question in demo_questions:
    print(f"Question: {question}")
    answer = chatbot.answer(question, k=3)
    print(f"Answer: {answer}\n")
    print("-" * 80 + "\n")

print("\n=== Interactive Mode ===\n")
# For Jupyter Notebook interface
create_chatbot_interface()

print("Or start a command-line chat with: chatbot.chat()")