In [3]:
#Install packages if necessary
#!pip install pandas numpy faiss-cpu PyPDF2 sentence-transformers python-docx tqdm ipywidgets

In [8]:
import os
import PyPDF2
import pandas as pd
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n\n"
        return text

def process_pdfs(directory):
    """Process all PDFs in the given directory."""
    results = []
    pdf_files = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]
    
    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        file_path = os.path.join(directory, pdf_file)
        try:
            text = extract_text_from_pdf(file_path)
            results.append({
                'filename': pdf_file,
                'text': text,
                'size': os.path.getsize(file_path)
            })
            print(f"Successfully processed {pdf_file}")
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")
    
    # Convert to DataFrame
    df = pd.DataFrame(results)
    return df

if __name__ == "__main__":
    data_dir = "data"
    results_df = process_pdfs(data_dir)
    results_df.to_pickle("extracted_pdf_texts.pkl")
    print(f"Processed {len(results_df)} PDF files and saved to extracted_pdf_texts.pkl")

Processing PDFs:   4%|▍         | 1/23 [00:01<00:26,  1.21s/it]

Successfully processed 1507.pdf


Processing PDFs:   9%|▊         | 2/23 [00:01<00:16,  1.25it/s]

Successfully processed 2024_Zillow_Rent-vs-Buy.pdf


Processing PDFs:  13%|█▎        | 3/23 [02:05<19:01, 57.08s/it]

Successfully processed 4000.1hsghhdbk103123.pdf


Processing PDFs:  17%|█▋        | 4/23 [02:07<11:13, 35.42s/it]

Successfully processed CCL_BuyersGuide.pdf


Processing PDFs:  22%|██▏       | 5/23 [02:08<06:49, 22.75s/it]

Successfully processed consumer-guide-buying-your-first-home-2024-11-05.pdf


Processing PDFs:  26%|██▌       | 6/23 [02:08<04:19, 15.25s/it]

Successfully processed FHA-Reference-Guide-2023.pdf
Successfully processed FHA_loan_guidelines.pdf


Processing PDFs:  35%|███▍      | 8/23 [02:10<02:00,  8.06s/it]

Successfully processed First-TIme-HomeBuyer-Guide-2.pdf


Processing PDFs:  39%|███▉      | 9/23 [02:11<01:28,  6.32s/it]

Successfully processed First-TIme-HomeBuyer-Guide.pdf


Processing PDFs:  43%|████▎     | 10/23 [02:16<01:16,  5.91s/it]

Successfully processed GeneralGlossary.pdf


Processing PDFs:  48%|████▊     | 11/23 [02:17<00:56,  4.69s/it]

Successfully processed GLOSSARY_OF_REAL_ESTATE_TERMS.pdf


Processing PDFs:  52%|█████▏    | 12/23 [02:20<00:44,  4.06s/it]

Successfully processed guide_firsttimehomebuying-2.pdf


Processing PDFs:  57%|█████▋    | 13/23 [02:25<00:42,  4.25s/it]

Successfully processed HL_Buyers_Guide_FINAL_March2019.pdf


Processing PDFs:  61%|██████    | 14/23 [02:27<00:33,  3.74s/it]

Successfully processed home-buyers-guide-1.pdf


Processing PDFs:  65%|██████▌   | 15/23 [02:32<00:32,  4.07s/it]

Successfully processed Home_Buyers_Guide.pdf


Processing PDFs:  70%|██████▉   | 16/23 [02:35<00:27,  3.87s/it]

Successfully processed NAHREP-Glossary-of-Real-Estate-Industry-Terms.pdf


Processing PDFs:  74%|███████▍  | 17/23 [02:37<00:18,  3.05s/it]

Successfully processed naiop-2024-terms-and-definitions.pdf


Processing PDFs:  78%|███████▊  | 18/23 [02:37<00:11,  2.26s/it]

Successfully processed ort-ss-realestatedictionary.pdf


Processing PDFs:  83%|████████▎ | 19/23 [02:38<00:07,  1.79s/it]

Successfully processed realestateglossary.pdf


Processing PDFs:  91%|█████████▏| 21/23 [02:38<00:01,  1.02it/s]

Successfully processed renting-vs-buying-study-press-release.pdf
Successfully processed renting-vs-owning.pdf


Processing PDFs:  96%|█████████▌| 22/23 [02:40<00:01,  1.29s/it]

Successfully processed RS20530.pdf


Processing PDFs: 100%|██████████| 23/23 [02:41<00:00,  7.01s/it]

Successfully processed TJC_ebook_fha-homeloan.pdf
Processed 23 PDF files and saved to extracted_pdf_texts.pkl





In [26]:
import pandas as pd
import random
import os

def check_extracted_data(pkl_path, num_samples=3, sample_length=500):
    """
    Examine the extracted PDF data to check its quality.
    
    Args:
        pkl_path: Path to the pickle file with extracted text
        num_samples: Number of random samples to display
        sample_length: Number of characters to display from each sample
    """
    # Load the data
    if not os.path.exists(pkl_path):
        print(f"Error: {pkl_path} does not exist.")
        return
    
    print(f"Loading data from {pkl_path}...")
    df = pd.read_pickle(pkl_path)
    
    # Print basic information
    print(f"\nDataset contains {len(df)} documents")
    print(f"Columns: {df.columns.tolist()}")
    
    # Check for empty text
    empty_texts = df[df['text'].str.strip() == ''].shape[0]
    print(f"\nDocuments with empty text: {empty_texts}")
    
    # Check text lengths
    df['text_length'] = df['text'].str.len()
    print(f"\nText length statistics:")
    print(df['text_length'].describe())
    
    # Show some random samples
    print(f"\n{num_samples} random samples (first {sample_length} chars):")
    sample_indices = random.sample(range(len(df)), min(num_samples, len(df)))
    
    for i, idx in enumerate(sample_indices):
        doc = df.iloc[idx]
        print(f"\nSample {i+1} from '{doc['filename']}':")
        print("-" * 80)
        print(doc['text'][:sample_length] + "...")
        print("-" * 80)
    
    # Check for common issues
    print("\nChecking for potential issues:")
    
    # Missing spaces between words (possible OCR issue)
    no_spaces = df[~df['text'].str.contains(' ', regex=False)].shape[0]
    print(f"Documents with no spaces (potential OCR issues): {no_spaces}")
    
    # Unusual characters (possible encoding issues)
    unusual_chars = df[df['text'].str.contains('[^\x00-\x7F]', regex=True)].shape[0]
    print(f"Documents with non-ASCII characters: {unusual_chars}")
    
    return df

if __name__ == "__main__":
    check_extracted_data("extracted_pdf_texts.pkl")

Loading data from extracted_pdf_texts.pkl...

Dataset contains 23 documents
Columns: ['filename', 'text', 'size']

Documents with empty text: 0

Text length statistics:
count    2.300000e+01
mean     2.255588e+05
std      8.347313e+05
min      2.083000e+03
25%      2.211300e+04
50%      3.974200e+04
75%      8.795000e+04
max      4.047940e+06
Name: text_length, dtype: float64

3 random samples (first 500 chars):

Sample 1 from 'TJC_ebook_fha-homeloan.pdf':
--------------------------------------------------------------------------------
SIMPLE STEPS TO AN 
FHA 
HOME LOAN
Written by: Alvaro R. MoreiraMoreir a Team Mortgage’s


CONTENTS
Introduction ................................................................................................................................  3 
Chapter 1:   Choosing the Right FHA Professional .....................................................................  4
Chapter 2:  What Y ou Need To Know About FHA Home Loans ..................................

In [27]:
import pandas as pd
import re
import unicodedata
import string
from tqdm import tqdm

def clean_text(text):
    """Clean and preprocess text for NLP tasks."""
    # Replace common non-ASCII characters 
    text = text.replace('–', '-').replace('—', '-').replace(''', "'").replace(''', "'")
    text = text.replace('"', '"').replace('"', '"').replace('…', '...')
    
    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove excessive newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    # Clean up page numbers and headers/footers (common in PDFs)
    text = re.sub(r'\n\s*\d+\s*\n', '\n', text)  # Standalone page numbers
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    return text.strip()

def chunk_text(text, chunk_size=1000, overlap=100):
    """Split text into overlapping chunks of approximately chunk_size characters."""
    if len(text) <= chunk_size:
        return [text]
    
    chunks = []
    start = 0
    
    # Add timeout protection
    max_iterations = (len(text) // (chunk_size - overlap)) * 2  # Generous upper bound
    iteration = 0
    
    while start < len(text) and iteration < max_iterations:
        iteration += 1
        end = min(start + chunk_size, len(text))
        
        # Limit the search window for breaking points to improve performance
        search_start = max(start, end - 200)
        
        # Try to find a good breaking point (end of sentence or paragraph)
        if end < len(text):
            # Look for paragraph break first (limit search range)
            paragraph_break = text.rfind('\n\n', search_start, end)
            if paragraph_break != -1:
                end = paragraph_break
            else:
                # Look for sentence break (use a simpler, faster approach)
                for marker in ['. ', '! ', '? ']:
                    sentence_break = text.rfind(marker, search_start, end)
                    if sentence_break != -1:
                        end = sentence_break + 2  # +2 to include the punctuation and space
                        break
        
        # Make sure we're making progress
        if end <= start:
            end = start + chunk_size  # Force progress if no break point found
            
        chunks.append(text[start:end].strip())
        start = end - overlap  # Create overlap between chunks
    
    return chunks

def process_text_data(pkl_path, output_path=None, chunk_size=1500):
    """Clean, preprocess and chunk text data from PDFs."""
    # Load the data
    print(f"Loading data from {pkl_path}...")
    df = pd.read_pickle(pkl_path)
    
    # Clean texts
    print("Cleaning text data...")
    df['cleaned_text'] = df['text'].progress_apply(clean_text)
    
    # Chunk texts
    print("Chunking documents into smaller pieces...")
    all_chunks = []
    
    for i, row in tqdm(df.iterrows(), total=len(df), desc="Chunking documents"):
        try:
            # Skip extremely large docs or process them differently
            if len(row['cleaned_text']) > 1_000_000:  # 1 million chars
                print(f"⚠️ Large document detected: {row['filename']} ({len(row['cleaned_text'])} chars)")
                # Process large documents in a simpler way (just divide by size)
                simple_chunks = [row['cleaned_text'][j:j+chunk_size] 
                               for j in range(0, len(row['cleaned_text']), chunk_size)]
                for j, chunk in enumerate(simple_chunks):
                    all_chunks.append({
                        'source_file': row['filename'],
                        'chunk_id': f"{row['filename']}_simple_{j}",
                        'text': chunk.strip()
                    })
                continue
            
            # Regular chunking for normal sized documents
            chunks = chunk_text(row['cleaned_text'], chunk_size=chunk_size)
            for j, chunk in enumerate(chunks):
                all_chunks.append({
                    'source_file': row['filename'],
                    'chunk_id': f"{row['filename']}_{j}",
                    'text': chunk
                })
        except Exception as e:
            print(f"Error processing {row['filename']}: {e}")
    
    chunks_df = pd.DataFrame(all_chunks)
    print(f"Created {len(chunks_df)} chunks from {len(df)} documents")
    
    # Save the processed data
    if output_path:
        chunks_df.to_pickle(output_path)
        print(f"Saved processed chunks to {output_path}")
    
    return chunks_df

if __name__ == "__main__":
    # Add tqdm to pandas operations
    tqdm.pandas()
    
    # Process the data
    processed_df = process_text_data("extracted_pdf_texts.pkl", "processed_chunks.pkl")
    
    # Display some statistics
    print("\nChunk length statistics:")
    processed_df['text_length'] = processed_df['text'].str.len()
    print(processed_df['text_length'].describe())
    
    # Print a few sample chunks
    print("\nSample chunks:")
    for i in range(min(3, len(processed_df))):
        print(f"\nChunk {i+1} from {processed_df.iloc[i]['source_file']}:")
        print("-" * 80)
        print(processed_df.iloc[i]['text'][:300] + "..." if len(processed_df.iloc[i]['text']) > 300 else processed_df.iloc[i]['text'])
        print("-" * 80)

Loading data from extracted_pdf_texts.pkl...
Cleaning text data...


100%|██████████| 23/23 [00:00<00:00, 61.24it/s]


Chunking documents into smaller pieces...


Chunking documents: 100%|██████████| 23/23 [00:00<00:00, 724.64it/s]

⚠️ Large document detected: 4000.1hsghhdbk103123.pdf (3893681 chars)
Created 4138 chunks from 23 documents
Saved processed chunks to processed_chunks.pkl

Chunk length statistics:
count    4138.000000
mean     1241.202272
std       527.984130
min        99.000000
25%      1428.000000
50%      1499.000000
75%      1500.000000
max      1500.000000
Name: text_length, dtype: float64

Sample chunks:

Chunk 1 from 1507.pdf:
--------------------------------------------------------------------------------
Client: HUD Job Number: GOV HUC C1 1175 Size: 14 x 8.5 Date: 4/09/01 Art Director: Kennedy Copywriter: --- Production: Traffic: Studio Proofreader Copywriter Art Director Art Buyer Creative Dir. Acct. Exec. Acct. Sup. THE HUD HOME BUYING GUIDE U.S. Department of Housing and Urban Development Office ...
--------------------------------------------------------------------------------

Chunk 2 from 1507.pdf:
--------------------------------------------------------------------------------
homeown




In [28]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import os

# First, install required packages
# !pip install sentence-transformers faiss-cpu

class RealEstateRAG:
    def __init__(self, chunks_path="processed_chunks.pkl", embedding_model="all-MiniLM-L6-v2"):
        """
        Initialize the RAG system with document chunks and embedding model.
        
        Args:
            chunks_path: Path to the pickle file with processed chunks
            embedding_model: SentenceTransformer model to use for embeddings
        """
        self.chunks_path = chunks_path
        self.embedding_model_name = embedding_model
        self.embedding_model = SentenceTransformer(embedding_model)
        self.index = None
        self.chunks_df = None
        self.embeddings = None
        
        # Load chunks if they exist
        if os.path.exists(chunks_path):
            self.load_chunks()
    
    def load_chunks(self):
        """Load document chunks from pickle file."""
        print(f"Loading chunks from {self.chunks_path}...")
        self.chunks_df = pd.read_pickle(self.chunks_path)
        print(f"Loaded {len(self.chunks_df)} chunks")
    
    def create_embeddings(self, save_path="embeddings.pkl"):
        """Create embeddings for all chunks."""
        if self.chunks_df is None:
            self.load_chunks()
        
        print(f"Creating embeddings using {self.embedding_model_name}...")
        texts = self.chunks_df['text'].tolist()
        
        # Embed in batches to avoid memory issues
        batch_size = 64
        embeddings = []
        
        for i in tqdm(range(0, len(texts), batch_size), desc="Creating embeddings"):
            batch_texts = texts[i:i + batch_size]
            batch_embeddings = self.embedding_model.encode(batch_texts)
            embeddings.append(batch_embeddings)
        
        self.embeddings = np.vstack(embeddings)
        
        # Save embeddings
        if save_path:
            with open(save_path, 'wb') as f:
                pickle.dump(self.embeddings, f)
            print(f"Saved embeddings to {save_path}")
        
        return self.embeddings
    
    def load_embeddings(self, embeddings_path="embeddings.pkl"):
        """Load pre-computed embeddings."""
        if os.path.exists(embeddings_path):
            print(f"Loading embeddings from {embeddings_path}...")
            with open(embeddings_path, 'rb') as f:
                self.embeddings = pickle.load(f)
            print(f"Loaded embeddings with shape {self.embeddings.shape}")
            return True
        else:
            print(f"Embeddings file {embeddings_path} not found.")
            return False
    
    def build_index(self):
        """Build a FAISS index for fast similarity search."""
        if self.embeddings is None:
            if not self.load_embeddings():
                self.create_embeddings()
        
        print("Building FAISS index...")
        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(self.embeddings.astype('float32'))
        print(f"Built index with {self.index.ntotal} vectors")
    
    def search(self, query, k=5):
        """
        Search for chunks most similar to the query.
        
        Args:
            query: The search query
            k: Number of results to return
        
        Returns:
            List of dictionaries with chunk text and metadata
        """
        if self.index is None:
            self.build_index()
        
        # Embed the query
        query_embedding = self.embedding_model.encode([query])
        
        # Search the index
        distances, indices = self.index.search(query_embedding.astype('float32'), k)
        
        # Get the results
        results = []
        for i, idx in enumerate(indices[0]):
            if idx < len(self.chunks_df):  # Ensure index is valid
                chunk = self.chunks_df.iloc[idx]
                results.append({
                    'chunk_id': chunk['chunk_id'],
                    'source_file': chunk['source_file'],
                    'text': chunk['text'],
                    'distance': distances[0][i]
                })
        
        return results
    
    def answer_question(self, question, k=5):
        """
        Answer a question using RAG.
        
        Args:
            question: The question to answer
            k: Number of chunks to retrieve
        
        Returns:
            Dictionary with retrieved context and sample answer
        """
        # Retrieve relevant chunks
        relevant_chunks = self.search(question, k=k)
        
        # Combine context
        context = "\n\n".join([f"From {chunk['source_file']}:\n{chunk['text']}" 
                              for chunk in relevant_chunks])
        
        return {
            'question': question,
            'retrieved_chunks': relevant_chunks,
            'context': context
        }

# Initialize and run
if __name__ == "__main__":
    rag = RealEstateRAG()
    
    # Check if embeddings exist, if not create them
    if not os.path.exists("embeddings.pkl"):
        rag.create_embeddings()
    else:
        rag.load_embeddings()
    
    # Build search index
    rag.build_index()
    
    # Test the search
    test_queries = [
        "What is an FHA loan?",
        "Should I rent or buy a house?",
        "What are closing costs?",
        "How do I get pre-approved for a mortgage?"
    ]
    
    print("\nTesting retrieval with sample queries:")
    for query in test_queries:
        print(f"\nQuery: {query}")
        results = rag.search(query, k=2)
        for i, result in enumerate(results):
            print(f"Result {i+1} from {result['source_file']}:")
            print(f"Distance: {result['distance']:.4f}")
            print(f"Text snippet: {result['text'][:150]}...")

Loading chunks from processed_chunks.pkl...
Loaded 4138 chunks
Loading embeddings from embeddings.pkl...
Loaded embeddings with shape (4138, 384)
Building FAISS index...
Built index with 4138 vectors

Testing retrieval with sample queries:

Query: What is an FHA loan?
Result 1 from FHA-Reference-Guide-2023.pdf:
Distance: 0.6199
Text snippet: alify for than conventional mortgages. FHA loans are insured by the Federal Housing Administration. FHA does not lend money or issue cre dit, so the f...
Result 2 from RS20530.pdf:
Distance: 0.6410
Text snippet: ....... ................................ ................................ ......................... 19 FHA-Insured Home Loans: An Overview Congression...

Query: Should I rent or buy a house?
Result 1 from CCL_BuyersGuide.pdf:
Distance: 0.7296
Text snippet: pected to rise 10 to 15 percent over the next decade, creating a continued high demand for housing. EQUITY Money paid for rent is money that youll nev...
Result 2 from HL_Buyers_Guide_FI

In [29]:
class RealEstateChatbot:
    def __init__(self, rag_system):
        """
        Initialize the chatbot with a RAG system.

        Args:
            rag_system: An initialized RealEstateRAG object
        """
        self.rag = rag_system
        self.conversation_history = []

    def answer(self, query, k=5, show_context=False):
        """
        Answer a user question by retrieving relevant context.

        Args:
            query: The user's question
            k: Number of contexts to retrieve
            show_context: Whether to display the retrieved context
        """
        # Save the user's query to conversation history
        self.conversation_history.append({"role": "user", "content": query})

        # Retrieve relevant information
        retrieval_results = self.rag.answer_question(query, k=k)
        context = retrieval_results['context']

        # Show the retrieved context if requested
        if show_context:
            print("RETRIEVED CONTEXT:")
            print("-" * 80)
            print(context)
            print("-" * 80)
            print()

        # Get sources for citation
        sources = []
        for chunk in retrieval_results['retrieved_chunks']:
            source = chunk['source_file']
            if source not in sources:
                sources.append(source)

        # Format response with retrieved information
        response = "Based on the retrieved documents:\n\n"
        for i, chunk in enumerate(retrieval_results['retrieved_chunks']):
            response += f"From {chunk['source_file']}:\n"
            response += f"{chunk['text'][:300]}...\n\n"

        response += "\nSources: " + ", ".join(sources)

        # Add the response to conversation history
        self.conversation_history.append({"role": "assistant", "content": response})

        return response

    def chat(self):
        """Start an interactive chat session."""
        print("Welcome to RealEstateGPT! Ask me anything about real estate.")
        print("Type 'exit' to end the conversation.\n")

        while True:
            query = input("You: ")
            if query.lower() in ['exit', 'quit', 'bye']:
                print("RealEstateGPT: Goodbye! Hope I was helpful.")
                break

            answer = self.answer(query, show_context=False)
            print(f"\nRealEstateGPT: {answer}\n")

# Function for Jupyter Notebook interface
def create_chatbot_interface():
    try:
        import ipywidgets as widgets
        from IPython.display import display, clear_output, Markdown

        # Initialize the chatbot
        rag = RealEstateRAG()
        rag.load_embeddings()
        rag.build_index()
        chatbot = RealEstateChatbot(rag)

        # Create widgets
        output = widgets.Output()
        text_input = widgets.Text(
            placeholder='Type your real estate question here...',
            layout=widgets.Layout(width='80%')
        )
        context_checkbox = widgets.Checkbox(
            value=False,
            description='Show retrieved context',
            disabled=False
        )
        send_button = widgets.Button(
            description='Send',
            button_style='primary',
            tooltip='Send your question'
        )
        clear_button = widgets.Button(
            description='Clear',
            tooltip='Clear the conversation'
        )

        # Layout
        input_box = widgets.HBox([text_input, send_button, clear_button])
        display(widgets.VBox([context_checkbox, input_box, output]))

        def on_send_button_clicked(b):
            with output:
                query = text_input.value
                if query.strip() == "":
                    return

                # Display user question
                display(Markdown(f"**You:** {query}"))

                # Get and display answer
                answer = chatbot.answer(query, show_context=context_checkbox.value)
                display(Markdown(f"**RealEstateGPT:** {answer}"))

                # Clear input field
                text_input.value = ""

        def on_clear_button_clicked(b):
            with output:
                clear_output()
                chatbot.conversation_history = []

        # Connect events
        send_button.on_click(on_send_button_clicked)
        clear_button.on_click(on_clear_button_clicked)

        # Also submit on enter key
        text_input.on_submit(lambda x: on_send_button_clicked(None))

        return chatbot

    except ImportError:
        print("ipywidgets not available. Use chatbot.chat() instead.")
        return None

# Initialize and run immediately
print("Initializing Real Estate Chatbot...")
rag = RealEstateRAG()
rag.load_embeddings()
rag.build_index()
chatbot = RealEstateChatbot(rag)

# Demo section
print("\n=== Demo: Ask a few sample questions ===\n")

demo_questions = [
    "What are the advantages of FHA loans for first-time homebuyers?",
    "Is it better to rent or buy in 2024?",
    "What should I know about closing costs?"
]

for question in demo_questions:
    print(f"Question: {question}")
    answer = chatbot.answer(question, k=3)
    print(f"Answer: {answer}\n")
    print("-" * 80 + "\n")

print("\n=== Interactive Mode ===\n")
# For Jupyter Notebook interface
create_chatbot_interface()

print("Or start a command-line chat with: chatbot.chat()")

Initializing Real Estate Chatbot...
Loading chunks from processed_chunks.pkl...
Loaded 4138 chunks
Loading embeddings from embeddings.pkl...
Loaded embeddings with shape (4138, 384)
Building FAISS index...
Built index with 4138 vectors

=== Demo: Ask a few sample questions ===

Question: What are the advantages of FHA loans for first-time homebuyers?
Answer: Based on the retrieved documents:

From guide_firsttimehomebuying-2.pdf:
ents can be as low as 3.5%. + Credit requirements are more accessible than many conventional loans. + Closing costs may be lower than with conventional loans.CONS: An FHA loan may not be the most cost-effective option if you have good credit and 10% or greater down payment. Mortgage insurance is req...

From RS20530.pdf:
arly large role for first -time homebuyers, low - and moderate -income households, and minorit ies. For example , nearly 85% of FHA -insured mortgages made to purchase a home (rather than to refinance an existing mortgage) in FY20 21 were ob t

VBox(children=(Checkbox(value=False, description='Show retrieved context'), HBox(children=(Text(value='', layo…

Or start a command-line chat with: chatbot.chat()


  text_input.on_submit(lambda x: on_send_button_clicked(None))


In [30]:
import pandas as pd
import docx
import PyPDF2
import os
from tqdm import tqdm
import pickle

class DataProcessor:
    """Base class for processing different types of real estate data"""
    
    def __init__(self, data_dir="data"):
        self.data_dir = data_dir
        
    def save_processed_data(self, data, output_path):
        """Save processed data to a pickle file"""
        with open(output_path, 'wb') as f:
            pickle.dump(data, f)
        print(f"Saved processed data to {output_path}")

class StructuredDataProcessor(DataProcessor):
    """Process structured data (Excel/CSV files)"""
    
    def process_excel(self, file_path):
        """Process Excel file and return structured data"""
        try:
            df = pd.read_excel(file_path)
            return {
                'filename': os.path.basename(file_path),
                'data': df,
                'metadata': {
                    'columns': df.columns.tolist(),
                    'rows': len(df),
                    'dtypes': {col: str(dtype) for col, dtype in df.dtypes.items()}
                }
            }
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return None
    
    def process_csv(self, file_path):
        """Process CSV file and return structured data"""
        try:
            df = pd.read_csv(file_path)
            return {
                'filename': os.path.basename(file_path),
                'data': df,
                'metadata': {
                    'columns': df.columns.tolist(),
                    'rows': len(df),
                    'dtypes': {col: str(dtype) for col, dtype in df.dtypes.items()}
                }
            }
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return None
    
    def process_all(self):
        """Process all structured data files in the data directory"""
        results = []
        
        # Find all Excel and CSV files
        excel_files = [f for f in os.listdir(self.data_dir) 
                      if f.lower().endswith(('.xlsx', '.xls'))]
        csv_files = [f for f in os.listdir(self.data_dir) 
                    if f.lower().endswith('.csv')]
        
        # Process Excel files
        for file in tqdm(excel_files, desc="Processing Excel files"):
            file_path = os.path.join(self.data_dir, file)
            result = self.process_excel(file_path)
            if result:
                results.append(result)
        
        # Process CSV files
        for file in tqdm(csv_files, desc="Processing CSV files"):
            file_path = os.path.join(self.data_dir, file)
            result = self.process_csv(file_path)
            if result:
                results.append(result)
        
        return results

class DocProcessor(DataProcessor):
    """Process Word documents"""
    
    def extract_text_from_docx(self, file_path):
        """Extract text from a Word document"""
        try:
            doc = docx.Document(file_path)
            full_text = []
            for para in doc.paragraphs:
                full_text.append(para.text)
            return '\n'.join(full_text)
        except Exception as e:
            print(f"Error extracting text from {file_path}: {e}")
            return ""
    
    def process_all(self):
        """Process all Word documents in the data directory"""
        results = []
        
        # Find all Word documents
        docx_files = [f for f in os.listdir(self.data_dir) 
                     if f.lower().endswith('.docx')]
        
        # Process Word documents
        for file in tqdm(docx_files, desc="Processing Word documents"):
            file_path = os.path.join(self.data_dir, file)
            try:
                text = self.extract_text_from_docx(file_path)
                results.append({
                    'filename': file,
                    'text': text,
                    'size': os.path.getsize(file_path)
                })
                print(f"Successfully processed {file}")
            except Exception as e:
                print(f"Error processing {file}: {e}")
        
        return results

In [31]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import os
import re

class EnhancedRealEstateRAG:
    def __init__(self, 
                 text_chunks_path="processed_chunks.pkl", 
                 structured_data_path="processed_structured_data.pkl",
                 embedding_model="all-MiniLM-L6-v2"):
        """
        Initialize the enhanced RAG system with document chunks and embedding model.
        
        Args:
            text_chunks_path: Path to the pickle file with processed text chunks
            structured_data_path: Path to the pickle file with processed structured data
            embedding_model: SentenceTransformer model to use for embeddings
        """
        self.text_chunks_path = text_chunks_path
        self.structured_data_path = structured_data_path
        self.embedding_model_name = embedding_model
        self.embedding_model = SentenceTransformer(embedding_model)
        
        self.text_index = None
        self.text_chunks_df = None
        self.text_embeddings = None
        
        self.structured_data = None
        self.structured_data_descriptions = None
        self.structured_data_index = None
        self.structured_data_embeddings = None
        
        # Load data if it exists
        self.load_data()
    
    def load_data(self):
        """Load document chunks and structured data from pickle files."""
        # Load text chunks
        if os.path.exists(self.text_chunks_path):
            print(f"Loading text chunks from {self.text_chunks_path}...")
            self.text_chunks_df = pd.read_pickle(self.text_chunks_path)
            print(f"Loaded {len(self.text_chunks_df)} text chunks")
        
        # Load structured data
        if os.path.exists(self.structured_data_path):
            print(f"Loading structured data from {self.structured_data_path}...")
            with open(self.structured_data_path, 'rb') as f:
                self.structured_data = pickle.load(f)
            print(f"Loaded structured data with {len(self.structured_data)} files")
            
            # Create descriptions for structured data
            self.create_structured_data_descriptions()
    
    def create_structured_data_descriptions(self):
        """Create searchable descriptions for structured data."""
        if not self.structured_data:
            return
        
        descriptions = []
        for item in self.structured_data:
            df = item['data']
            filename = item['filename']
            
            # Create a description of the dataset
            description = f"Dataset: {filename}\n"
            description += f"Contains {len(df)} rows and {len(df.columns)} columns.\n"
            description += f"Columns: {', '.join(df.columns.tolist())}\n"
            
            # Add sample data (first few rows)
            description += "Sample data:\n"
            sample = df.head(3).to_string()
            description += sample
            
            descriptions.append({
                'source_file': filename,
                'text': description,
                'type': 'structured'
            })
            
            # Also add column-specific descriptions
            for column in df.columns:
                col_desc = f"Column '{column}' in dataset {filename}.\n"
                try:
                    # Add statistical information if numeric
                    if pd.api.types.is_numeric_dtype(df[column]):
                        col_desc += f"Min: {df[column].min()}, Max: {df[column].max()}, Mean: {df[column].mean():.2f}\n"
                    # Add unique values if categorical with few values
                    elif df[column].nunique() < 10:
                        col_desc += f"Values: {', '.join(map(str, df[column].unique()[:10]))}\n"
                    # Add value counts for high-frequency items
                    value_counts = df[column].value_counts().nlargest(5).to_dict()
                    col_desc += "Most common values: " + ", ".join([f"{k}: {v}" for k, v in value_counts.items()])
                except:
                    # Skip if there's an error calculating stats
                    pass
                
                descriptions.append({
                    'source_file': filename,
                    'text': col_desc,
                    'type': 'structured_column',
                    'column': column
                })
        
        self.structured_data_descriptions = pd.DataFrame(descriptions)
    
    def create_embeddings(self, save_text_path="text_embeddings.pkl", save_structured_path="structured_embeddings.pkl"):
        """Create embeddings for all content."""
        # Create text embeddings
        if self.text_chunks_df is not None:
            print(f"Creating text embeddings using {self.embedding_model_name}...")
            texts = self.text_chunks_df['text'].tolist()
            
            # Embed in batches
            batch_size = 64
            embeddings = []
            
            for i in tqdm(range(0, len(texts), batch_size), desc="Creating text embeddings"):
                batch_texts = texts[i:i + batch_size]
                batch_embeddings = self.embedding_model.encode(batch_texts)
                embeddings.append(batch_embeddings)
            
            self.text_embeddings = np.vstack(embeddings)
            
            # Save embeddings
            if save_text_path:
                with open(save_text_path, 'wb') as f:
                    pickle.dump(self.text_embeddings, f)
                print(f"Saved text embeddings to {save_text_path}")
        
        # Create structured data embeddings
        if self.structured_data_descriptions is not None:
            print(f"Creating structured data embeddings using {self.embedding_model_name}...")
            descriptions = self.structured_data_descriptions['text'].tolist()
            
            # Embed in batches
            batch_size = 64
            embeddings = []
            
            for i in tqdm(range(0, len(descriptions), batch_size), desc="Creating structured data embeddings"):
                batch_texts = descriptions[i:i + batch_size]
                batch_embeddings = self.embedding_model.encode(batch_texts)
                embeddings.append(batch_embeddings)
            
            self.structured_data_embeddings = np.vstack(embeddings)
            
            # Save embeddings
            if save_structured_path:
                with open(save_structured_path, 'wb') as f:
                    pickle.dump(self.structured_data_embeddings, f)
                print(f"Saved structured data embeddings to {save_structured_path}")
    
    def load_embeddings(self, text_embeddings_path="text_embeddings.pkl", 
                       structured_embeddings_path="structured_embeddings.pkl"):
        """Load pre-computed embeddings."""
        # Load text embeddings
        if os.path.exists(text_embeddings_path):
            print(f"Loading text embeddings from {text_embeddings_path}...")
            with open(text_embeddings_path, 'rb') as f:
                self.text_embeddings = pickle.load(f)
            print(f"Loaded text embeddings with shape {self.text_embeddings.shape}")
        
        # Load structured data embeddings
        if os.path.exists(structured_embeddings_path) and self.structured_data_descriptions is not None:
            print(f"Loading structured data embeddings from {structured_embeddings_path}...")
            with open(structured_embeddings_path, 'rb') as f:
                self.structured_data_embeddings = pickle.load(f)
            print(f"Loaded structured data embeddings with shape {self.structured_data_embeddings.shape}")
    
    def build_indices(self):
        """Build FAISS indices for fast similarity search."""
        # Build text index
        if self.text_embeddings is not None:
            print("Building text FAISS index...")
            dimension = self.text_embeddings.shape[1]
            self.text_index = faiss.IndexFlatL2(dimension)
            self.text_index.add(self.text_embeddings.astype('float32'))
            print(f"Built text index with {self.text_index.ntotal} vectors")
        
        # Build structured data index
        if self.structured_data_embeddings is not None:
            print("Building structured data FAISS index...")
            dimension = self.structured_data_embeddings.shape[1]
            self.structured_data_index = faiss.IndexFlatL2(dimension)
            self.structured_data_index.add(self.structured_data_embeddings.astype('float32'))
            print(f"Built structured data index with {self.structured_data_index.ntotal} vectors")
    
    def classify_query(self, query):
        """Classify whether the query is more likely to be for text or structured data."""
        # Keywords that suggest structured data queries
        structured_keywords = [
            'how many', 'average', 'median', 'statistics', 'price range', 'data', 
            'dataset', 'spreadsheet', 'chart', 'graph', 'table', 'column', 'row',
            'excel', 'csv', 'percentage', 'trend', 'distribution', 'count'
        ]
        
        # Simple rule-based classification
        if any(keyword in query.lower() for keyword in structured_keywords):
            return "structured"
        
        # More complex classification could be implemented here
        return "text"
    
    def search(self, query, k=5, query_type=None):
        """
        Search for content most similar to the query.
        
        Args:
            query: The search query
            k: Number of results to return
            query_type: Force search in "text" or "structured" data, or "both"
        
        Returns:
            List of dictionaries with content and metadata
        """
        # Auto-classify query if not specified
        if query_type is None:
            query_type = self.classify_query(query)
        
        # Embed the query
        query_embedding = self.embedding_model.encode([query])
        
        results = []
        
        # Search text index
        if query_type in ["text", "both"] and self.text_index is not None:
            distances, indices = self.text_index.search(query_embedding.astype('float32'), k)
            
            for i, idx in enumerate(indices[0]):
                if idx < len(self.text_chunks_df):  # Ensure index is valid
                    chunk = self.text_chunks_df.iloc[idx]
                    results.append({
                        'chunk_id': chunk['chunk_id'],
                        'source_file': chunk['source_file'],
                        'text': chunk['text'],
                        'distance': distances[0][i],
                        'type': 'text'
                    })
        
        # Search structured data index
        if query_type in ["structured", "both"] and self.structured_data_index is not None:
            # Adjust k to get enough results when combining
            s_k = k if query_type == "structured" else k // 2
            
            distances, indices = self.structured_data_index.search(query_embedding.astype('float32'), s_k)
            
            for i, idx in enumerate(indices[0]):
                if idx < len(self.structured_data_descriptions):  # Ensure index is valid
                    item = self.structured_data_descriptions.iloc[idx]
                    results.append({
                        'source_file': item['source_file'],
                        'text': item['text'],
                        'distance': distances[0][i],
                        'type': item['type']
                    })
        
        # Sort by distance
        results.sort(key=lambda x: x['distance'])
        
        # Limit to k results
        return results[:k]
    
    def answer_question(self, question, k=5):
        """
        Answer a question using RAG.
        
        Args:
            question: The question to answer
            k: Number of chunks to retrieve
        
        Returns:
            Dictionary with retrieved context and metadata
        """
        # Determine if this is a structured or unstructured data question
        query_type = self.classify_query(question)
        
        # Retrieve relevant content
        relevant_content = self.search(question, k=k, query_type=query_type)
        
        # Combine context
        context = "\n\n".join([f"From {item['source_file']}:\n{item['text']}" 
                              for item in relevant_content])
        
        return {
            'question': question,
            'retrieved_content': relevant_content,
            'context': context,
            'query_type': query_type
        }
    
    def extract_structured_data(self, filename, column=None, filters=None):
        """
        Extract specific data from structured datasets.
        
        Args:
            filename: Name of the dataset file
            column: Specific column to extract
            filters: Dictionary of column:value pairs to filter the data
        
        Returns:
            DataFrame or Series with the extracted data
        """
        if not self.structured_data:
            return None
        
        # Find the dataset
        dataset = None
        for item in self.structured_data:
            if item['filename'] == filename:
                dataset = item['data']
                break
        
        if dataset is None:
            return None
        
        # Apply filters
        if filters:
            for col, value in filters.items():
                if col in dataset.columns:
                    dataset = dataset[dataset[col] == value]
        
        # Extract specific column
        if column and column in dataset.columns:
            return dataset[column]
        
        return dataset

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import os
from tqdm import tqdm

class FineTuningDataGenerator:
    """Generate data for fine-tuning LLMs on real estate content"""
    
    def __init__(self, text_chunks_path="processed_chunks.pkl", output_dir="fine_tuning_data"):
        self.text_chunks_path = text_chunks_path
        self.output_dir = output_dir
        self.chunks_df = None
        
        # Load chunks data
        if os.path.exists(text_chunks_path):
            self.chunks_df = pd.read_pickle(text_chunks_path)
            print(f"Loaded {len(self.chunks_df)} text chunks for fine-tuning data generation")
        
        # Create output directory if needed
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    
    def generate_qa_pairs(self, num_pairs=100, min_context_length=200):
        """
        Generate question-answer pairs for fine-tuning based on chunks.
        
        This creates synthetic QA pairs from the text chunks.
        """
        if self.chunks_df is None:
            print("No chunks data available. Please check the path.")
            return
        
        # Filter chunks by minimum length
        valid_chunks = self.chunks_df[self.chunks_df['text'].str.len() > min_context_length]
        if len(valid_chunks) < num_pairs:
            print(f"Warning: Only {len(valid_chunks)} valid chunks available.")
            num_pairs = len(valid_chunks)
        
        # Sample random chunks
        selected_chunks = valid_chunks.sample(num_pairs)
        
        qa_pairs = []
        
        question_templates = [
            "What does the document say about {topic}?",
            "Can you explain {topic} based on the text?",
            "What information is provided about {topic}?",
            "How does the document describe {topic}?",
            "What are the key points mentioned about {topic}?"
        ]
        
        # Extract topics from chunks and create QA pairs
        for _, chunk in tqdm(selected_chunks.iterrows(), total=len(selected_chunks), 
                            desc="Generating QA pairs"):
            # Extract potential topic words (nouns) using simple heuristics
            # In a real implementation, you might use NLP libraries for better extraction
            text = chunk['text']
            words = text.split()
            
            # Find potential topic words (longer words, likely to be meaningful)
            potential_topics = [word for word in words 
                               if len(word) > 5 and word.isalpha()]
            
            if not potential_topics:
                # If no good topics found, use generic template
                question = "What information is provided in this text?"
                answer = text
            else:
                # Select a random topic and question template
                topic = np.random.choice(potential_topics)
                question_template = np.random.choice(question_templates)
                question = question_template.format(topic=topic)
                answer = text
            
            qa_pairs.append({
                'question': question,
                'answer': answer,
                'source': chunk['source_file']
            })
        
        # Save the generated QA pairs
        output_path = os.path.join(self.output_dir, "qa_pairs.json")
        with open(output_path, 'w') as f:
            json.dump(qa_pairs, f, indent=2)
        
        print(f"Generated {len(qa_pairs)} QA pairs and saved to {output_path}")
        return qa_pairs
    
    def generate_fine_tuning_formats(self, qa_pairs=None, test_size=0.2):
        """
        Format QA pairs for different fine-tuning methods.
        
        Generates formats for:
        1. OpenAI fine-tuning (JSONL)
        2. Hugging Face fine-tuning (CSV)
        """
        if qa_pairs is None:
            # Try to load QA pairs
            qa_path = os.path.join(self.output_dir, "qa_pairs.json")
            if os.path.exists(qa_path):
                with open(qa_path, 'r') as f:
                    qa_pairs = json.load(f)
            else:
                print("No QA pairs provided or found. Please generate QA pairs first.")
                return
        
        # Split into train and test sets
        train_pairs, test_pairs = train_test_split(qa_pairs, test_size=test_size, random_state=42)
        
        # 1. OpenAI fine-tuning format (JSONL)
        openai_train = []
        openai_test = []
        
        for pair in train_pairs:
            openai_train.append({
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant specialized in real estate."},
                    {"role": "user", "content": pair['question']},
                    {"role": "assistant", "content": pair['answer']}
                ]
            })
        
        for pair in test_pairs:
            openai_test.append({
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant specialized in real estate."},
                    {"role": "user", "content": pair['question']},
                    {"role": "assistant", "content": pair['answer']}
                ]
            })
        
        # Save OpenAI format
        with open(os.path.join(self.output_dir, "openai_train.jsonl"), 'w') as f:
            for item in openai_train:
                f.write(json.dumps(item) + '\n')
        
        with open(os.path.join(self.output_dir, "openai_test.jsonl"), 'w') as f:
            for item in openai_test:
                f.write(json.dumps(item) + '\n')
        
        # 2. Hugging Face fine-tuning format (CSV)
        hf_train_data = pd.DataFrame({
            'input': [pair['question'] for pair in train_pairs],
            'output': [pair['answer'] for pair in train_pairs]
        })
        
        hf_test_data = pd.DataFrame({
            'input': [pair['question'] for pair in test_pairs],
            'output': [pair['answer'] for pair in test_pairs]
        })
        
        # Save Hugging Face format
        hf_train_data.to_csv(os.path.join(self.output_dir, "hf_train.csv"), index=False)
        hf_test_data.to_csv(os.path.join(self.output_dir, "hf_test.csv"), index=False)
        
        print(f"Created fine-tuning datasets in multiple formats:")
        print(f"OpenAI: {len(openai_train)} training, {len(openai_test)} testing examples")
        print(f"Hugging Face: {len(hf_train_data)} training, {len(hf_test_data)} testing examples")
        
        return {
            'openai': {'train': openai_train, 'test': openai_test},
            'huggingface': {'train': hf_train_data, 'test': hf_test_data}
        }

In [34]:
import pandas as pd
import numpy as np
import os
import re
import pickle
from enhanced_rag import EnhancedRealEstateRAG

class RealEstateChatbot:
    def __init__(self, rag_system=None):
        """
        Initialize the chatbot with a RAG system.
        
        Args:
            rag_system: An initialized EnhancedRealEstateRAG object
        """
        # Initialize RAG if not provided
        if rag_system is None:
            print("Initializing new RAG system...")
            self.rag = EnhancedRealEstateRAG()
            
            # Load embeddings and build index
            if os.path.exists("text_embeddings.pkl"):
                self.rag.load_embeddings()
                self.rag.build_indices()
            else:
                print("No embeddings found. Please create embeddings first.")
                return
        else:
            self.rag = rag_system
        
        self.conversation_history = []
        self.last_query_type = None
        self.last_structured_data_file = None
    
    def answer(self, query, k=5, show_context=False, query_type=None):
        """
        Answer a user question by retrieving relevant context.
        
        Args:
            query: The user's question
            k: Number of contexts to retrieve
            show_context: Whether to display the retrieved context
            query_type: Force a specific query type ("text", "structured", or "both")
            
        Returns:
            A formatted response
        """
        # Save the user's query to conversation history
        self.conversation_history.append({"role": "user", "content": query})
        
        # Check for special commands
        if query.lower().startswith("analyze "):
            return self._handle_analysis_command(query[8:])
        
        # Retrieve relevant information
        retrieval_results = self.rag.answer_question(query, k=k)
        context = retrieval_results['context']
        self.last_query_type = retrieval_results['query_type']
        
        # Show the retrieved context if requested
        if show_context:
            print("RETRIEVED CONTEXT:")
            print("-" * 80)
            print(context)
            print("-" * 80)
            print()
        
        # Get sources for citation
        sources = []
        structured_files = []
        for item in retrieval_results['retrieved_content']:
            source = item['source_file']
            if source not in sources:
                sources.append(source)
            
            # Track structured data files for follow-up
            if item.get('type') in ['structured', 'structured_column']:
                if source not in structured_files:
                    structured_files.append(source)
                    self.last_structured_data_file = source
        
        # Format response based on query type
        if self.last_query_type == "structured":
            response = self._format_structured_response(retrieval_results, structured_files)
        else:
            response = self._format_text_response(retrieval_results)
        
        # Add the response to conversation history
        self.conversation_history.append({"role": "assistant", "content": response})
        
        return response
    
    def _format_text_response(self, retrieval_results):
        """Format response for text-based queries."""
        response = "Based on the real estate documents I've analyzed:\n\n"
        
        # Group by source file for better organization
        by_source = {}
        for item in retrieval_results['retrieved_content']:
            source = item['source_file']
            if source not in by_source:
                by_source[source] = []
            by_source[source].append(item['text'])
        
        # Build response from each source
        for source, texts in by_source.items():
            response += f"From {source}:\n"
            
            # Join and truncate content from this source
            combined_text = "\n".join(texts)
            truncated = combined_text[:500] + "..." if len(combined_text) > 500 else combined_text
            response += f"{truncated}\n\n"
        
        # Add sources
        sources = list(by_source.keys())
        response += f"Sources: {', '.join(sources)}"
        
        return response
    
    def _format_structured_response(self, retrieval_results, structured_files):
        """Format response for structured data queries."""
        response = "Based on the real estate data analysis:\n\n"
        
        for item in retrieval_results['retrieved_content']:
            if item.get('type') in ['structured', 'structured_column']:
                response += f"From {item['source_file']}:\n"
                response += f"{item['text']}\n\n"
        
        # Add follow-up hint
        if structured_files:
            response += f"You can ask me to analyze specific aspects of the {structured_files[0]} dataset. "
            response += f"For example, try 'analyze {structured_files[0]} by price range'.\n\n"
        
        return response
    
    def _handle_analysis_command(self, command):
        """Handle special analysis commands for structured data."""
        # Parse command to extract file and analysis type
        parts = command.strip().split()
        if len(parts) < 2:
            return "Please specify what to analyze. For example: 'analyze dataset.csv by price'"
        
        filename = parts[0]
        analysis_type = " ".join(parts[1:])
        
        # Find the structured dataset
        if not self.rag.structured_data:
            return "No structured data is available for analysis."
        
        dataset = None
        for item in self.rag.structured_data:
            if item['filename'] == filename:
                dataset = item['data']
                break
        
        if dataset is None:
            return f"Dataset '{filename}' not found. Available datasets: " + \
                   ", ".join([item['filename'] for item in self.rag.structured_data])
        
        # Perform different analyses based on the command
        try:
            if "summary" in analysis_type.lower():
                return self._generate_summary_stats(dataset, filename)
            elif "price range" in analysis_type.lower() or "price distribution" in analysis_type.lower():
                return self._analyze_price_distribution(dataset, filename)
            elif "location" in analysis_type.lower() or "area" in analysis_type.lower():
                return self._analyze_by_location(dataset, filename)
            elif "trend" in analysis_type.lower() or "time" in analysis_type.lower():
                return self._analyze_time_trends(dataset, filename)
            else:
                return f"I'm not sure how to analyze {filename} by {analysis_type}. " + \
                       "Try asking for a summary, price range, location analysis, or time trends."
        except Exception as e:
            return f"Error analyzing {filename}: {str(e)}"
    
    def _generate_summary_stats(self, df, filename):
        """Generate summary statistics for a dataset."""
        response = f"## Summary Statistics for {filename}\n\n"
        
        # Basic dataset info
        response += f"This dataset contains {len(df)} properties with {len(df.columns)} attributes.\n\n"
        
        # Try to identify numeric columns for statistics
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if numeric_cols:
            response += "### Key Metrics\n\n"
            
            # Look for price/value columns
            price_cols = [col for col in numeric_cols 
                         if any(term in col.lower() for term in ['price', 'value', 'cost', 'amount'])]
            
            if price_cols:
                for col in price_cols[:2]:  # Limit to first 2 price columns
                    response += f"**{col}**:\n"
                    response += f"- Average: ${df[col].mean():.2f}\n"
                    response += f"- Median: ${df[col].median():.2f}\n"
                    response += f"- Range: ${df[col].min():.2f} to ${df[col].max():.2f}\n\n"
            
            # Look for size/area columns
            size_cols = [col for col in numeric_cols 
                        if any(term in col.lower() for term in ['size', 'area', 'sqft', 'feet', 'acre'])]
            
            if size_cols:
                for col in size_cols[:2]:  # Limit to first 2 size columns
                    response += f"**{col}**:\n"
                    response += f"- Average: {df[col].mean():.2f}\n"
                    response += f"- Median: {df[col].median():.2f}\n"
                    response += f"- Range: {df[col].min():.2f} to {df[col].max():.2f}\n\n"
        
        # Try to identify categorical columns
        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
        if categorical_cols:
            # Look for location columns
            location_cols = [col for col in categorical_cols 
                           if any(term in col.lower() for term in ['city', 'state', 'zip', 'location', 'address'])]
            
            if location_cols:
                for col in location_cols[:1]:  # Limit to first location column
                    top_locations = df[col].value_counts().nlargest(5)
                    response += f"**Top {col}**:\n"
                    for loc, count in top_locations.items():
                        response += f"- {loc}: {count} properties ({count/len(df)*100:.1f}%)\n"
                    response += "\n"
            
            # Look for property type columns
            type_cols = [col for col in categorical_cols 
                       if any(term in col.lower() for term in ['type', 'style', 'property', 'category'])]
            
            if type_cols:
                for col in type_cols[:1]:  # Limit to first type column
                    top_types = df[col].value_counts().nlargest(5)
                    response += f"**Top {col}**:\n"
                    for typ, count in top_types.items():
                        response += f"- {typ}: {count} properties ({count/len(df)*100:.1f}%)\n"
                    response += "\n"
        
        response += "You can ask for more specific analyses like 'analyze " + \
                    f"{filename} by price range' or 'analyze {filename} by location'"
        
        return response
    
    def _analyze_price_distribution(self, df, filename):
        """Analyze price distribution in the dataset."""
        # Try to find price column
        price_cols = [col for col in df.columns 
                     if any(term in col.lower() for term in ['price', 'value', 'cost', 'amount'])]
        
        if not price_cols:
            return f"Could not identify a price column in {filename}."
        
        price_col = price_cols[0]  # Use the first price column found
        
        # Ensure the column is numeric
        if not pd.api.types.is_numeric_dtype(df[price_col]):
            return f"The column {price_col} is not numeric, cannot analyze price distribution."
        
        response = f"## Price Distribution Analysis for {filename}\n\n"
        
        # Calculate price ranges
        min_price = df[price_col].min()
        max_price = df[price_col].max()
        
        response += f"Price range from ${min_price:.2f} to ${max_price:.2f}\n\n"
        
        # Create price brackets
        brackets = 5
        bracket_size = (max_price - min_price) / brackets
        
        response += "### Price Brackets\n\n"
        for i in range(brackets):
            lower = min_price + i * bracket_size
            upper = lower + bracket_size
            count = df[(df[price_col] >= lower) & (df[price_col] < upper)].shape[0]
            percentage = count / len(df) * 100
            
            response += f"- ${lower:.2f} to ${upper:.2f}: {count} properties ({percentage:.1f}%)\n"
        
        # Properties above the highest bracket
        count = df[df[price_col] >= max_price].shape[0]
        percentage = count / len(df) * 100
        response += f"- ${max_price:.2f} and above: {count} properties ({percentage:.1f}%)\n\n"
        
        return response
    
    def _analyze_by_location(self, df, filename):
        """Analyze properties by location."""
        # Try to find location columns
        location_cols = [col for col in df.columns 
                       if any(term in col.lower() for term in ['city', 'state', 'zip', 'location', 'address'])]
        
        if not location_cols:
            return f"Could not identify a location column in {filename}."
        
        location_col = location_cols[0]  # Use the first location column found
        
        response = f"## Location Analysis for {filename}\n\n"
        
        # Count properties by location
        location_counts = df[location_col].value_counts().nlargest(10)
        
        response += "### Top 10 Locations\n\n"
        for location, count in location_counts.items():
            percentage = count / len(df) * 100
            response += f"- {location}: {count} properties ({percentage:.1f}%)\n"
        
        # Try to find price column to analyze price by location
        price_cols = [col for col in df.columns 
                     if any(term in col.lower() for term in ['price', 'value', 'cost', 'amount'])]
        
        if price_cols and pd.api.types.is_numeric_dtype(df[price_cols[0]]):
            price_col = price_cols[0]
            response += f"\n### Average Prices by Top 5 Locations\n\n"
            
            for location in location_counts.index[:5]:
                avg_price = df[df[location_col] == location][price_col].mean()
                response += f"- {location}: ${avg_price:.2f}\n"
        
        return response
    
    def _analyze_time_trends(self, df, filename):
        """Analyze trends over time."""
        # Try to find date columns
        date_cols = [col for col in df.columns 
                   if any(term in col.lower() for term in ['date', 'year', 'month', 'time'])]
        
        if not date_cols:
            return f"Could not identify a date/time column in {filename}."
        
        date_col = date_cols[0]  # Use the first date column found
        
        # Try to convert to datetime if not already
        try:
            if not pd.api.types.is_datetime64_dtype(df[date_col]):
                df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        except:
            return f"Could not convert {date_col} to a valid date format."
        
        # Drop rows with invalid dates
        df = df.dropna(subset=[date_col])
        
        response = f"## Time Trend Analysis for {filename}\n\n"
        
        # Extract year and month
        df['year'] = df[date_col].dt.year
        
        # Count by year
        year_counts = df['year'].value_counts().sort_index()
        
        response += "### Properties by Year\n\n"
        for year, count in year_counts.items():
            percentage = count / len(df) * 100
            response += f"- {year}: {count} properties ({percentage:.1f}%)\n"
        
        # Try to find price column to analyze price trends
        price_cols = [col for col in df.columns 
                     if any(term in col.lower() for term in ['price', 'value', 'cost', 'amount'])]
        
        if price_cols and pd.api.types.is_numeric_dtype(df[price_cols[0]]):
            price_col = price_cols[0]
            response += f"\n### Average Prices by Year\n\n"
            
            price_by_year = df.groupby('year')[price_col].mean()
            
            for year, avg_price in price_by_year.items():
                response += f"- {year}: ${avg_price:.2f}\n"
            
            # Calculate price change between first and last year
            if len(price_by_year) > 1:
                first_year = price_by_year.index[0]
                last_year = price_by_year.index[-1]
                price_change = ((price_by_year[last_year] / price_by_year[first_year]) - 1) * 100
                
                response += f"\nPrice change from {first_year} to {last_year}: {price_change:.1f}%\n"
        
        return response
    
    def chat(self):
        """Start an interactive chat session."""
        print("Welcome to RealEstateGPT! Ask me anything about real estate.")
        print("Type 'exit' to end the conversation.\n")
        
        while True:
            query = input("You: ")
            if query.lower() in ['exit', 'quit', 'bye']:
                print("RealEstateGPT: Goodbye! Hope I was helpful.")
                break
            
            answer = self.answer(query, show_context=False)
            print(f"\nRealEstateGPT: {answer}\n")

ModuleNotFoundError: No module named 'enhanced_rag'

In [35]:
#MAIN APPLICATION
import os
import argparse
from data_processors import DataProcessor, StructuredDataProcessor, DocProcessor
from enhanced_rag import EnhancedRealEstateRAG
from real_estate_chatbot import RealEstateChatbot
from fine_tuning import FineTuningDataGenerator

def process_all_data(data_dir="data", force_reprocess=False):
    """Process all data sources and generate necessary files."""
    # Check if processed files already exist
    text_chunks_exists = os.path.exists("processed_chunks.pkl")
    structured_data_exists = os.path.exists("processed_structured_data.pkl")
    
    if text_chunks_exists and structured_data_exists and not force_reprocess:
        print("Processed data files already exist. Use --force to reprocess.")
        return
    
    # Process PDF files (using your existing code)
    if not text_chunks_exists or force_reprocess:
        print("Processing PDF files...")
        # You can call your existing functions here
        # We'll assume they create processed_chunks.pkl
    
    # Process structured data
    if not structured_data_exists or force_reprocess:
        print("Processing structured data...")
        structured_processor = StructuredDataProcessor(data_dir)
        structured_data = structured_processor.process_all()
        structured_processor.save_processed_data(structured_data, "processed_structured_data.pkl")
        print(f"Processed {len(structured_data)} structured data files")
    
    # Process Word documents
    print("Processing Word documents...")
    doc_processor = DocProcessor(data_dir)
    doc_data = doc_processor.process_all()
    doc_processor.save_processed_data(doc_data, "processed_doc_data.pkl")
    print(f"Processed {len(doc_data)} Word documents")

def create_embeddings(force_recreate=False):
    """Create embeddings for all data sources."""
    text_embeddings_exists = os.path.exists("text_embeddings.pkl")
    structured_embeddings_exists = os.path.exists("structured_embeddings.pkl")
    
    if text_embeddings_exists and structured_embeddings_exists and not force_recreate:
        print("Embedding files already exist. Use --force to recreate.")
        return
    
    # Initialize RAG system
    rag = EnhancedRealEstateRAG()
    
    # Create embeddings
    rag.create_embeddings()

def prepare_fine_tuning_data(num_pairs=1000):
    """Prepare data for fine-tuning experiments."""
    print(f"Generating fine-tuning data with {num_pairs} QA pairs...")
    
    # Initialize data generator
    generator = FineTuningDataGenerator()
    
    # Generate QA pairs
    qa_pairs = generator.generate_qa_pairs(num_pairs=num_pairs)
    
    # Format for different fine-tuning approaches
    generator.generate_fine_tuning_formats(qa_pairs)

def start_chatbot():
    """Initialize and start the chatbot."""
    print("Initializing Real Estate Chatbot...")
    
    # Initialize RAG system
    rag = EnhancedRealEstateRAG()
    rag.load_embeddings()
    rag.build_indices()
    
    # Initialize chatbot
    chatbot = RealEstateChatbot(rag)
    
    # Start interactive chat
    chatbot.chat()

def main():
    parser = argparse.ArgumentParser(description='RealEstateLLM - AI-powered real estate chatbot')
    
    parser.add_argument('--process', action='store_true', 
                        help='Process all data sources')
    parser.add_argument('--embeddings', action='store_true', 
                        help='Create embeddings for all data')
    parser.add_argument('--finetune', action='store_true', 
                        help='Prepare data for fine-tuning')
    parser.add_argument('--chat', action='store_true', 
                        help='Start the chatbot')
    parser.add_argument('--force', action='store_true', 
                        help='Force reprocessing of existing files')
    parser.add_argument('--qa-pairs', type=int, default=1000, 
                        help='Number of QA pairs to generate for fine-tuning')
    
    args = parser.parse_args()
    
    # If no arguments, show help
    if not any(vars(args).values()):
        parser.print_help()
        return
    
    # Process data if requested
    if args.process:
        process_all_data(force_reprocess=args.force)
    
    # Create embeddings if requested
    if args.embeddings:
        create_embeddings(force_recreate=args.force)
    
    # Prepare fine-tuning data if requested
    if args.finetune:
        prepare_fine_tuning_data(num_pairs=args.qa_pairs)
    
    # Start chatbot if requested
    if args.chat:
        start_chatbot()

if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'data_processors'