### Create Vector Store

In [4]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os

# Alternative: Use TfidfVectorizer for embeddings (no download needed)
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def _load_text_documents(folder_path):
    """Load every ``.txt`` file found directly inside ``folder_path``.

    Files are visited in sorted name order so that the resulting chunk order
    (and therefore the row ids stored in the FAISS index) is reproducible;
    ``os.listdir`` alone returns entries in arbitrary order.

    A file that fails to load is reported and skipped rather than aborting
    the whole build.

    Returns:
        list: the documents returned by ``TextLoader.load()`` for each file.
    """
    loaded = []
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".txt"):
            continue
        try:
            loader = TextLoader(os.path.join(folder_path, file), encoding='utf-8')
            loaded.extend(loader.load())
            print(f"Loaded: {file}")
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not block the rest.
            print(f"Error loading {file}: {e}")
    return loaded


def build_vector_store(folder_path="docs/", store_path="faiss_store"):
    """Build a TF-IDF + FAISS vector store from the text files in a folder.

    Steps: load ``*.txt`` files, split them into ~1000-character chunks with
    100 characters of overlap, embed each chunk with a ``TfidfVectorizer``
    limited to 100 features, add the vectors to a ``faiss.IndexFlatL2`` and
    write the index plus the pickled chunks to ``store_path``.

    Args:
        folder_path: directory that holds the source ``.txt`` files.
        store_path: directory that receives ``index.faiss`` and ``docs.pkl``.

    Returns:
        The in-memory store wrapper on success, or ``None`` when nothing was
        built (missing folder, no loadable documents, or no chunks). Callers
        should check the result before reporting success.
    """
    if not os.path.isdir(folder_path):
        print(f"Error: '{folder_path}' directory not found!")
        return None

    docs = _load_text_documents(folder_path)
    if not docs:
        print("No documents found!")
        return None

    print(f"Total documents loaded: {len(docs)}")

    # Split documents into chunks
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = splitter.split_documents(docs)
    print(f"Total chunks created: {len(split_docs)}")

    if not split_docs:
        # Without this guard the vectorizer below fails on an empty corpus
        # with an error that does not point at the real cause.
        print("No text chunks produced!")
        return None

    class LocalEmbeddings:
        """Minimal embedding adapter backed by a TF-IDF vectorizer."""

        def __init__(self):
            self.vectorizer = TfidfVectorizer(max_features=100)

        def embed_documents(self, texts):
            """Fit the vectorizer on ``texts`` and return float32 row vectors."""
            return self.vectorizer.fit_transform(texts).toarray().astype(np.float32)

        def embed_query(self, text):
            """Embed one query string with the already-fitted vectorizer."""
            return self.vectorizer.transform([text]).toarray()[0].astype(np.float32)

    embeddings = LocalEmbeddings()

    # Create embeddings for all chunks
    doc_texts = [doc.page_content for doc in split_docs]
    embeddings_array = embeddings.embed_documents(doc_texts)

    # Build the FAISS index; its dimension is whatever the vectorizer produced.
    import faiss
    dimension = embeddings_array.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_array)

    class FAISSWrapper:
        """Bundles the index, the embedding adapter and the chunk list."""

        def __init__(self, index, embeddings, docs):
            self.index = index
            self.embeddings = embeddings
            self.docs = docs

        def save_local(self, path):
            """Write ``index.faiss`` and ``docs.pkl`` into ``path``."""
            import pickle
            os.makedirs(path, exist_ok=True)
            faiss.write_index(self.index, os.path.join(path, "index.faiss"))
            # Row i of the index corresponds to self.docs[i]; the fitted
            # vectorizer itself is not saved here.
            with open(os.path.join(path, "docs.pkl"), "wb") as f:
                pickle.dump(self.docs, f)
            print(f"Vector store saved to {path}")

    vector_store = FAISSWrapper(index, embeddings, split_docs)
    vector_store.save_local(store_path)
    print("Vector store saved successfully!")
    return vector_store


vector_store = build_vector_store()
# Report success only when a store was actually built; the early-exit paths
# above return None after printing their own diagnostic.
if vector_store is not None:
    print("Vector store created successfully!")

Loaded: cement.txt
Loaded: ml.txt
Loaded: pakistan_history.txt
Total documents loaded: 3
Total chunks created: 6
Vector store saved to faiss_store
Vector store saved successfully!
Vector store created successfully!


### Create Chatbot (with memory)

In [7]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
import faiss
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def create_chatbot(store_path="faiss_store"):
    """Load the saved vector store and return a retriever plus chat memory.

    Reads ``index.faiss`` and ``docs.pkl`` from ``store_path``, refits a
    ``TfidfVectorizer(max_features=100)`` on the stored chunks so queries can
    be embedded, and wraps the index in a small retriever object. No LLM is
    attached yet; only retrieval and a ``ConversationBufferMemory`` are set up.

    Args:
        store_path: directory containing ``index.faiss`` and ``docs.pkl``.

    Returns:
        tuple: ``(retriever, memory)`` where ``retriever`` exposes
        ``get_relevant_documents(query)``.

    Raises:
        ValueError: if the pickled chunk list or the refit vectorizer does not
            match the loaded index (different row count or vector dimension).
    """
    # Load the saved FAISS index
    faiss_index = faiss.read_index(os.path.join(store_path, "index.faiss"))

    # SECURITY: pickle.load can execute arbitrary code. Only open a docs.pkl
    # that was produced locally by this notebook, never an untrusted file.
    with open(os.path.join(store_path, "docs.pkl"), "rb") as f:
        docs = pickle.load(f)

    # Index row i must map to docs[i]; a mismatch means the two files come
    # from different builds and every lookup would return the wrong chunk.
    if len(docs) != faiss_index.ntotal:
        raise ValueError(
            f"Vector store is inconsistent: {faiss_index.ntotal} vectors "
            f"but {len(docs)} documents"
        )

    class LocalEmbeddings:
        """TF-IDF embedding adapter refit on the stored chunks."""

        def __init__(self):
            self.vectorizer = TfidfVectorizer(max_features=100)
            # Fit on the existing documents
            doc_texts = [doc.page_content for doc in docs]
            self.vectorizer.fit(doc_texts)

        def embed_documents(self, texts):
            """Embed several texts as float32 row vectors."""
            return self.vectorizer.transform(texts).toarray().astype(np.float32)

        def embed_query(self, text):
            """Embed a single query string as a float32 vector."""
            return self.vectorizer.transform([text]).toarray()[0].astype(np.float32)

        def similarity_search(self, query, k=4):
            """Return up to ``k`` stored chunks nearest to ``query``."""
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # docs[-1] would then silently return the last chunk as a hit.
            k = min(k, faiss_index.ntotal)
            if k <= 0:
                return []
            query_embedding = self.embed_query(query).reshape(1, -1)
            _, indices = faiss_index.search(query_embedding, k)
            return [docs[i] for i in indices[0] if i >= 0]

    embeddings = LocalEmbeddings()

    # The refit vocabulary must have the same width as the indexed vectors,
    # otherwise faiss_index.search fails with a low-level assertion.
    vocabulary_size = len(embeddings.vectorizer.vocabulary_)
    if vocabulary_size != faiss_index.d:
        raise ValueError(
            f"Embedding dimension {vocabulary_size} does not match "
            f"index dimension {faiss_index.d}"
        )

    class SimpleRetriever:
        """Thin retriever facade over ``LocalEmbeddings.similarity_search``."""

        def __init__(self, embeddings, docs):
            self.embeddings = embeddings
            self.docs = docs

        def get_relevant_documents(self, query):
            """Return the chunks most similar to ``query``."""
            return self.embeddings.similarity_search(query)

    retriever = SimpleRetriever(embeddings, docs)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    # For now, create a simple chatbot without LlamaCpp (since model file may not exist)
    # You can replace this with actual LLM later
    print("Chatbot initialized successfully!")
    print(f"Loaded {len(docs)} documents from vector store")

    return retriever, memory

# Build the retriever and conversation memory from the saved store on disk;
# `retriever` is what the test cell below queries.
retriever, memory = create_chatbot()

Chatbot initialized successfully!
Loaded 6 documents from vector store


  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


### Test Chatbot

In [10]:
# Smoke-test retrieval: run one question through the retriever and show a
# 500-character preview of every chunk it returns.
query = "What is cement and how is it made?"
relevant_docs = retriever.get_relevant_documents(query)

# Header: the query, a rule, then the section title (each on its own line).
print(f"Query: {query}\n", "=" * 50, "Relevant Documents:\n", sep="\n")

for i, doc in enumerate(relevant_docs, start=1):
    # One entry per hit: label, truncated text, separator rule.
    print(f"Document {i}:", doc.page_content[:500], "-" * 50, sep="\n")

Query: What is cement and how is it made?

Relevant Documents:

Document 1:
Cement is a fine powder made from limestone and other minerals, which acts as a binder when mixed with water. The main ingredient of cement is calcium carbonate, usually derived from limestone. Cement is used in construction as a binding material to hold together bricks, stones, and concrete.

The production process involves mining limestone, crushing it, and heating it in a kiln at high temperatures. This process produces clinker, which is then ground into a fine powder and mixed with gypsum. C
--------------------------------------------------
Document 2:
In construction, cement is mixed with sand and aggregate to form concrete. Concrete is one of the most widely used building materials due to its strength, durability, and flexibility in construction design. The ratio of cement to sand and aggregate determines the strength of the concrete. The water-cement ratio is also important; too much water can weaken th