In [1]:
import markdown2
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

  from tqdm.autonotebook import tqdm, trange


In [2]:
with open('alice_in_wonderland.md', 'r', encoding='utf-8') as file:
    markdown_content = file.read()


In [3]:
text_content = markdown2.markdown(markdown_content)


In [4]:
# Wrap the text in a Document object
documents = [Document(page_content=text_content, metadata={"source": "alice_in_wonderland.md"})]


In [8]:
def split_text(documents: list[Document]):
    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,           # Maximum size of each chunk
        chunk_overlap=50,         # Overlap between chunks
        length_function=len,       # Function to determine the length of the chunk
        add_start_index=True,      # Include the starting index in the metadata
    )

    # Split the documents into chunks
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Example: Inspect the content and metadata of the 10th chunk
    document = chunks[10]
    print(document.page_content)  # Print the chunk content
    print(document.metadata)      # Print the metadata

    return chunks


In [9]:
chunks = split_text(documents)


Split 1 documents into 464 chunks.
passed; it was labelled “ORANGE MARMALADE”, but to her great
disappointment it was empty: she did not like to drop the jar for fear
of killing somebody underneath, so managed to put it into one of the
cupboards as she fell past it.</p>
{'source': 'alice_in_wonderland.md', 'start_index': 3679}


In [10]:
embeddingModel = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose another model as well

# Embed the text chunks
chunk_embeddings = []
for chunk in chunks:
    embedding = embeddingModel.encode(chunk.page_content)
    chunk_embeddings.append({
        "embedding": embedding,
        "metadata": chunk.metadata,
        "content": chunk.page_content
    })


In [11]:
from chromadb import Client
from chromadb.config import Settings

In [12]:
from uuid import uuid4  # Import uuid4 to generate unique IDs

# Initialize ChromaDB client
client = Client(Settings(persist_directory='./chromadb_data'))

# Create a collection in ChromaDB
collection = client.create_collection('story_chunks')


# Connect to the existing collection
collection = client.get_collection('story_chunks')

# Add embeddings and corresponding metadata to the database
for chunk_data in chunk_embeddings:
    # Generate a unique ID for each document
    doc_id = str(uuid4())
    
    # Convert the numpy array to a list
    embedding_list = chunk_data["embedding"].tolist()

    collection.add(
        ids=[doc_id],                      # Add the unique ID here
        documents=[chunk_data["content"]],
        embeddings=[embedding_list],       # Ensure embedding is in list format
        metadatas=[chunk_data["metadata"]]
    )


In [13]:
# Example query
query_text = "Describe Alice's age and appearance in the story."
query_embedding = embeddingModel.encode(query_text).tolist()

# Perform the query
query_results = collection.query(query_embeddings=[query_embedding], n_results=3)

# Print the results
print("Query results:")
for i, result in enumerate(query_results["documents"]):
    print(f"Result {i+1}:")
    print("Content:", result)
    print("Metadata:", query_results["metadatas"][i])
    print("ID:", query_results["ids"][i])



Query results:
Result 1:
Content: ['<p>And so it was indeed: she was now only ten inches high, and her face\nbrightened up at the thought that she was now the right size for going\nthrough the little door into that lovely garden. First, however, she\nwaited for a few minutes to see if she was going to shrink any further:\nshe felt a little nervous about this; “for it might end, you know,”\nsaid Alice to herself, “in my going out altogether, like a candle. I\nwonder what I should be like then?” And she tried to fancy what the', '<p>The first question of course was, how to get dry again: they had a\nconsultation about this, and after a few minutes it seemed quite\nnatural to Alice to find herself talking familiarly with them, as if\nshe had known them all her life. Indeed, she had quite a long argument\nwith the Lory, who at last turned sulky, and would only say, “I am\nolder than you, and must know better;” and this Alice would not allow\nwithout knowing how old it was, and, as the Lory

In [14]:
from langchain_community.llms import Ollama
# Define llm
llm = Ollama(model="gemma2:2b")

In [15]:
from langchain.chains import RetrievalQA
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate

# Adjusting the Prompt Template
prompt = """
1. Use the following context to answer the question at the end.
2. Be precise and avoid speculation. If the information isn't clear, say "I don't know."
3. Provide a concise, 2-3 sentence answer.

Context: {context}

Question: {question}

Accurate Answer:"""


QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt)

llm_chain = LLMChain(
    llm=llm, 
    prompt=QA_CHAIN_PROMPT, 
    callbacks=None, 
    verbose=True
)


  llm_chain = LLMChain(


In [16]:
document_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Context:\ncontent:{page_content}\nsource:{source}",
)

combine_documents_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="context",
    document_prompt=document_prompt,
    callbacks=None,
)

  combine_documents_chain = StuffDocumentsChain(


In [17]:
from langchain.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')


  embedding_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')


In [18]:
from langchain.vectorstores import Chroma

vector_store = Chroma.from_documents(
    documents=chunks,  # This should be your list of Document objects
    embedding=embedding_model,
    persist_directory='./chromadb_data',
)


In [19]:
qa_chain = RetrievalQA.from_llm(
    llm=llm,
    retriever=vector_store.as_retriever(),  # Use the retriever from the vector store
    verbose=True
)


In [26]:
query = "What happened alice when she fall down to rabbit hole? Tell me something about Alice after her falling down"

# Run the query through the chain
result = qa_chain.run(query)

print("Generated Answer:")
print(result)




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Generated Answer:
When Alice fell into the rabbit hole, she went straight down and ended up in a deep well.  After falling, she got back on her feet and continued chasing the rabbit. 

Here's some more about Alice after falling: 

* **She was unharmed:** After falling, Alice didn't get hurt.
* **Curious:** She found herself being curious about the rabbit.
* **Chasing the Rabbit:**  Alice followed the rabbit into a large hole.
* **She followed closely:** She ran to catch up with the rabbit and was right behind it. 


