In [1]:
import markdown2
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

  from tqdm.autonotebook import tqdm, trange


In [2]:
with open('alice_in_wonderland.md', 'r', encoding='utf-8') as file:
    markdown_content = file.read()


In [3]:
text_content = markdown2.markdown(markdown_content)


In [4]:
# Wrap the text in a Document object
documents = [Document(page_content=text_content, metadata={"source": "alice_in_wonderland.md"})]


In [25]:
def split_text(documents: list[Document]):
    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,           # Maximum size of each chunk
        chunk_overlap=50,         # Overlap between chunks
        length_function=len,       # Function to determine the length of the chunk
        add_start_index=True,      # Include the starting index in the metadata
    )

    # Split the documents into chunks
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Example: Inspect the content and metadata of the 10th chunk
    document = chunks[10]
    print(document.page_content)  # Print the chunk content
    print(document.metadata)      # Print the metadata

    return chunks


In [26]:
chunks = split_text(documents)


Split 1 documents into 587 chunks.
watch out of its waistcoat-pocket</em>, and looked at it, and then hurried
on, Alice started to her feet, for it flashed across her mind that she
had never before seen a rabbit with either a waistcoat-pocket, or a
watch to take out of it, and burning with curiosity, she ran across the
field after it, and fortunately was just in time to see it pop down a
large rabbit-hole under the hedge.</p>
{'source': 'alice_in_wonderland.md', 'start_index': 2450}


In [27]:
embeddingModel = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose another model as well

# Embed the text chunks
chunk_embeddings = []
for chunk in chunks:
    embedding = embeddingModel.encode(chunk.page_content)
    chunk_embeddings.append({
        "embedding": embedding,
        "metadata": chunk.metadata,
        "content": chunk.page_content
    })


In [8]:
from chromadb import Client
from chromadb.config import Settings

In [29]:
from uuid import uuid4  # Import uuid4 to generate unique IDs

# Initialize ChromaDB client
client = Client(Settings(persist_directory='./chromadb_data'))

# Create a collection in ChromaDB
#collection = client.create_collection('story_chunks')


# Connect to the existing collection
collection = client.get_collection('story_chunks')

# Add embeddings and corresponding metadata to the database
for chunk_data in chunk_embeddings:
    # Generate a unique ID for each document
    doc_id = str(uuid4())
    
    # Convert the numpy array to a list
    embedding_list = chunk_data["embedding"].tolist()

    collection.add(
        ids=[doc_id],                      # Add the unique ID here
        documents=[chunk_data["content"]],
        embeddings=[embedding_list],       # Ensure embedding is in list format
        metadatas=[chunk_data["metadata"]]
    )


In [30]:
# Example query
query_text = "Describe Alice's age and appearance in the story."
query_embedding = embeddingModel.encode(query_text).tolist()

# Perform the query
query_results = collection.query(query_embeddings=[query_embedding], n_results=2)

# Print the results
print("Query results:")
for i, result in enumerate(query_results["documents"]):
    print(f"Result {i+1}:")
    print("Content:", result)
    print("Metadata:", query_results["metadatas"][i])
    print("ID:", query_results["ids"][i])



Query results:
Result 1:
Content: ['older than you, and must know better;” and this Alice would not allow\nwithout knowing how old it was, and, as the Lory positively refused to\ntell its age, there was no more to be said.</p>', '<p>Just at this moment Alice felt a very curious sensation, which puzzled\nher a good deal until she made out what it was: she was beginning to\ngrow larger again, and she thought at first she would get up and leave\nthe court; but on second thoughts she decided to remain where she was\nas long as there was room for her.</p>']
Metadata: [{'source': 'alice_in_wonderland.md', 'start_index': 24980}, {'source': 'alice_in_wonderland.md', 'start_index': 134140}]
ID: ['a2e0fc1c-dc31-46dc-bc14-36c7c03faa31', '07458721-6602-4fe3-80f7-f2685b16a012']


In [31]:
from langchain_community.llms import Ollama
# Define llm
llm = Ollama(model="gemma2:2b")

In [32]:
from langchain.chains import RetrievalQA
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate

# Adjusting the Prompt Template
prompt = """
1. Use the following context to answer the question at the end.
2. Be precise and avoid speculation. If the information isn't clear, say "I don't know."
3. Provide a concise, 2-3 sentence answer.

Context: {context}

Question: {question}

Accurate Answer:"""


QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt)

llm_chain = LLMChain(
    llm=llm, 
    prompt=QA_CHAIN_PROMPT, 
    callbacks=None, 
    verbose=True
)


In [33]:
document_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Context:\ncontent:{page_content}\nsource:{source}",
)

combine_documents_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="context",
    document_prompt=document_prompt,
    callbacks=None,
)

In [34]:
from langchain.embeddings import HuggingFaceEmbeddings
embedding_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')




In [35]:
from langchain.vectorstores import Chroma

vector_store = Chroma.from_documents(
    documents=chunks,  # This should be your list of Document objects
    embedding=embedding_model,
    persist_directory='./chromadb_data',
)


In [36]:
qa_chain = RetrievalQA.from_llm(
    llm=llm,
    retriever=vector_store.as_retriever(),  # Use the retriever from the vector store
    verbose=True
)


In [38]:
query = "Can you give me a summary of that story. Please give me some details and it should be a little long"

# Run the query through the chain
result = qa_chain.run(query)

print("Generated Answer:")
print(result)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Generated Answer:
Alice, a young girl, finds herself falling down a rabbit hole into a whimsical and fantastical world.  The story is full of absurd characters like talking animals and nonsensical situations. 

She encounters several peculiar events, including a Caucus-race with talking rabbits, a mad tea party where everyone's behavior is unpredictable, and a croquet game played by the Queen who rules over this strange land. Alice even meets an advice-giving Caterpillar! Throughout her journey, she faces challenges like being chased down a path, witnessing the loss of tarts in the hands of someone unknown,  and ultimately tries to unravel a mystery when a mysterious culprit steals them. 

The story uses symbolism and imagery to explore themes such as childhood imagination, the power of memory, and the absurdity of reality. It's filled with moments that defy logic and question our understanding of time and space. 