In [44]:

from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
from unstructured.partition.auto import partition
import os
import pandas as pd
import gradio as gr
# 1. Document Loading and Page Tracking
# Every regular file in the folder is partitioned into elements; each element
# becomes one record: {"source": file name, "content": text, "page": page or 'N/A'}.
docs = []
folder_path = r'C:\Users\admin\Documents\LLM\B1-B data'
doc_folder = folder_path
# os.listdir returns entries in arbitrary order; sort so that the order of
# `docs` (and therefore of the chunks built from it) is the same on every run.
for filename in sorted(os.listdir(doc_folder)):
    filepath = os.path.join(doc_folder, filename)
    if not os.path.isfile(filepath):
        continue  # skip sub-directories and other non-file entries
    elements = partition(filename=filepath)
    for element in elements:
        text = str(element)
        # Fall back to 'N/A' when the element carries no page number.
        page_number = element.metadata.page_number or 'N/A'
        docs.append({"source": filename, "content": text, "page": page_number})

# 2. Chunking while Preserving Page Information
# Each record's text is split into pieces of at most MAX_CHUNK_CHARS characters,
# then consecutive pieces are packed together until the next piece would push
# the running chunk past the limit.
MAX_CHUNK_CHARS = 4096
text_splitter = RecursiveCharacterTextSplitter(chunk_size=MAX_CHUNK_CHARS, chunk_overlap=300)
all_splits = []
current_chunk = ""
current_metadata = {}

for doc in docs:
    # Never let one chunk straddle two source files: flush what has been
    # accumulated as soon as the source changes, so that the "source" stored
    # in the metadata is true for the whole chunk.
    if current_chunk and current_metadata.get("source") != doc['source']:
        all_splits.append(Document(page_content=current_chunk, metadata=current_metadata))
        current_chunk = ""

    splits = text_splitter.split_text(doc['content'])
    for split in splits:
        # Flush only when there is something to flush; this avoids emitting an
        # empty Document when a single piece is larger than the limit.
        if current_chunk and len(current_chunk) + len(split) > MAX_CHUNK_CHARS:
            all_splits.append(Document(page_content=current_chunk, metadata=current_metadata))
            current_chunk = ""
        current_chunk += split + " "
        # The metadata follows the most recent piece, so "page" is the page of
        # the last piece packed into the chunk.
        current_metadata = {"source": doc['source'], "page": doc['page']}

# Append the last, partially filled chunk.
if current_chunk:
    all_splits.append(Document(page_content=current_chunk, metadata=current_metadata))

# 3. Vectorstore and LLM Setup - Load LLM and Vectorstore only once
# Embedding model used to vectorise every chunk.
model = OllamaEmbeddings(model="nomic-embed-text")

# Index all chunks in the "docs_collection" collection (no persist_directory
# is passed here).
vectorstore = Chroma.from_documents(
    embedding=model,
    documents=all_splits,
    collection_name="docs_collection",
)

# Chat model, instantiated a single time for the whole notebook.
llm = ChatOllama(model="llama3.1:8b")

In [1]:
# If docs.pkl already exists, run this cell (this is the cell to run in the demo).
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
from unstructured.partition.auto import partition
import pandas as pd
import gradio as gr
import pickle
# Read the previously saved `docs` records back from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open a docs.pkl that you created yourself or otherwise trust.
docs_pickle_path = r'C:\Users\Ian\Documents\RAG\B-1B data processed\docs.pkl'
with open(docs_pickle_path, mode='rb') as pickle_file:
    docs = pickle.load(pickle_file)

# Chunking while Preserving Page Information
# Each record's text is split into pieces of at most MAX_CHUNK_CHARS characters,
# then consecutive pieces are packed together until the next piece would push
# the running chunk past the limit.
MAX_CHUNK_CHARS = 4096
text_splitter = RecursiveCharacterTextSplitter(chunk_size=MAX_CHUNK_CHARS, chunk_overlap=300)
all_splits = []
current_chunk = ""
current_metadata = {}

for doc in docs:
    # Never let one chunk straddle two source files: flush what has been
    # accumulated as soon as the source changes, so that the "source" stored
    # in the metadata is true for the whole chunk.
    if current_chunk and current_metadata.get("source") != doc['source']:
        all_splits.append(Document(page_content=current_chunk, metadata=current_metadata))
        current_chunk = ""

    splits = text_splitter.split_text(doc['content'])
    for split in splits:
        # Flush only when there is something to flush; this avoids emitting an
        # empty Document when a single piece is larger than the limit.
        if current_chunk and len(current_chunk) + len(split) > MAX_CHUNK_CHARS:
            all_splits.append(Document(page_content=current_chunk, metadata=current_metadata))
            current_chunk = ""
        current_chunk += split + " "
        # The metadata follows the most recent piece, so "page" is the page of
        # the last piece packed into the chunk.
        current_metadata = {"source": doc['source'], "page": doc['page']}

# Append the last chunk
if current_chunk:
    all_splits.append(Document(page_content=current_chunk, metadata=current_metadata))

# Vectorstore and Retriever Setup
# One embedding client is enough; both names are kept because other cells
# refer to `model` and this cell refers to `embedding_function`.
model = OllamaEmbeddings(model="nomic-embed-text")
embedding_function = model

vector_db_dir = r'C:\Users\Ian\Documents\RAG\B-1B data processed\vectorDB_B-1B'

# Open the persisted "B-1B" collection first. Calling Chroma.from_documents on
# every run would re-embed all chunks and add them again under fresh ids,
# growing the persisted collection with duplicates each time the cell runs.
vectorstore = Chroma(
    collection_name="B-1B",
    embedding_function=embedding_function,
    persist_directory=vector_db_dir,
)
if vectorstore._collection.count() == 0:
    # First run (or the directory was deleted): build the index from the chunks.
    vectorstore = Chroma.from_documents(
        all_splits,
        embedding_function,
        collection_name="B-1B",
        persist_directory=vector_db_dir,
    )
# NOTE: if docs.pkl or the chunking parameters change, delete `vector_db_dir`
# so the collection is rebuilt from the new chunks.

llm = ChatOllama(model="llama3.1:8b")  # Or your preferred LLM

  from .autonotebook import tqdm as notebook_tqdm


Loaded 28689 documents.


In [4]:
# Embed the query and a reference passage by storing each one in its own
# single-document Chroma collection; the vectors are read back in the next cell.
# A fixed id is passed for each document so that re-running this cell
# overwrites the existing entry instead of adding a second copy to the same
# named collection (downstream code expects exactly one vector from each).
query = Document(page_content="Humpty Dumpty had a great what?")
query_embedded = Chroma.from_documents(
    documents=[query],
    embedding=model,
    ids=["query"],
    collection_name="query_collection",
)
humpdump = Document(page_content='''                  
The story goes: 
Humpty Dumpty sat on a wall,
Humpty Dumpty had a great fall;
All the king's horses and all the king's men
Couldn't put Humpty together again.                         
''')
humpdump_embedded = Chroma.from_documents(
    documents=[humpdump],
    embedding=model,
    ids=["humpdump"],
    collection_name="humpdumpstory_collection",
)

In [5]:
#TODO: embed query AND documents
def _stored_embeddings(store):
    """Return the 'embeddings' entry fetched from the store's underlying collection."""
    fetched = store._collection.get(include = ['embeddings'])
    return fetched['embeddings']


# Pull the stored vectors out of each of the three vector stores.
doc_embeddings = _stored_embeddings(vectorstore)
query_embeddings = _stored_embeddings(query_embedded)
humpdump_embeddings = _stored_embeddings(humpdump_embedded)

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
import umap.umap_ as umap

# Inputs come from the previous cell: query_embeddings, humpdump_embeddings and
# doc_embeddings, each a sequence of embedding vectors.
query_rows = np.asarray(query_embeddings)
humpdump_rows = np.asarray(humpdump_embeddings)
doc_rows = np.asarray(doc_embeddings)

# Stack everything into one matrix (query rows, then Humpty Dumpty rows, then
# document rows) and L2-normalise each row.
all_embeddings = np.vstack([query_rows, humpdump_rows, doc_rows])
all_embeddings = normalize(all_embeddings, axis=1)

# Row boundaries of each group, derived from the actual sizes instead of
# assuming exactly one query vector and one Humpty Dumpty vector.
query_end = query_rows.shape[0]
humpdump_end = query_end + humpdump_rows.shape[0]

# Apply UMAP for dimensionality reduction (seeded so the layout is repeatable)
umap_reducer = umap.UMAP(random_state=42)
embeddings_2d = umap_reducer.fit_transform(all_embeddings)

# Plot the UMAP result
plt.figure(figsize=(10, 8))

# Plot document embeddings (blue)
plt.scatter(embeddings_2d[humpdump_end:, 0], embeddings_2d[humpdump_end:, 1], c='blue', label='Documents', alpha=0.7)

# Plot query embedding (red)
plt.scatter(embeddings_2d[:query_end, 0], embeddings_2d[:query_end, 1], c='red', label='Query', s=100)

# Plot Humpty Dumpty embedding (green)
plt.scatter(embeddings_2d[query_end:humpdump_end, 0], embeddings_2d[query_end:humpdump_end, 1], c='green', label='Humpty Dumpty', marker='o', s=100)

plt.xlabel("UMAP dimension 1")
plt.ylabel("UMAP dimension 2")
plt.legend()
plt.title("UMAP Visualization of Query, Humpty Dumpty, and Document Embeddings")
plt.show()