In [3]:
#!pip install tiktoken==0.6.0 pypdf==4.0.1 langchain==0.1.1 langchain-community==0.0.13 chromadb==0.4.22 sentence-transformers==2.3.1


^C


In [1]:
import chromadb
persistance_directory = 'vectorstore'
chroma_client = chromadb.PersistentClient(persistance_directory)


In [2]:
chroma_client.list_collections()

[]

In [4]:
if "my_collection" in [col.name for col in chroma_client.list_collections()]:
    chroma_client.delete_collection(name="my_collection")  # Deletes the entire collection

# Recreate the collection
collection = chroma_client.get_or_create_collection(name="my_collection")



In [5]:
import os
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter  # Importing text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing a better text splitter!

# Define the path to your directory containing the text files
directory_path = Path("document-corpus")

# Initialize lists for documents, IDs, and metadata
documents = []
ids = []
metadatas = []

In [7]:
import re

def filter_text(text):
    """
    Replaces multiple new lines with a single space and strips extra whitespace.
    
    Parameters:
    - text (str): The text to filter.
    
    Returns:
    - str: The filtered text.
    """
    text = re.sub(r'\n+', ' ', text)  # Replace multiple new lines with a single space
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text.strip()  # Strip leading and trailing whitespace


In [8]:
from PyPDF2 import PdfReader

# Initialize the text splitter with the desired chunk size and overlap
# Splitting Logic: The CharacterTextSplitter splits text based on a fixed character length. It does this without considering the semantic structure of the text.
# text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100, separator='')

# Splitting Logic: The RecursiveCharacterTextSplitter attempts to split text into chunks of a given length, but with a focus on preserving the semantic integrity of the text.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name='cl100k_base',
    chunk_size=512,
    chunk_overlap=32
)

# Loop through each PDF file in the directory
for file in directory_path.glob("*.pdf"):  # Only select .pdf files
    try:
        print(f"Processing file: {file}")  # Debugging line to track files
        with open(file, "rb") as f:
            # Read the PDF content
            pdf_reader = PdfReader(f)
            content = ""
            for page in pdf_reader.pages:
                content += page.extract_text() or ""
            content = filter_text(content)  # Stripping extra whitespace/newlines
            
            if content:  # Only process non-empty files
                # Use the text splitter to split the document into chunks
                chunks = text_splitter.split_text(content)
                print("chunks", len(chunks), " file name", file)
                # Append each chunk as a document
                for i, chunk in enumerate(chunks):
                    documents.append(chunk)
                    # Create unique IDs for each chunk (using file stem and chunk index)
                    ids.append(f"{file.stem}_chunk_{i}")
                    print(f"Added document chunk with ID: {file.stem}_chunk_{i}")
                    
                    # Create metadata for the document chunk
                    metadata = {
                        "filename": file.name,
                        "size": len(chunk),  # Size of the chunk in characters
                        "path": str(file),
                        "chunk_index": i  # Index of the chunk
                    }
                    metadatas.append(metadata)
            else:
                print(f"Skipped empty file: {file}")
    except Exception as e:
        print(f"Error reading {file}: {e}")
        continue

Processing file: document-corpus\Additive_Manufacturing_of_Bio_and_Synthetic_Polymers.pdf
chunks 579  file name document-corpus\Additive_Manufacturing_of_Bio_and_Synthetic_Polymers.pdf
Added document chunk with ID: Additive_Manufacturing_of_Bio_and_Synthetic_Polymers_chunk_0
Added document chunk with ID: Additive_Manufacturing_of_Bio_and_Synthetic_Polymers_chunk_1
Added document chunk with ID: Additive_Manufacturing_of_Bio_and_Synthetic_Polymers_chunk_2
Added document chunk with ID: Additive_Manufacturing_of_Bio_and_Synthetic_Polymers_chunk_3
Added document chunk with ID: Additive_Manufacturing_of_Bio_and_Synthetic_Polymers_chunk_4
Added document chunk with ID: Additive_Manufacturing_of_Bio_and_Synthetic_Polymers_chunk_5
Added document chunk with ID: Additive_Manufacturing_of_Bio_and_Synthetic_Polymers_chunk_6
Added document chunk with ID: Additive_Manufacturing_of_Bio_and_Synthetic_Polymers_chunk_7
Added document chunk with ID: Additive_Manufacturing_of_Bio_and_Synthetic_Polymers_chun

unknown widths : 
[0, IndirectObject(5842, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5846, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5850, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5854, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5859, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5863, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5871, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5875, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5854, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5880, 0, 1711151014352)]
unknown widths : 
[0, IndirectObject(5884, 0, 1711151014352)]


chunks 839  file name document-corpus\Advances_in_Sustainable_Concrete_System.pdf
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_0
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_1
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_2
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_3
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_4
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_5
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_6
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_7
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_8
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_9
Added document chunk with ID: Advances_in_Sustainable_Concrete_System_chunk_10
Added document chunk with ID: Advances_in_Sustainable_Concr

Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x7c3 for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x7c5 for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x7c5 for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x7c5 for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x7c5 for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x7c3 for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitions in dictionary at byte 0x7c3 for key /Subtype
Multiple definitions in dictionary at byte 0x7a for key /Subtype
Multiple definitio

chunks 1310  file name document-corpus\Development_of_BioBased_Materials_Synthesis_Characterization_and_Applications.pdf
Added document chunk with ID: Development_of_BioBased_Materials_Synthesis_Characterization_and_Applications_chunk_0
Added document chunk with ID: Development_of_BioBased_Materials_Synthesis_Characterization_and_Applications_chunk_1
Added document chunk with ID: Development_of_BioBased_Materials_Synthesis_Characterization_and_Applications_chunk_2
Added document chunk with ID: Development_of_BioBased_Materials_Synthesis_Characterization_and_Applications_chunk_3
Added document chunk with ID: Development_of_BioBased_Materials_Synthesis_Characterization_and_Applications_chunk_4
Added document chunk with ID: Development_of_BioBased_Materials_Synthesis_Characterization_and_Applications_chunk_5
Added document chunk with ID: Development_of_BioBased_Materials_Synthesis_Characterization_and_Applications_chunk_6
Added document chunk with ID: Development_of_BioBased_Materials_Syn

In [9]:
len(documents)

32313

In [10]:
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings)

# Initialize embedding model
#  Ideal when you want flexibility with different models, compatibility with Hugging Face ecosystems, or are working on applications where model speed and memory efficiency are crucial.
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") 

# When accuracy and precision are more important, and you have more computational resources available, gte-large provides richer, more nuanced embeddings at the cost of being slower and more resource-intensive.
embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')


# Create the initial vector store with an empty collection
vectorstore = Chroma(
    collection_name="my_collection",
    embedding_function=embedding_model,
    persist_directory=persistance_directory  # Adjust if using persistent storage
)

# Define a function to add in batches
def add_documents_in_batches(vectorstore, documents, ids, metadatas, batch_size=10000):
    total_docs = len(documents)
    print(f"total_docs: {total_docs}")
    batch_counter = 1
    for start_idx in range(0, total_docs, batch_size):
        end_idx = min(start_idx + batch_size, total_docs)
        batch_documents = documents[start_idx:end_idx]
        batch_ids = ids[start_idx:end_idx]
        batch_metadatas = metadatas[start_idx:end_idx]
        print(f"start_idx: {start_idx}, end_idx: {end_idx}")

        # Add documents to the ChromaDB collection using the embeddings
        vectorstore.add_texts(
            texts=batch_documents,
            metadatas=batch_metadatas,
            ids=batch_ids
        )
        
        print(f"Added batch #{batch_counter} from {start_idx} to {end_idx}.")
        batch_counter += 1

# Check if any documents were collected before adding to the collection
if len(documents) == len(ids) and documents:
    # Add documents in batches
    add_documents_in_batches(vectorstore, documents, ids, metadatas)
    print(f"Added {len(documents)} document chunks to the collection with metadata.")
    vectorstore.persist()  # Persist the collection to disk if using a persistent directory
else:
    print("Mismatch in documents and ids or no documents were collected.")


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange
  vectorstore = Chroma(


total_docs: 32313
start_idx: 0, end_idx: 10000
Added batch #1 from 0 to 10000.
start_idx: 10000, end_idx: 20000
Added batch #2 from 10000 to 20000.
start_idx: 20000, end_idx: 30000
Added batch #3 from 20000 to 30000.
start_idx: 30000, end_idx: 32313
Added batch #4 from 30000 to 32313.
Added 32313 document chunks to the collection with metadata.


  vectorstore.persist()  # Persist the collection to disk if using a persistent directory


In [11]:
vectorstore.persist()

In [12]:
# all_docs = collection.get()
# all_docs

In [13]:
# results = collection.query(
#     query_texts=["Biosynthesis"],
#     n_results = 2
# )
# results

In [13]:
import sqlite3
import os

# Create a new SQLite database file
db_path = "chroma.sqlite3"
conn = sqlite3.connect(db_path)

# Create a table named 'temp' and insert 5 numbers into it
conn.execute("CREATE TABLE IF NOT EXISTS temp (id INTEGER PRIMARY KEY, number INTEGER);")

# Insert 5 numbers into the 'temp' table
numbers_to_insert = [1, 2, 3, 4, 5]
conn.executemany("INSERT INTO temp (number) VALUES (?);", [(num,) for num in numbers_to_insert])

# Commit changes and close the connection
conn.commit()
conn.close()

# Manually expand the file to 900 MB
initial_size_mb = 900
with open(db_path, "ab") as f:
    f.truncate(initial_size_mb * 1024 * 1024)  # Set the file size to 900 MB

# re-initializing
conn = sqlite3.connect(db_path)
# Create a table named 'temp' and insert 5 numbers into it
conn.execute("DROP TABLE IF EXISTS temp;")

# Commit changes and close the connection
conn.commit()
conn.close()
