In [1]:
import os
import re

from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Let python-dotenv populate the process environment, then look up the
# OpenAI key there (None when the variable is not set).
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Matches the bare token "null" only; the \b anchors leave words that merely
# contain those letters (e.g. "nullify", "annulled") untouched.
_NULL_TOKEN = re.compile(r"\bnull\b")


def sanitize_text(text):
    """Return *text* cleaned of content that should not reach the embedder.

    Three steps are applied in order:

    1. every standalone ``null`` token (JSON-style null) becomes ``None``;
    2. NUL characters (``\\u0000``) are removed;
    3. leading and trailing whitespace is stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Token-level replacement: a plain str.replace("null", ...) would also
    # rewrite the inside of ordinary words such as "nullify".
    sanitized = _NULL_TOKEN.sub("None", text)
    sanitized = sanitized.replace("\u0000", "")  # Remove null characters
    return sanitized.strip()

# Locate the source PDF relative to the current working directory.
document_dir, filename = "./", "youtube_data.pdf"
file_path = os.path.join(document_dir, filename)

# Read the PDF through the loader's own load-and-split step.
loader = PyPDFLoader(file_path)
pages = loader.load_and_split()

# Pass the loaded documents through a character splitter
# (chunks of up to 10000 characters, no overlap between chunks).
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=10000,
)
chunks = text_splitter.split_documents(pages)

# Sanitize chunks: keep only the cleaned text of each split document.
sanitized_chunks = [sanitize_text(chunk.page_content) for chunk in chunks]

# Deduplicate and filter chunks.
# dict.fromkeys() drops exact duplicates while keeping first-seen order.
# A plain set() iterates in hash order, and string hashes are randomized per
# interpreter run, so the "first chunk per link" choice made below would
# otherwise differ from one run to the next.
unique_chunks = list(dict.fromkeys(sanitized_chunks))
# Only chunks carrying both markers are treated as complete records.
filtered_chunks = [
    chunk
    for chunk in unique_chunks
    if "Link:" in chunk and "Description:" in chunk
]

# Consolidate chunks based on unique links: the first chunk seen for a given
# link is kept, later chunks with the same link are dropped.
seen_links = set()
consolidated_chunks = []
for chunk in filtered_chunks:
    # Text following the first "Link:" marker, up to the end of that line.
    # Index [1] is safe because the filter above guarantees the marker exists.
    link = chunk.split("Link:")[1].split("\n")[0].strip()
    if link not in seen_links:
        consolidated_chunks.append(chunk)
        seen_links.add(link)

print(f"Consolidated chunks: {len(consolidated_chunks)}")

# Wrap every consolidated chunk in a Document with empty metadata.
documents = [
    Document(page_content=text, metadata={})
    for text in consolidated_chunks
]

# Embed the documents with OpenAI and store them in a Chroma collection
# that is given "./chroma_db" as its persist directory.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
db = Chroma.from_documents(
    documents,
    embeddings,
    persist_directory="./chroma_db",
)
print("ChromaDB created with clean document embeddings.")

Consolidated chunks: 27


  embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


ChromaDB created with clean document embeddings.
