In [2]:
import os
from dotenv import load_dotenv

# Populate the process environment from a .env file before anything reads
# os.environ; the configuration lookups later in this notebook rely on it.
load_dotenv()

True

In [24]:
# Process pdf and create splits to be used for vector db

from langchain_core.documents import Document
from langchain_text_splitters import MarkdownTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader

# Source document for the vector store.
pdf_path = '../data/gfs.pdf'
loader = PyMuPDFLoader(pdf_path)

# Chunking parameters: upper bound on chunk length and how much neighbouring
# chunks overlap, both passed straight to the splitter below.
chunk_size = 2000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Load the PDF, then split the loaded documents into chunks. This is the
# explicit two-step spelling of loader.load_and_split(text_splitter).
splits = text_splitter.split_documents(loader.load())

In [32]:
# Swap each chunk's loader-generated metadata for a trimmed dict: a fixed
# source label plus the page and total_pages values carried over unchanged.
for split in splits:
    original_metadata = split.metadata
    split.metadata = {
        "source": "Google File System",
        "page": original_metadata['page'],
        "total_pages": original_metadata['total_pages'],
    }

In [35]:
# Display the first chunk to spot-check its metadata and text.
splits[0]

Document(metadata={'source': 'Google File System', 'page': 0, 'total_pages': 15}, page_content='The Google File System\nSanjay Ghemawat, Howard Gobioff, and Shun-Tak Leung\nGoogle∗\nABSTRACT\nWe have designed and implemented the Google File Sys-\ntem, a scalable distributed ﬁle system for large distributed\ndata-intensive applications. It provides fault tolerance while\nrunning on inexpensive commodity hardware, and it delivers\nhigh aggregate performance to a large number of clients.\nWhile sharing many of the same goals as previous dis-\ntributed ﬁle systems, our design has been driven by obser-\nvations of our application workloads and technological envi-\nronment, both current and anticipated, that reﬂect a marked\ndeparture from some earlier ﬁle system assumptions. This\nhas led us to reexamine traditional choices and explore rad-\nically diﬀerent design points.\nThe ﬁle system has successfully met our storage needs.\nIt is widely deployed within Google as the storage platform\nfo

In [33]:
# Azure Search vector store settings, read from the environment.
# Every constant in this cell is None when its variable is not set.
VECTOR_STORE_ADDR = os.getenv("VECTOR_STORE_ADDR")
VECTOR_STORE_PASS = os.getenv("VECTOR_STORE_PASS")

# Azure OpenAI settings: endpoint, key, API version, and the model names
# used for chat and for embeddings.
AZURE_AI_ENDPOINT = os.getenv("AZURE_AI_ENDPOINT")
AZURE_AI_KEY = os.getenv("AZURE_AI_KEY")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_VERSION = os.getenv("OPENAI_VERSION")
OPENAI_EMBED_MODEL_NAME = os.getenv("OPENAI_EMBED_MODEL_NAME")

In [38]:
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
)
from langchain_openai import AzureOpenAIEmbeddings

# Embedding client for Azure OpenAI, configured entirely from the
# environment-derived constants defined earlier in the notebook.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=AZURE_AI_ENDPOINT,
    api_key=AZURE_AI_KEY,
    api_version=OPENAI_VERSION,
    # The embedding model name doubles as the Azure deployment name here.
    azure_deployment=OPENAI_EMBED_MODEL_NAME,
)

In [None]:
# Embed a throwaway string once so the vector field's dimensionality follows
# whichever embedding deployment is configured. This issues one embedding
# request, so it is kept out of the list literal and given a name.
EMBEDDING_DIMENSIONS = len(embeddings.embed_query("Text"))

# Field schema for the search index that will hold the PDF chunks.
fields = [
    # Document key.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    # Chunk text, full-text searchable.
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Embedding of the chunk text.
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=EMBEDDING_DIMENSIONS,
        # NOTE(review): must match a vector-search profile defined wherever
        # the index itself is created -- confirm.
        vector_search_profile_name="myHnswProfile",
    ),
    # Metadata stored as a string field.
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for the document source label (the "source" metadata
    # key set on every chunk earlier in the notebook).
    SearchableField(
        name="source",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for the page number. It is an integer, and Azure
    # AI Search only permits `searchable` on string fields, so it is declared
    # as a SimpleField (filterable) rather than a SearchableField.
    SimpleField(
        name="page",
        type=SearchFieldDataType.Int32,
        filterable=True,
    ),
]