In [47]:
import ssl

from langchain.document_loaders import DirectoryLoader

# Load every PDF found under the current directory (recursively) into `data`.
#
# NOTE(review): the unverified-HTTPS override is presumably a workaround for
# certificate errors on downloads triggered while loading -- confirm whether it
# is still needed. It turns off certificate checking for stdlib HTTPS clients
# (see PEP 476), so it is applied only for the duration of the load and the
# original context factory is restored afterwards instead of staying disabled
# for the rest of the kernel session.
_default_https_context = ssl._create_default_https_context
ssl._create_default_https_context = ssl._create_unverified_context
try:
    loader = DirectoryLoader('./', glob="**/*.pdf")
    data = loader.load()
finally:
    # Restore verification even if loading fails.
    ssl._create_default_https_context = _default_https_context

In [48]:
# Load environment variables via python-dotenv (presumably from a local .env
# file -- confirm its location). Later cells read OPENAI_API_KEY and
# WEAVIATE_API_KEY from os.environ, so this must run before them.
from dotenv import load_dotenv
# Left as the final expression so the cell displays load_dotenv()'s return value.
load_dotenv()

True

In [49]:
# Sanity check on the load step: number of documents returned by the loader,
# followed by the character length of the first document's text.
doc_count = len(data)
print(f'You have {doc_count} documents in your data')

# Only the first document is measured; indexing raises IndexError if `data` is empty.
first_doc_chars = len(data[0].page_content)
print(f'There are {first_doc_chars} characters in your document')

You have 1 documents in your data
There are 16527 characters in your document


In [50]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into smaller, overlapping chunks. `docs` holds the
# resulting chunk documents that are uploaded to the vector store later on.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=512,
)
docs = text_splitter.split_documents(data)

In [51]:
import os

from langchain.embeddings.openai import OpenAIEmbeddings

# Client-side OpenAI embedding model; the key comes from the environment and a
# missing OPENAI_API_KEY raises KeyError here.
# NOTE(review): `embeddings` is not referenced by any other cell visible in this
# notebook -- confirm it is still needed.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [52]:
import weaviate

# Connect to the hosted Weaviate instance using API-key authentication.
# Relies on `os` having been imported by an earlier cell.
auth_config = weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"])

# The OpenAI key is forwarded as a request header alongside the Weaviate
# credentials (presumably for the server-side OpenAI vectorizer -- confirm).
client = weaviate.Client(
    url="https://rag-l4x00jnr.weaviate.network",
    auth_client_secret=auth_config,
    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
    startup_period=10,
)

In [53]:
# Define the input structure: recreate the "Chatbot" class in Weaviate and wrap
# it in a LangChain vector store.
#
# Imported here so the cell also works on a fresh kernel (Restart & Run All);
# `Weaviate` is used below to build `vectorstore`.
from langchain.vectorstores import Weaviate

# WARNING: delete_all() removes every class in the connected Weaviate instance,
# not only "Chatbot". Do not run this against an instance that holds other data.
client.schema.delete_all()

schema = {
    "classes": [
        {
            "class": "Chatbot",
            "description": "Documents for chatbot",
            # Vectorization is delegated to Weaviate's text2vec-openai module.
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            # Vectorize the property value, but not its name.
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

client.schema.create(schema)

# "content" is the text property; "source" is requested as an extra attribute.
# NOTE(review): "source" is not declared in the schema above -- presumably it is
# created automatically when objects carrying that key are inserted; confirm.
vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])



In [58]:
# Upload every chunk together with its metadata; `c` keeps the value returned
# by add_texts so the next cell can report how many entries were written.
#
# The texts and metadata are built as two parallel lists rather than by
# unpacking zip(*pairs): unpacking raises ValueError when `docs` is empty, and
# it yields tuples instead of lists.
# NOTE(review): `docs` must still be the chunk list from the splitter cell; the
# similarity-search cell rebinds the same name.
text_meta_pair = [(doc.page_content, doc.metadata) for doc in docs]
texts = [text for text, _metadata in text_meta_pair]
meta = [metadata for _text, metadata in text_meta_pair]
c = vectorstore.add_texts(texts, meta)

In [59]:
# Size of the value returned by vectorstore.add_texts (presumably one entry per
# uploaded chunk -- confirm).
len(c)

41

In [37]:
# Ask the vector store for the four entries most similar to the question.
# NOTE(review): this rebinds `docs`, which earlier held the chunk list produced
# by the text splitter; re-running the upload cell after this one would send
# the search results instead. The cell's execution count is also older than the
# cells above it, so re-run it on a fresh kernel before trusting its output.
query = "what is this weeks main challenge?"

docs = vectorstore.similarity_search(
    query,
    k=4,
)