<a href="https://colab.research.google.com/github/adas754/generative-AI_class/blob/main/Weaviate_Vector_Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install weaviate-client
!pip install langchain
!pip install openai

https://console.weaviate.cloud/

In [36]:
OPENAI_API_KEY = ""
WEAVIATE_API_KEY = ""
WEAVIATE_CLUSTER = ""

## Data Reading

In [None]:
!pip install unstructured
!pip install "unstructured[pdf]"

In [None]:
!mkdir data

In [51]:
from langchain.document_loaders import DirectoryLoader

loader = DirectoryLoader("./data",glob = "**/*.pdf")
data = loader.load()

In [None]:
data

## Text Splitting

In [53]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
docs = text_splitter.split_documents(data)

In [None]:
len(docs)

## Embedding Convertion

In [55]:
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key= OPENAI_API_KEY)

In [56]:
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7c48b4fb97b0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7c48b4fb8e80>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

## Vector Database Storage

In [66]:
import weaviate
from langchain.vectorstores import Weaviate

#Connect to weaviate Cluster
auth_config = weaviate.auth.AuthApiKey(api_key = WEAVIATE_API_KEY)
WEAVIATE_URL = WEAVIATE_CLUSTER

client = weaviate.Client(
    url = WEAVIATE_URL,
    additional_headers = {"X-OpenAI-Api-key": OPENAI_API_KEY},
    auth_client_secret = auth_config,
    startup_period = 10
)

In [67]:
client.is_ready()

True

In [63]:
# define input structure
client.schema.delete_all()
client.schema.get()
schema = {
    "classes": [
        {
            "class": "Chatbot",
            "description": "Documents for chatbot",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

client.schema.create(schema)
vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])

In [None]:
# load text into the vectorstore
text_meta_pair = [(doc.page_content, doc.metadata) for doc in docs]
texts, meta = list(zip(*text_meta_pair))
vectorstore.add_texts(texts, meta)

## Similarity Measurement

In [None]:
query = "what is a yolo?"

# retrieve text related to the query
docs = vectorstore.similarity_search(query, top_k=7)

In [None]:
docs

## Custom chatbot

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# define chain
chain = load_qa_chain(
    OpenAI(openai_api_key = OPENAI_API_KEY,temperature=0),
    chain_type="stuff")

In [None]:
# create answer
chain.run(input_documents=docs, question=query)