In [14]:
import weaviate
import json
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Weaviate
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from torch import cuda, bfloat16
from langchain.llms import HuggingFacePipeline
import transformers

from langchain.chains import ChatVectorDBChain,RetrievalQA

In [2]:
client = weaviate.Client(
    url="http://localhost:8080",   
)

In [3]:
loader = PyPDFLoader("../pdf_querying_summarization/temp_file/doc.pdf")

documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

docs = text_splitter.split_documents(documents)

embeddings = HuggingFaceInstructEmbeddings(
            model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"}
        )

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [9]:
db = Weaviate.from_documents(docs, embeddings, weaviate_url="http://localhost:8080", by_text=False)

In [12]:
query = "When were the GRI standards written?"
docs = db.similarity_search(query)

In [13]:
docs

[Document(page_content="3\nA Short Introduction to the GRI Standards\n...\nApply all three Universal \nStandards to your reportingUse the Sector Standards that \napply to your sectorsSelect Topic Standards to report \nspeciﬁc information on your \nmaterial topicsSector Standards Universal Standards\n...Topic Standards\nGRI 201\nGRI 415\nGRI 304\nGRI 403\nGRI 303\nGRI 205\nGRI 305\nGRI 202\nGRI 13\nGRI 16\nGRI 12\nGRI 15\nGRI 18\nGRI 11\nGRI 14\nGRI 17GRI Standards\nRequirements and \nprinciples for using the \nGRI Standards\nDisclosures about the \nreporting organization\nDisclosures and \nguidance about the \norganization's material \ntopics \nGRI 1\nGRI 2\nGRI 3Figure 1. GRI Standards: Universal, Sector and Topic Standards• GRI 2: General Disclosures 2021 ( GRI 2 ) contains \ndisclosures relating to details about an organization’s \nstructure and reporting practices; activities and \nworkers; governance; strategy; policies; practices; and \nstakeholder engagement. These give insight 

In [8]:
docs = db.similarity_search_with_score(query, by_text=False)
docs[0]

(Document(page_content='A Short Introduction to  \nthe GRI Standards\nwww.globalreporting.org www.globalreporting.org', metadata={'_additional': {'vector': [0.053567585, -0.0251535, 0.019101305, 0.0066484925, -0.062171742, -0.04195238, -0.088961646, 0.029081617, -0.009508389, -0.005521557, 0.036933485, 0.038459525, -0.013330513, -0.10212877, -0.023793692, 0.03594412, 0.007122637, -0.039632343, -0.010635164, -0.01424749, 0.0057760137, -0.04446114, -0.041216586, 0.026710706, -0.014804762, -0.054996327, 0.023391552, 0.013136665, -0.036958776, -0.03909738, 0.052267905, -0.011480347, 0.0067614955, -0.046085354, 0.04469198, -0.009438213, -0.023576789, -0.022953432, 0.0030165846, -0.0012048823, -0.06619726, 0.025584906, 0.017831862, -0.0054667275, 0.0027591179, -0.0104497885, -0.0023296648, 0.019857239, -0.019289603, -0.03826433, -0.013847204, -0.053075723, 0.021076266, -0.03891018, 0.017620891, 0.024064133, -0.026787126, -0.0139441285, -0.036185306, 0.05172352, 0.024646377, 0.021141836, 0.00

In [17]:
db._client.schema.get("LangChain_138df2f1432f4c5c825cff1cf8d79ee9")

{'class': 'LangChain_138df2f1432f4c5c825cff1cf8d79ee9',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'text2vec-transformers': {'poolingStrategy': 'masked_mean',
   'vectorizeClassName': True}},
 'multiTenancyConfig': {'enabled': False},
 'properties': [{'dataType': ['text'],
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-transformers': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'text',
   'tokenization': 'word'},
  {'dataType': ['number'],
   'description': "This property was generated by Weaviate's auto-schema feature on Mon Sep 18 13:51:44 2023",
   'indexFilterable': True,
   'indexSearchable': False,
   'moduleConfig': {'text2vec-transformers': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'page'},
  {'dataType': ['text'],
   'description': "This property was generated b

# Vectorstore and LLM test

Creating the vectorstore to query information from the DB using the LLM

In [11]:
vectorstore = Weaviate(client, "LangChain_138df2f1432f4c5c825cff1cf8d79ee9", "text")

Init LLM

In [5]:
model_id = 'meta-llama/Llama-2-70b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

        # set quantization configuration to load large model with less GPU memory
        # this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=bfloat16
        )

        # begin initializing HF items, need auth token for these
model_config = transformers.AutoConfig.from_pretrained(
            model_id
        )

model = transformers.AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            config=model_config,
            quantization_config=bnb_config,
            device_map='auto'
        )
model.eval()

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
generate_text = transformers.pipeline(
            model=model, tokenizer=tokenizer,
            return_full_text=True,  # langchain expects the full text
            task='text-generation',
            # we pass model parameters here too
            #stopping_criteria=stopping_criteria,  # without this model rambles during chat
            temperature=0.01,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
            max_new_tokens=512,  # mex number of tokens to generate in the output
            repetition_penalty=1.1  # without this output begins repeating
        )
llm = HuggingFacePipeline(pipeline=generate_text)

Loading checkpoint shards: 100%|██████████| 15/15 [00:18<00:00,  1.25s/it]
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


Creating the chains: 

In [12]:
qa = ChatVectorDBChain.from_llm(llm, vectorstore)
chat_history = []



In [15]:
QA_document = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever())

In [16]:
QA_document.run("what are the gri standards?")

ValueError: Error during query: [{'locations': [{'column': 6, 'line': 1}], 'message': "explorer: get class: vector search: object vector search at index langchain_138df2f1432f4c5c825cff1cf8d79ee9: shard langchain_138df2f1432f4c5c825cff1cf8d79ee9_jqTX3xTHTaox: vector search: knn search: distance between entrypoint and query node: vector lengths don't match: 768 vs 384", 'path': ['Get', 'LangChain_138df2f1432f4c5c825cff1cf8d79ee9']}]

In [13]:
query = 'What are the GRI standards?'

result = qa({'question':query,'chat_history': chat_history})

ValueError: Error during query: [{'locations': [{'column': 6, 'line': 1}], 'message': "explorer: get class: vector search: object vector search at index langchain_138df2f1432f4c5c825cff1cf8d79ee9: shard langchain_138df2f1432f4c5c825cff1cf8d79ee9_jqTX3xTHTaox: vector search: knn search: distance between entrypoint and query node: vector lengths don't match: 768 vs 384", 'path': ['Get', 'LangChain_138df2f1432f4c5c825cff1cf8d79ee9']}]

# Weaviate database management functions

* Create Schema
* Create Class
* Add data with custom metadata, name, description etc...
* Query data
* Remove data
* Update data

In [None]:
#Schema and class creation: 

class_name = ""
vectorizer = ""
