In [None]:
# !pip install -qU \
#   langchain==0.0.276 \
#   openai==0.27.10 \
#   tiktoken==0.4.0 \
#   sentence-transformers==2.2.2 \
#   spacy==3.6.1 \
#   nltk==3.8.1 \
#   pinecone-client==2.2.2 \
#   pypdf==3.15.4

In [None]:
from langchain.embeddings import OpenAIEmbeddings   
# from langchain.document_loaders import PyPDFLoader
# from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter, TokenTextSplitter, SpacyTextSplitter, SentenceTransformersTokenTextSplitter
# from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory, ConversationSummaryMemory


import pinecone
import time
# import itertools
# import uuid
# from tqdm.autonotebook import tqdm

from config import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_INDEX_NAME, EMBEDDING_MODEL, SPLITTER_CHUNK_SIZE, SPLITTER_CHUNK_OVERLAP

In [85]:
def print_result(result, max_sources=3):
    """Pretty-print a conversational-retrieval result to stdout.

    Prints the question, the answer and up to ``max_sources`` retrieved
    source documents, each framed by ``=`` rules.

    Args:
        result: Mapping with the keys ``"question"`` and ``"answer"`` and,
            optionally, ``"source_documents"`` (a sequence of objects that
            expose ``.metadata`` and ``.page_content``).
        max_sources: Maximum number of source documents to print.
            Defaults to 3; values <= 0 print no sources.

    Raises:
        KeyError: If ``"question"`` or ``"answer"`` is missing from ``result``.
    """
    rule = "=" * 30

    def _print_section(title, body):
        # Banner: rule / indented title / rule / body / rule / blank line.
        print(rule)
        print(" " * 10 + title)
        print(rule)
        print(body)
        print(rule)
        print()

    _print_section("Question", result["question"])
    _print_section("Answer", result["answer"])

    # The key is absent when the chain is built without
    # return_source_documents=True; treat that as "no sources".
    sources = result.get("source_documents") or []

    for rank, doc in enumerate(sources[:max(max_sources, 0)], start=1):
        metadata = doc.metadata or {}
        source = metadata.get("source", "unknown")
        page = metadata.get("page")
        try:
            # Normalise numeric page values such as 6.0 to 6 for display.
            page_label = int(page)
        except (TypeError, ValueError):
            # No page recorded, or a non-numeric page label.
            page_label = "n/a" if page is None else page

        print(rule)
        print(f"Source [{rank}] \t File: [{source}] \t Page: [{page_label}]")
        print(rule)
        print(doc.page_content)
        print(rule)
        print()
    

OpenAI Init

In [86]:
# Chat model; it is handed to both the summary memory and the QA chain below.
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY)

# Embedding client for the vector store; the model name comes from config.
embedding_model = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=OPENAI_API_KEY)




Pinecone Init

In [87]:
# Connect the Pinecone client using the credentials from config.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)

if PINECONE_INDEX_NAME not in pinecone.list_indexes():
    # Create the index on first run only.
    pinecone.create_index(
        name=PINECONE_INDEX_NAME,
        metric='cosine',
        # 1536 is the output size of text-embedding-ada-002.
        # NOTE(review): must match the model named by EMBEDDING_MODEL; confirm in config.
        dimension=1536
    )
    # Index creation is asynchronous and can take longer than a fixed
    # one-second sleep, so poll until Pinecone reports the index as ready.
    while not pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)

pinecone_index = pinecone.Index(PINECONE_INDEX_NAME)
# Last expression of the cell: shows the index statistics as the cell output.
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.04929,
 'namespaces': {'': {'vector_count': 4929}},
 'total_vector_count': 4929}

[API docs: `Pinecone.as_retriever`](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.pinecone.Pinecone.html#langchain.vectorstores.pinecone.Pinecone.as_retriever)

[What is MMR (Maximal Marginal Relevance)?](https://medium.com/tech-that-works/maximal-marginal-relevance-to-rerank-results-in-unsupervised-keyphrase-extraction-22d95015c7c5)

In [88]:
# Wrap the existing Pinecone index as a LangChain vector store
# ("text" is passed as the store's text key).
vectorstore = Pinecone(pinecone_index, embedding_model, "text")

# MMR retriever: return 5 chunks per query; lambda_mult=0.5 is the chosen
# balance between diversity and accuracy in the result set.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5, "lambda_mult": 0.5},
    search_type="mmr",
)

# Summarising chat memory. input_key/output_key name the chain fields to
# record, since the chain below also returns its source documents.
# Alternative kept from earlier experiments:
#   ConversationBufferWindowMemory(k=12, memory_key="chat_history", return_messages=True)
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    input_key="question",
    output_key="answer",
    return_messages=True,
)
memory.clear()

# Conversational QA chain: retrieved chunks are "stuffed" into one prompt,
# and the source documents are returned alongside the answer.
conversation_qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type="stuff",
    verbose=False,
    return_source_documents=True,
)

In [77]:
# Ask one question through the conversational retrieval chain; the chain is
# called with a dict carrying the "question" key.
query = "Why bolt looseness is harmful?"
res = conversation_qa_chain({"question": query})

# Print the question, the answer and up to three source chunks.
print_result(res)




          Question
Why bolt looseness is harmful?

          Answer
Bolt looseness can be harmful because it can lead to failures in bolted joints, especially in critical structures like hydropower plants. When bolts become loose, the integrity and stability of the joint are compromised, which can result in catastrophic disasters such as flooding and power shortages. Loose bolts can also affect the performance and efficiency of machinery and equipment. Therefore, it is important to monitor and detect bolt looseness to prevent potential hazards and ensure the safety and reliability of structures and systems.

Source [1] 	 File: [data/mypaper.pdf] 	 Page: [6]
. The most direct method of monitoring bolt looseness is to
measure the axial force of the bolt using strain gauges; however, the complication of installation and
wiring introduces huge risks

Source [2] 	 File: [data/mypaper.pdf] 	 Page: [2]
 are widely used in hydropower plants. A failure of the bolted joint, especially the
ones o

In [89]:
# Query the vector store directly with an MMR search (no QA chain involved):
# fetch 20 candidates, keep 4, with lambda_mult=0.5.
# NOTE(review): the recorded output of this cell is a pydantic ValidationError
# ("page_content: str type expected"). Presumably at least one matched vector
# stores a non-string value under the "text" metadata key; verify the index
# contents before relying on this cell.
query = "Who is Bill Gates?"
res = vectorstore.max_marginal_relevance_search(
    query=query,
    k=4,
    fetch_k=20,
    lambda_mult=0.5
)

print(res)

ValidationError: 1 validation error for Document
page_content
  str type expected (type=type_error.str)