In [2]:
import json
import os
from pathlib import Path
from pprint import pprint

from dotenv import load_dotenv, find_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import JSONLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores import Qdrant

In [6]:
# Load environment variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

True

In [9]:
# Working directory of the running kernel, and the directory one level above it
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Location of the source files, relative to the parent directory
folder_name = "media/files"

# Full path of the media folder inside the parent directory
media_path = os.path.join(parent_dir, folder_name)

In [10]:
# Data loader

file_path = os.path.join(media_path, "20201102_IPA_QA_en-US.json")

# Raw JSON, kept under its own name for ad-hoc inspection (e.g. pprint(raw_json)).
# It used to be bound to `data` and was immediately overwritten by the loader
# result below, so one name referred to two different kinds of object.
# JSON is read explicitly as UTF-8 instead of the platform default encoding.
raw_json = json.loads(Path(file_path).read_text(encoding="utf-8"))

# The jq expression selects every `context` entry under `.data[].paragraph[]`.
loader = JSONLoader(
    file_path=file_path,
    jq_schema='.data[].paragraph[].context')

# `data` holds the loaded documents consumed by the cells below.
data = loader.load()

In [11]:
# Split the loaded documents into smaller chunks. This step could be skipped,
# since the JSON loader already produces multiple documents.
# The chunk size should be a hyper-parameter the user can tune depending on
# the documents being ingested.
splitter_options = dict(
    chunk_size=250,        # deliberately small, just for demonstration
    chunk_overlap=30,
    length_function=len,
    add_start_index=True,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

texts = text_splitter.split_documents(data)

In [12]:
# Number of documents returned by loader.load().
len(data)

1617

In [13]:
# Number of chunks produced by the text splitter.
len(texts)

1785

In [14]:
# Embedding model used to vectorise every chunk before it is uploaded.
embeddings_generator = OpenAIEmbeddings()
#db_index = Qdrant.from_documents(texts,embeddings_generator, collection_name = "agent_name-agent_id")

# Connection settings come from the environment.
# NOTE(review): the variable names are spelled QUADRANT_* (not QDRANT_*);
# they must match whatever the environment / .env file actually defines.
url = os.environ.get("QUADRANT_URL")
api_key = os.environ.get("QUADRANT_API_KEY")

# Fail fast: os.environ.get() returns None for a missing variable, and that
# None would otherwise be handed to the Qdrant client without any warning.
if not url:
    raise RuntimeError(
        "Environment variable QUADRANT_URL is not set; "
        "cannot connect to the Qdrant instance."
    )

# Embed all chunks and store them in the "my_documents" collection.
qdrant_index = Qdrant.from_documents(
    documents=texts,
    embedding=embeddings_generator,
    url=url,
    prefer_grpc=True,
    api_key=api_key,
    collection_name="my_documents",
)

In [15]:
# Display the vector store object returned by Qdrant.from_documents.
qdrant_index

<langchain.vectorstores.qdrant.Qdrant at 0x119cd9550>

In [None]:
## Everything above this point ingests data into the vector store; everything below is for the chat functionality.

In [16]:
# Retriever-based lookup (the newer LangChain convention).
# This might not work with Pinecone, since the retriever object is a recent
# addition to LangChain.
# NOTE(review): "score" does not look like a standard search kwarg — confirm
# it has any effect for this vector store.
retriever = qdrant_index.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": .2, "k": 3, "score": True},
)
docs = retriever.get_relevant_documents(
    "What has four wheels and flies into space"  # query
)

In [17]:
# Display the documents returned by the retriever query.
docs

[Document(lc_kwargs={'page_content': 'Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', 'metadata': {'seq_num': 673, 'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files/20201102_IPA_QA_en-US.json', 'start_index': 0}}, page_content='Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', metadata={'seq_num': 673, 'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files/20201102_IPA_QA_en-US.json', 'start_index': 0}),
 Document(lc_kwargs={'page_content': 'Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', 'metadata': {'seq_num': 675, 'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files

In [18]:
# Older, but probably more widely used, convention: query the vector store directly.
# Use this when working with Pinecone.

def get_similar_docs(query, k = 2, score = False, vectorstore = None):
    """Return the ``k`` stored documents most similar to ``query``.

    Args:
        query: Text to search for.
        k: Number of results to request from the vector store.
        score: When true, use ``similarity_search_with_score`` so the result
            also carries the scores; otherwise use ``similarity_search``.
        vectorstore: Store to search. Any object exposing
            ``similarity_search`` / ``similarity_search_with_score`` works.
            Defaults to the notebook-level ``qdrant_index``.

    Returns:
        Whatever the selected search method of the vector store returns.
    """
    # Resolve the default lazily so an explicitly passed store never requires
    # the global index to exist.
    store = qdrant_index if vectorstore is None else vectorstore
    if score:
        return store.similarity_search_with_score(query, k=k)
    return store.similarity_search(query, k=k)

# Fetch the three closest chunks, together with their scores, for the joke text.
query = "Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto"
similar_docs = get_similar_docs(query, k=3, score=True)
similar_docs

[(Document(lc_kwargs={'page_content': 'Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', 'metadata': {'seq_num': 673, 'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files/20201102_IPA_QA_en-US.json', 'start_index': 0}}, page_content='Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', metadata={'seq_num': 673, 'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files/20201102_IPA_QA_en-US.json', 'start_index': 0}),
  0.9871869683265686),
 (Document(lc_kwargs={'page_content': 'Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', 'metadata': {'seq_num': 675, 'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-ll

In [148]:
#model_name = "gpt-3.5-turbo"
# Temperature 0 is passed to the OpenAI LLM wrapper used for answering.
chat_llm = OpenAI(temperature=0)

# "stuff" question-answering chain built on top of that LLM.
chain = load_qa_chain(llm=chat_llm, chain_type="stuff")

def get_answer(query, k = 3):
    """Answer ``query`` using the ``k`` most similar stored documents as context.

    Args:
        query: The user's question; it is used both for the similarity
            lookup and as the question passed to the QA chain.
        k: Number of documents to fetch as context (default 3, the value
            that was previously hard-coded).

    Returns:
        The result of ``chain.run`` for the retrieved documents and the question.
    """
    # Scores are not requested: the chain receives the search result as-is.
    context_docs = get_similar_docs(query, k, False)
    return chain.run(input_documents=context_docs, question=query)

In [149]:
# Ask the QA chain a question; the returned answer is displayed as the cell output.
query = "What has four wheels and flies into space? An astronauto?"
get_answer(query)

" Yes, that's correct. An astronauto has four wheels and flies into space."

In [146]:
# Inspect the second document in `docs` (the retriever result).
docs[1]

Document(lc_kwargs={'page_content': 'Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', 'metadata': {'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files/20201102_IPA_QA_en-US.json', 'seq_num': 676, 'start_index': 0}}, page_content='Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', metadata={'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files/20201102_IPA_QA_en-US.json', 'seq_num': 676, 'start_index': 0})

In [147]:
# Inspect the second entry of `similar_docs` (a (document, score) pair when
# the list was produced with score=True).
similar_docs[1]

(Document(lc_kwargs={'page_content': 'Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', 'metadata': {'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files/20201102_IPA_QA_en-US.json', 'seq_num': 675, 'start_index': 0}}, page_content='Maybe a joke can lighten your mood. Have you heard this one before? What has four wheels and flies into space? An astronauto!', metadata={'source': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files/20201102_IPA_QA_en-US.json', 'seq_num': 675, 'start_index': 0}),
 0.14820362627506256)

In [88]:
# Build an index straight from document loaders.
# Here you can pass multiple loaders (e.g. for different files).
index = VectorstoreIndexCreator().from_loaders([loader])

In [45]:
# Query the index; query_with_sources is called with the question text and
# its result is displayed as the cell output.
query = "I think I have feeelings for you"
index.query_with_sources(query)

{'question': 'I think I have feeelings for you',
 'answer': ' This question cannot be answered with the given information.\n',
 'sources': '/Users/nikhil.menon/Library/CloudStorage/OneDrive-BigPictureGmbH/BMW-BP/bmw-llm-dms/backend/media/files/20201102_IPA_QA_en-US.json'}

In [79]:
# Show which vector store object backs the index.
index.vectorstore

<langchain.vectorstores.chroma.Chroma at 0x1279235d0>

In [61]:

# NOTE(review): a second OpenAIEmbeddings instance; `embeddings_generator` was
# already created earlier, and this name is not used in the cells shown here.
embeddings = OpenAIEmbeddings()