##### Load Package

In [1]:
from langchain_openai import OpenAI
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

from langchain_community.vectorstores import FAISS
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.chains import RetrievalQA

from langchain.docstore.document import Document
import os
from dotenv import load_dotenv

##### Set Up Environment

In [3]:
# Load variables from the local .env file into the process environment.
load_dotenv(dotenv_path = "../Key/.env")

# Read the key back so later cells can reference it by name. load_dotenv has
# already placed it in os.environ, so writing it back there is unnecessary.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Fail early with an actionable message instead of an opaque TypeError
# (assigning None into os.environ) or a late authentication failure.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in ../Key/.env or export it "
        "in the environment before running this notebook."
    )

# Document Loader

In [15]:
# Short inline excerpt of the transcript, used as a hand-built sample document.
transcript_text = (
    "Hello, and welcome to our Q4 2016 financial results conference call\n"
    "Joining us today to answer your questions is Brian Olsavsky, our CFO\n"
    "As you listen to today's conference call"
)

# Wrap the raw text in a single-element list of Document objects, tagging it
# with an identifier in the metadata.
document = [
    Document(
        page_content=transcript_text,
        metadata={"doc_id": "text_1"},
    )
]

document

[Document(page_content="Hello, and welcome to our Q4 2016 financial results conference call\nJoining us today to answer your questions is Brian Olsavsky, our CFO\nAs you listen to today's conference call", metadata={'doc_id': 'text_1'})]

* use document loaders to load data from a source

In [9]:
# Build a loader for the transcript file (path is relative to the notebook's
# working directory).
loader = TextLoader("Amazon_Transcript.txt")

# NOTE(review): this rebinds `document`, replacing the inline sample built in
# the earlier cell. Later cells use whichever assignment ran last; the saved
# execution counts (In [9] here vs. In [15] for the inline sample) suggest the
# recorded downstream outputs came from the inline sample, not this file.
document = loader.load()

document

[Document(page_content="Hello, and welcome to our Q4 2016 financial results conference call\nJoining us today to answer your questions is Brian Olsavsky, our CFO\nAs you listen to today's conference call, we encourage you to have our press release in front of you, which includes our financial results as well as metrics and commentary on the quarter\nPlease note, unless otherwise stated, all comparisons in this call will be against our results for the comparable period of 2015. Our comments and responses to your questions reflect management's views as of today, February 2, 2017 only and will include forward-looking statements\nActual results may differ materially\nAdditional information about factors that could potentially impact our financial results is included in today's press release and our filings with the SEC, including our most recent Annual Report on Form 10-K and subsequent filings\nDuring this call, we may discuss certain non-GAAP financial measures\nIn our press release, sli

# Document Splitter

* We may want to split a long document into smaller chunks that can fit into the model's context window. \
  LangChain has a number of built-in document transformers that split, combine, and otherwise manipulate documents. \
  step 1: split the text into small, semantically meaningful chunks (often sentences) \
  step 2: start combining these small chunks into a larger chunk until it reaches a certain size \
  step 3: once we reach that size, make that chunk its own piece of text and then start creating a new chunk of text with some overlap (to keep context between chunks)

In [16]:
# Splitter configuration: chunks of at most 500 units as measured by `len`
# (i.e. characters), with no overlap between consecutive chunks.
doc_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_size=500,
    chunk_overlap=0,
)

# Split whatever `document` currently holds into chunked Document objects.
doc_split = doc_splitter.split_documents(document)

# Show the chunks, how many were produced, and the size of the first one.
print(doc_split)
print(len(doc_split))
first_chunk_length = len(doc_split[0].page_content)
print(first_chunk_length)


[Document(page_content="Hello, and welcome to our Q4 2016 financial results conference call\nJoining us today to answer your questions is Brian Olsavsky, our CFO\nAs you listen to today's conference call", metadata={'doc_id': 'text_1'})]
1
177


In [17]:
# Character count of the first chunk's text (same value printed by the cell above).
len(doc_split[0].page_content)

177

# Embeddings And Vector Stores

In [18]:
# Embedding model client; constructed with default settings.
embedding = OpenAIEmbeddings()

# Embed every chunk in `doc_split` and index the vectors in a FAISS store.
xb = FAISS.from_documents(
    documents=doc_split,
    embedding=embedding,
)

In [19]:
query_1 = "what is the plan for amazon"

# Run the search once. similarity_search_with_score returns (Document, score)
# pairs, so the plain document list is derived from it instead of issuing a
# second, identical search (which would embed the query a second time).
# NOTE(review): with the default FAISS configuration the score is presumably a
# distance (lower = more similar) - confirm against the vector store docs.
query_1_answer_score = xb.similarity_search_with_score(query_1)
query_1_answer = [doc for doc, _ in query_1_answer_score]

# Guard the [0] lookups: an empty index or a filtered search can return nothing.
if query_1_answer_score:
    print(query_1_answer[0].page_content)
    print(query_1_answer_score[0][1])
else:
    print("No documents returned for query:", query_1)

Hello, and welcome to our Q4 2016 financial results conference call
Joining us today to answer your questions is Brian Olsavsky, our CFO
As you listen to today's conference call
0.5845087


# Retriever

## Vector Store-backed Retriever

In [20]:
# Build a retriever on top of the FAISS vector store.
# * search_type="similarity_score_threshold" sets a similarity score threshold
#   and only returns documents with a score above that threshold
# * search_kwargs can carry further search options such as k
retriever = xb.as_retriever(search_type="similarity_score_threshold",
                            search_kwargs={"score_threshold": 0.5,
                                           "k": 2})

# Retrievers implement the Runnable interface; invoke() is the supported entry
# point and replaces the deprecated get_relevant_documents().
relevant_doc = retriever.invoke("what is the plan for amazon")

relevant_doc

[Document(page_content="Hello, and welcome to our Q4 2016 financial results conference call\nJoining us today to answer your questions is Brian Olsavsky, our CFO\nAs you listen to today's conference call", metadata={'doc_id': 'text_1'})]