In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

### Load the data

In [2]:
# Build a loader for the source PDF; the path is relative to the notebook's
# working directory, so the file must sit next to the notebook.
loader = PyPDFLoader(file_path='field-guide-to-data-science.pdf')

In [4]:
# Read the PDF through the loader; `data` is a sequence of document objects
# that each expose a `page_content` string.
data = loader.load()
# Report how many documents the loader produced.
print(f'You have {len(data)} document(s) in your data')
# NOTE(review): index 30 is hard-coded, so this line raises IndexError when
# the loader yields fewer than 31 documents. It also measures only that one
# entry, although the message reads as if it covered the whole document.
print(f'There are {len(data[30].page_content)} characters in your document')

You have 126 document(s) in your data
There are 2812 characters in your document


### Chunking up data into smaller documents

In [5]:
# Split the loaded documents into chunks capped at 2000 characters, with no
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents=data)

In [7]:
# Sanity check: report how many chunks the splitter produced from `data`.
print(f'Now you have {len(texts)} documents')

Now you have 162 documents


### Creating embeddings for semantic search

In [None]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from apikey import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV

In [15]:
# Embedding client used to vectorise both the chunks and the search queries.
# The key comes from the local `apikey` module rather than being hard-coded.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [20]:
# Name of the Pinecone index that will hold the chunk embeddings.
# NOTE(review): presumably this index must already exist in the Pinecone
# project — confirm before running on a fresh account.
index_name = 'langchaintest'

# Configure the Pinecone client with the credentials from `apikey`.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

In [23]:
# Embed every chunk and store it in the Pinecone index.
# `from_documents` forwards each chunk's text together with its metadata,
# whereas passing bare `page_content` strings to `from_texts` discarded the
# metadata, so search hits could not be traced back to their origin.
# NOTE(review): re-running this cell presumably uploads the chunks again —
# confirm whether the index should be cleared first to avoid duplicates.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [24]:
# First retrieval check: fetch the stored chunks most similar to the question.
query = 'What are examples of good data science teams?'
docs = docsearch.similarity_search(query=query)

In [26]:
# Show the first 450 characters of the first returned chunk as a quick
# relevance check.
top_hit_preview = docs[0].page_content[:450]
print(top_hit_preview)

Intelligence and cloud infrastructure development  
work. We saw the need for a  
new approach to distill value 
from our clients’ data. We 
approached the problem 
with a multidisciplinary 
team of computer scientists, 
mathematicians and domain 
experts. They immediately 
produced new insights and 
analysis paths, solidifying the 
validity of the approach. Since 
that time, our Data Science  
team has grown to 250 staff 
supporting dozens of cl


### Query the docs to get an answer

In [27]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [28]:
# Completion model that writes the final answer; temperature is pinned to 0.
llm = OpenAI(
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

# Question-answering chain over the retrieved documents.
# NOTE(review): 'stuff' presumably places all input documents into a single
# prompt — confirm it fits the model's context window for larger result sets.
chain = load_qa_chain(llm=llm, chain_type='stuff')

In [30]:
# Question to answer, plus the chunks retrieved as context for it.
# This rebinds `query` and `docs` from the earlier retrieval check.
query = 'What is the collect stage of data maturity?'
docs = docsearch.similarity_search(query=query)

In [31]:
# Run the QA chain over the retrieved chunks and display the answer as the
# cell's output.
answer = chain.run(input_documents=docs, question=query)
answer

' The collect stage of data maturity focuses on collecting internal or external datasets, such as gathering sales records and corresponding weather data.'