In [1]:
# Install Python 3.10.0: https://www.python.org/ftp/python/3.10.0/python-3.10.0-amd64.exe
# (optionally) upgrade pip: python3 -m pip install --upgrade pip

# pip install wheel
# pip install unstructured[local-inference]
# pip install poppler-utils
# pip install git+https://github.com/facebookresearch/detectron2.git

# Download tesseract-ocr: https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-5.3.1.20230401.exe
# Download poppler: https://github.com/oschwartz10612/poppler-windows/releases/download/v23.01.0-0/Release-23.01.0-0.zip
# Add poppler's bin directory to the PATH environment variable, e.g. C:\Program Files\Poppler\Library\bin

# pip install langchain
# pip install pinecone-client[grpc]
# pip install tiktoken
# pip install python-dotenv


In [4]:
import os

import pytesseract as pt

# Tell pytesseract where the Tesseract executable lives. The location can be
# overridden with the TESSERACT_CMD environment variable so the notebook also
# runs on machines where Tesseract is installed elsewhere; when the variable
# is unset or empty, the default Windows install location is used.
pt.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
)

from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


### Load your data

In [5]:
# Build a loader for the local copy of the PDF. The path is relative, so it is
# resolved against the notebook's working directory.
loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# Alternative (disabled): load the same PDF from its URL instead of from disk.
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [6]:
# Run the loader. `data` is a sequence of documents: later cells take len(data)
# and read data[0].page_content.
data = loader.load()

In [7]:
import pickle

# Persist the loaded documents next to the source PDF so they can be reused
# without re-running the loader.
with open("../data/field-guide-to-data-science.pkl", "wb") as pickle_file:
    pickle.dump(data, pickle_file)


In [8]:
# Sanity check: number of loaded documents and the size of the first one.
print(f"You have {len(data)} document(s) in your data")
print(f"There are {len(data[0].page_content)} characters in your document")

You have 1 document(s) in your data
There are 167575 characters in your document


### Chunk your data up into smaller documents

In [9]:
# Split the loaded document(s) into smaller pieces for embedding.
# chunk_size caps the length of each piece and chunk_overlap=0 means
# neighbouring pieces share no text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [10]:
# Report how many chunks the splitter produced.
print(f"Now you have {len(texts)} documents")

Now you have 226 documents


### Create embeddings of your documents to get ready for semantic search

In [11]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [24]:
import os

from dotenv import load_dotenv

# Load variables from a .env file (found by python-dotenv's default lookup)
# into the process environment before they are read below.
load_dotenv()

# API credentials and the Pinecone environment name. Each value is None when
# the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")


In [13]:
# Embedding client used to vectorize the chunks and, later, the queries.
# It is given the OpenAI key read from the environment.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [14]:
# Initialize the Pinecone client with the credentials read from the
# environment. This must run before the index is used.
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
# Name of the Pinecone index the chunks are written to and queried from.
# NOTE(review): presumably the index must already exist - confirm.
index_name = "test" # put in the name of your pinecone index here

In [16]:
# Embed every chunk and store it in the Pinecone index named by `index_name`.
# from_documents passes each chunk's metadata along with its text, so the
# metadata is kept in the index instead of being discarded (the previous
# from_texts call was given only the page_content strings).
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [26]:
# Retrieve the stored chunks most similar to the question.
# `docs` is a sequence of documents; the next cell reads docs[0].page_content.
query = "What are examples of good data science teams?"
docs = docsearch.similarity_search(query, include_metadata=True)

In [27]:
# Preview the first returned document: show only the first 250 characters of
# its text so the cell output stays short.
docs[0].page_content[:250]

'At Booz Allen Hamilton, we built an industry-leading team of Data Scientists. Over the course of hundreds of analytic challenges for countless clients, we’ve unraveled the DNA of Data Science. Many people have put forth their thoughts on single aspec'

### Query those docs to get your answer back

In [28]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [29]:
# Completion model used to write the answer. temperature=0 asks for the least
# random output, and the key comes from the environment.
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
# Question-answering chain over a set of input documents.
# NOTE(review): the "stuff" chain type presumably puts all supplied documents
# into a single prompt - confirm against the LangChain docs.
chain = load_qa_chain(llm, chain_type="stuff")

In [30]:
# End-to-end question answering: retrieve the chunks most similar to the
# question, then have the chain answer it from those chunks. The last
# expression's value (the answer text) is shown as the cell output.
query = "What is data science?"
docs = docsearch.similarity_search(query, include_metadata=True)
chain.run(input_documents=docs, question=query)


' Data Science is the art of turning data into actions. This is accomplished through the creation of data products, which provide actionable information without exposing decision makers to the underlying data or analytics (e.g., buy/sell strategies for financial instruments, a set of actions to improve product yield, or steps to optimize processes).'