In [1]:
# Install Python 3.10.0: https://www.python.org/ftp/python/3.10.0/python-3.10.0-amd64.exe
# (optionally) upgrade pip: python3 -m pip install --upgrade pip

# pip install wheel
# pip install unstructured[local-inference]
# pip install poppler-utils
# pip install git+https://github.com/facebookresearch/detectron2.git

# Download tesseract-ocr: https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-5.3.1.20230401.exe
# Download poppler: https://github.com/oschwartz10612/poppler-windows/releases/download/v23.01.0-0/Release-23.01.0-0.zip
# Put poppler in path: C:\Program Files\Poppler\Library\bin

# pip install pinecone-client[grpc] chromadb openai langchain tiktoken python-dotenv


In [4]:
# import pytesseract as pt

# pt.pytesseract.tesseract_cmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"

# from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader


### Load your data


In [5]:
# loader = UnstructuredPDFLoader("../data/field-guide-to-data-science.pdf")
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")


In [6]:
# data = loader.load()


In [1]:
import pickle

# with open("../data/field-guide-to-data-science.pkl", "wb") as f:
#     pickle.dump(data, f)

# Reload the previously pickled documents instead of re-running the
# (commented-out) PDF loader cells above.
# NOTE(review): unpickling can execute arbitrary code -- only load .pkl files
# that you created yourself or otherwise trust.
with open("../data/lecture_notes.pkl", "rb") as f:
    data = pickle.loads(f.read())


In [2]:
# Sanity check on what was loaded: number of documents, then the size of the
# first one (only data[0] is measured).
doc_count = len(data)
print(f"You have {doc_count} document(s) in your data")

first_doc_chars = len(data[0].page_content)
print(f"There are {first_doc_chars} characters in your document")


You have 1 document(s) in your data
There are 481103 characters in your document


### Chunk your data up into smaller documents


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

# Break the loaded document(s) into smaller documents for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
texts = text_splitter.split_documents(data)


In [4]:
# Report how many chunks the splitter produced.
chunk_count = len(texts)
print(f"Now you have {chunk_count} documents")


Now you have 618 documents


### Create embeddings of your documents to get ready for semantic search


In [5]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
import chromadb


In [6]:
import os

from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment, so the API key never has to be hardcoded in the notebook.
load_dotenv()

# This is None when OPENAI_API_KEY is not set anywhere; the value is passed
# explicitly to the OpenAI clients created in later cells.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


In [7]:
# Embedding client; it is handed to the Chroma vector store when the index is built.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)


In [8]:
collection_name = "lecture_notes"

# Only the raw text of each chunk is indexed; the chunks' metadata is not
# passed to Chroma here.
chunk_texts = [chunk.page_content for chunk in texts]

# Build the vector store from the chunk texts. Per the message this call logs,
# the store is not persisted, so it is rebuilt whenever this cell runs.
docsearch = Chroma.from_texts(
    chunk_texts,
    embeddings,
    collection_name=collection_name,
)


Using embedded DuckDB without persistence: data will be transient


In [9]:
# Smoke-test the index with a generic question.
query = "What is this book about?"

# `include_metadata=True` was removed: it is a Pinecone-style option, not a
# parameter of Chroma's similarity_search. The Chroma wrapper either ignores
# it silently or, in releases that forward extra kwargs to the collection
# query, raises a TypeError.
docs = docsearch.similarity_search(query)


In [10]:
# Preview the top-ranked hit: show only the first 250 characters of its text.
top_doc = docs[0]
top_doc.page_content[:250]


'Each chapter of this document provides a literature section that describes suggested reading. The suggested reading is not part of the exam. It is for students who are interested in background information and/or a different perspective on or presenta'

### Query those docs to get your answer back


In [11]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain


In [12]:
# Completion model used by the question-answering chain; temperature is
# pinned to 0 and the API key comes from the environment.
llm = OpenAI(
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)


In [25]:
# NOTE(review): the question below is blank, so the saved output of this cell
# must have come from a different query that is no longer in the notebook.
# Put the actual question between the triple quotes before re-running.
query = """
"""

# Fetch the chunks most similar to the question.
# `include_metadata=True` was removed: it is a Pinecone-style option, not a
# parameter of Chroma's similarity_search. The Chroma wrapper either ignores
# it silently or, in releases that forward extra kwargs to the collection
# query, raises a TypeError.
docs = docsearch.similarity_search(query)

# Answer the question with the retrieved chunks as context; the chain's return
# value is the cell's displayed output.
chain = load_qa_chain(llm, chain_type="stuff")
chain.run(input_documents=docs, question=query)


' The security issue in this article is related to protecting the online music store from attackers and their goals. This includes protecting the store from unauthorized access, data breaches, and other malicious activities. The security requirements engineering process should identify the actors, their interests, and interdependencies, as well as potential attackers and their goals. The security design should then be enhanced by considering potential threats and techniques introduced in chapters 8, 9, and 10, and by applying countermeasures, threats, and techniques to protect the store from attackers.'

In [18]:
# from langchain.chains.summarize import load_summarize_chain

# chain = load_summarize_chain(llm, chain_type="map_reduce")
# chain.run(docs)
