# 환경설정

In [17]:
#| label: openai-setup
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import openai
import os

from dotenv import load_dotenv, find_dotenv
# Locate the nearest .env file and load its variables into the process environment.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Hand the key stored under ENV_OPENAI_API_KEY to the openai module.
openai.api_key = os.environ.get('ENV_OPENAI_API_KEY')

# 데이터

In [2]:
# Packages installed for this step (uncomment to install into the kernel's environment):
# %pip install unstructured
# %pip install pdf2image
# %pip install pytesseract
# %pip install torchvision
# %pip install 'git+https://github.com/facebookresearch/detectron2.git'
# %pip install unstructured_inference

# Read the PDF from the local data directory into LangChain document objects.
pdf_path = "../data/field-guide-to-data-science.pdf"
loader = UnstructuredPDFLoader(pdf_path)
data = loader.load()


In [3]:
# Report how many documents were loaded and the character count of the first one.
print(
    f'You have {len(data)} document(s) in your data',
    f'There are {len(data[0].page_content)} characters in your document',
    sep='\n',
)

You have 1 document(s) in your data
There are 167738 characters in your document


# 텍스트 쪼개기

In [12]:
# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Break the loaded document(s) into chunks and report how many were produced.
texts = text_splitter.split_documents(data)
print(f'Now you have {len(texts)} documents')

Now you have 227 documents


In [16]:
# Display the first chunk (as a one-element list) to inspect what the splitter produced.
texts[:1]

[Document(page_content='FIELD GUIDEto DATA SCIENCEBooz | Allen | Hamilton\n\nTHE FI ELD GUI DE DATA SCIENCE to S E C O N D E D I T I O N © COPYRIGHT 2015 BOOZ ALLEN HAMILTON INC. ALL RIGHTS RESERVED.\n\nA\n\n\n\nData Science touches every aspect of our lives on a daily basis. When we visit the doctor, drive our get on an airplane, or shop for services, Data is changing the way we interact with and explore\n\nOur world is now measured, mapped, and recorded in digital bits. Entire lives, from birth to death, are now catalogued in the digital realm. These data, originating from such diverse sources as connected vehicles, underwater microscopic cameras, and photos we post to social media, have propelled us into the greatest age of discovery humanity has ever known. It is through Data Science that we are unlocking the secrets hidden within these data. We are making discoveries that will forever change how we live and interact', metadata={'source': '../data/field-guide-to-data-science.pdf'})

# 임베딩

In [5]:
# !pip install pinecone-client
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [21]:
import hashlib

# Embedding model used to vectorise every chunk (and, later, every query).
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('ENV_OPENAI_API_KEY'))

# Initialise the Pinecone client from environment variables.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),  # find at app.pinecone.io
    environment=os.getenv('PINECONE_API_ENV'),  # next to api key in console
)

index_name = "langchain-pdf"  # put in the name of your pinecone index here

chunk_texts = [t.page_content for t in texts]

# Keep each chunk's metadata (e.g. its source path) alongside its vector instead
# of discarding it. The dicts are copied defensively so that the vector store
# cannot modify the metadata held by the chunks in `texts`.
chunk_metadatas = [dict(t.metadata) for t in texts]

# Stable, content-derived ids: re-running this cell upserts the same vectors
# again rather than adding a second copy of every chunk under new ids.
chunk_ids = [
    hashlib.sha256(chunk.encode("utf-8")).hexdigest() for chunk in chunk_texts
]

# Embed the chunks and upsert them into the index; `docsearch` is the handle
# used for similarity queries in the cells below.
docsearch = Pinecone.from_texts(
    chunk_texts,
    embeddings,
    metadatas=chunk_metadatas,
    ids=chunk_ids,
    index_name=index_name,
)

In [23]:
query = "What are examples of good data science teams?"

# Ask the vector store for the chunks it ranks closest to the query.
docs = docsearch.similarity_search(query, include_metadata=True)

# Show the first 250 characters of the first result.
best_match = docs[0]
best_match.page_content[:250]

'At Booz Allen Hamilton, we built an industry-leading team of Data Scientists. Over the course of hundreds of analytic challenges for countless clients, we’ve unraveled the DNA of Data Science. Many people have put forth their thoughts on single aspec'

# 질의응답

In [25]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
# Completion model used to answer questions; the key is read from the environment.
llm = OpenAI(
    openai_api_key=os.getenv('ENV_OPENAI_API_KEY'),
    temperature=0,
)

# Question-answering chain built around that model.
chain = load_qa_chain(llm, chain_type="stuff")

In [27]:
query = "What is the collect stage of data maturity?"

# Retrieve the chunks closest to the question, then have the chain answer from them.
docs = docsearch.similarity_search(query, include_metadata=True)
chain.run(
    question=query,
    input_documents=docs,
)

' The collect stage of data maturity is focused on collecting internal or external datasets. It seeks to enhance or refine raw data as well as leverage basic analytic functions such as counts.'

In [28]:
query = " What is the ideal data science team structure?"

# Retrieve the chunks closest to the question, then have the chain answer from them.
docs = docsearch.similarity_search(query, include_metadata=True)
chain.run(
    question=query,
    input_documents=docs,
)

' It depends on the size, complexity, and business drivers of the organization. The three models to consider are Centralized, Deployed, and Diffused.'

In [29]:
# Same retrieval + answer flow, this time with the question asked in Korean.
query = " 가장 일반적인 데이터 과학 팀 구조는 어떻게 됩니까?"

docs = docsearch.similarity_search(query, include_metadata=True)
chain.run(
    question=query,
    input_documents=docs,
)

' The most common Data Science team structure is the Deployed Model.'