## Retrieve arguments generated from scratch (vector database)

In [54]:
# Import necessary libraries
## 設定 OpenAI API Key 變數
from dotenv import load_dotenv
import os

# Load the environment variables from .env file
load_dotenv()

# Access the API key
openai_api_key = os.getenv('OPENAI_API_KEY')


In [55]:
# !pip install langchain -q

In [56]:
# !pip install pymupdf -q

In [57]:
import requests
import json
from pprint import pp

## openai api GPT LLM model

In [58]:
def get_completion(messages, model="gpt-3.5-turbo", temperature=0, max_tokens=1000):
  payload = { "model": model, "temperature": temperature, "messages": messages, "max_tokens": max_tokens }
  headers = { "Authorization": f'Bearer {openai_api_key}', "Content-Type": "application/json" }
  response = requests.post('https://api.openai.com/v1/chat/completions', headers = headers, data = json.dumps(payload) )
  obj = json.loads(response.text)
  if response.status_code == 200 :
    return obj["choices"][0]["message"]["content"]
  else :
    return obj["error"]

## openai api GPT embedding model

In [59]:
def get_embeddings(input, dimensions = 1536, model="text-embedding-3-small"):
  payload = { "input": input, "model": model, "dimensions": dimensions }
  headers = { "Authorization": f'Bearer {openai_api_key}', "Content-Type": "application/json" }
  response = requests.post('https://api.openai.com/v1/embeddings', headers = headers, data = json.dumps(payload) )
  obj = json.loads(response.text)
  if response.status_code == 200 :
    return obj["data"][0]["embedding"]
  else :
    return obj["error"]

## PDF Document Loader from langchain

https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf

Load PDF using pypdf into array of documents, where each document contains the page content and metadata with page number.

In [60]:
# !pip install pypdf -q

## 測試資料

2023台灣產業AI化大調查完整報告.pdf

In [61]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("./2023台灣產業AI化大調查完整報告.pdf")
text_docs = loader.load()

# 抓取網站資料
#from langchain.document_loaders import WebBaseLoader
#loader = WebBaseLoader("https://eugeneyan.com/writing/llm-patterns/")
#text_docs = loader.load()


In [62]:
len(text_docs)

36

In [63]:
text_docs[0]

Document(page_content='', metadata={'source': './2023台灣產業AI化大調查完整報告.pdf', 'page': 0})

## install Chroma vector database

https://www.trychroma.com/

In [64]:
# !pip install chromadb -q

In [65]:
import chromadb
# In-memory chroma
chroma_client = chromadb.Client()

collection = chroma_client.create_collection(name="collections")

UniqueConstraintError: Collection collections already exists

In [None]:
# # delete a collection if you want
# chroma_client.delete_collection("collections")

## 利用 langchain 來對提取出來的 PDF 文本進行 chunks 分割

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)


## 將切割好的 chunk 加入到 chroma collections

In [None]:
text_docs[0].metadata

{'source': './2023台灣產業AI化大調查完整報告.pdf', 'page': 0}

In [70]:
for page in text_docs:
    chunks = text_splitter.split_text(page.page_content)

    if chunks == []:
        continue

    collection.add(
        documents = chunks,
        embeddings = [ get_embeddings(chunk) for chunk in chunks ] , # chroma 有內建的 embedding 功能，可以直接使用，但也可以指定自己的 embedding
        metadatas = [ { "page": page.metadata['page'], "date": "2024-03-31" } for x in range( len(chunks) ) ], # 可以加入 metadata 作為之後檢索過濾條件
        ids=[f"doc-1-page-{page.metadata['page']}-chunk-{x}" for x in range( len(chunks) ) ]
    )

In [74]:
question = "哪一個產業對於AI的落地發展較全面?"

## 透過 chroma 提供的 API 匹配最相似的 chunks

Chroma API 文件: https://docs.trychroma.com/usage-guide

In [75]:
results = collection.query(
    query_embeddings = get_embeddings(question),
    # 可有 where 參數可針對上述的 metadatas 做過濾，例如日期、頁數等
    n_results=3
)


In [80]:
results

{'ids': [['doc-1-page-24-chunk-0',
   'doc-1-page-6-chunk-0',
   'doc-1-page-6-chunk-1']],
 'distances': [[0.6292528510093689, 0.7028548717498779, 0.7053892612457275]],
 'metadatas': [[{'date': '2024-03-31', 'page': 24},
   {'date': '2024-03-31', 'page': 6},
   {'date': '2024-03-31', 'page': 6}]],
 'embeddings': None,
 'documents': [['25\n2023 �� AI化大調查趨勢與管理 策略與分析Total 資通訊科技業 (ICT) 專業服務業 零售貿易服務業 製造業 / 政府機關 / 其他\n17.8  \n平均分數 (0-100  分) 25.5 14.3 14.1 10.4\n目前仍在規劃中 / 沒有應用過任何 AI技術 37.2% 22.3% 35.1% 43.2% 60.5%\n生成式 AI( 不包含使用現成服務，如 ChatGPT 、Copilot ) 28.6% 37.7% 26.0% 35.1% 13.6%\nMachine Learning(ML) 25.8% 37.7% 16.9% 21.6% 17.3%\n電腦視覺 22.5% 34.6% 14.3% 13.5% 14.8%\n自然語言處理 20.0% 26.9% 20.8% 18.9% 8.6%\n語音與音訊處理 18.8% 23.8% 26.0% 10.8% 7.4%\nDeep Learning(DL) 16.9% 28.5% 11.7% 2.7% 9.9%\n推薦系統 12.9% 11.5% 11.7% 18.9% 13.6%\nAutoML / Low Code AI / No Code AI 9.8% 13.8% 6.5% 10.8% 6.2%\nEdge AI 9.5% 18.5% 5.2% .0% 3.7%\nMLOps 8.0% 14.6% 3.9% 2.7% 3.7%\nReinforcement Learning(RL) 5.5% 9.2% .0%

## 將 retrieve 的結果整理成一個完整的 context

In [83]:
documents = results['documents'][0]
context = '\n'.join('* ' + doc for doc in documents)


print(context)

* 25
2023 �� AI化大調查趨勢與管理 策略與分析Total 資通訊科技業 (ICT) 專業服務業 零售貿易服務業 製造業 / 政府機關 / 其他
17.8  
平均分數 (0-100  分) 25.5 14.3 14.1 10.4
目前仍在規劃中 / 沒有應用過任何 AI技術 37.2% 22.3% 35.1% 43.2% 60.5%
生成式 AI( 不包含使用現成服務，如 ChatGPT 、Copilot ) 28.6% 37.7% 26.0% 35.1% 13.6%
Machine Learning(ML) 25.8% 37.7% 16.9% 21.6% 17.3%
電腦視覺 22.5% 34.6% 14.3% 13.5% 14.8%
自然語言處理 20.0% 26.9% 20.8% 18.9% 8.6%
語音與音訊處理 18.8% 23.8% 26.0% 10.8% 7.4%
Deep Learning(DL) 16.9% 28.5% 11.7% 2.7% 9.9%
推薦系統 12.9% 11.5% 11.7% 18.9% 13.6%
AutoML / Low Code AI / No Code AI 9.8% 13.8% 6.5% 10.8% 6.2%
Edge AI 9.5% 18.5% 5.2% .0% 3.7%
MLOps 8.0% 14.6% 3.9% 2.7% 3.7%
Reinforcement Learning(RL) 5.5% 9.2% .0% 5.4% 4.9%
Base:N=325 130 77 37 81Q. 貴公司曾應用以上數據在哪些 AI 技術領域？ (複 選) Technical 1企業曾應用的 AI技術項目
整體產業有近四成仍在規劃或沒有應用任何 AI 技術項目。而生成式 AI 的出現，
提供了企業新的技術選擇，相較於去年使用的技術以機器學習 (含深度學習 ) 技術為主， 並多使用在電腦視覺相關的應用場景， 如辨識、 偵測、 追蹤與分割等功能上。
2023 年的前三大應用 AI 的技術領域為生成式 AI、Machine Learning (ML)、電腦
視覺。2023 整體產業 AI 化表現
* 7
2023 �� AI化大調查趨勢與管理 策略與分析模型） +Automation（自動化流程），三者缺一不可。簡單來說

## 將整理出來的 context 結合設計好的 prompt 送入 LLM 進行問答

prompt template 參考自 https://docs.anthropic.com/claude/docs/advanced-text-analysis

In [84]:
prompt = f"""
I'm going to give you a document. Then I'm going to ask you a question about it. I'd like you to first write down exact quotes of parts of the document that would help answer the question, and then I'd like you to answer the question using facts from the quoted content. Here is the document:

<document>
{context}
</document>

Here is the first question:  {question}

First, find the quotes from the document that are most relevant to answering the question, and then print them in numbered order. Quotes should be relatively short.

If there are no relevant quotes, write "No relevant quotes" instead.

Then, answer the question, starting with "Answer:".  Do not include or reference quoted content verbatim in the answer. Don't say "According to Quote [1]" when answering. Instead make references to quotes relevant to each section of the answer solely by adding their bracketed numbers at the end of relevant sentences.

Thus, the format of your overall response should look like what's shown between the <example></example> tags.  Make sure to follow the formatting and spacing exactly.

<example>

Relevant quotes:
[1] "Company X reported revenue of $12 million in 2021."
[2] "Almost 90% of revenue came from widget sales, with gadget sales making up the remaining 10%."

Answer:
Company X earned $12 million. [1]  Almost 90% of it was from widget sales. [2]

</example>

If the question cannot be answered by the document, say so.

Answer the question immediately without preamble.
請用台灣繁體中文回答.
"""

In [86]:
result = get_completion([ {"role": "user", "content": prompt }], model="gpt-4-turbo-preview")
print(result)

Relevant quotes:
[1] "生成式 AI( 不包含使用現成服務，如 ChatGPT 、Copilot ) 28.6% 37.7% 26.0% 35.1% 13.6%"
[2] "Machine Learning(ML) 25.8% 37.7% 16.9% 21.6% 17.3%"
[3] "電腦視覺 22.5% 34.6% 14.3% 13.5% 14.8%"
[4] "2023 年的前三大應用 AI 的技術領域為生成式 AI、Machine Learning (ML)、電腦視覺。"

Answer:
專業服務業對於AI的落地發展較全面。[1][2][3][4]


## demo UI 介面

In [87]:
import gradio as gr
gr.close_all()

def handle_input(query):
  results = collection.query(
    query_embeddings = get_embeddings(query),
    n_results=3
  )

  documents = results['documents'][0]
  context = '\n'.join('* ' + doc for doc in documents)

  prompt = f"""
I'm going to give you a document. Then I'm going to ask you a question about it. I'd like you to first write down exact quotes of parts of the document that would help answer the question, and then I'd like you to answer the question using facts from the quoted content. Here is the document:

<document>
{context}
</document>

Here is the first question:  {query}

First, find the quotes from the document that are most relevant to answering the question, and then print them in numbered order. Quotes should be relatively short.

If there are no relevant quotes, write "No relevant quotes" instead.

Then, answer the question, starting with "Answer:".  Do not include or reference quoted content verbatim in the answer. Don't say "According to Quote [1]" when answering. Instead make references to quotes relevant to each section of the answer solely by adding their bracketed numbers at the end of relevant sentences.

Thus, the format of your overall response should look like what's shown between the <example></example> tags.  Make sure to follow the formatting and spacing exactly.

<example>

Relevant quotes:
[1] "Company X reported revenue of $12 million in 2021."
[2] "Almost 90% of revenue came from widget sales, with gadget sales making up the remaining 10%."

Answer:
Company X earned $12 million. [1]  Almost 90% of it was from widget sales. [2]

</example>

If the question cannot be answered by the document, say so.

Answer the question immediately without preamble.
請用台灣繁體中文回答.
"""

  result = get_completion([ {"role": "user", "content": prompt }], model="gpt-4-turbo-preview")
  return result

demo = gr.Interface(fn=handle_input,
                    inputs=[gr.Textbox(label="您的問題", lines=1)],
                    outputs=[gr.Textbox(label="回答", lines=10)],
                    allow_flagging="never",
                    title="與財經報告 PDF 聊天",
                    examples=[ ["AI產業趨勢如何?"], ["美國經濟如何?"], ["中國經濟如何?"], ["台灣經濟如何?"]]
                   )
demo.launch(share=True, debug=True)

Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app
