## Langchain Documents and Vector DB - sentence-transformers, ChromaDB

1. VECTOR DB
- https://huggingface.co/sentence-transformers
- https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

2. LANGCHAIN 手冊
https://python.langchain.com/docs/integrations/vectorstores

3. 學習LANGCHAIN -> Documents and Vector DB 

## 初始環境設定

In [None]:
import os
from pathlib import Path

# Add the per-user bin directory (~/.local/bin) to this process's PATH.
# Presumably this is so console scripts installed per-user (e.g. gdown) can be
# found by later shell commands -- confirm for your environment.
HOME = str(Path.home())
Add_Binarry_Path = str(Path(HOME) / '.local' / 'bin')

_current_path = os.environ.get('PATH', '')
# Only append when missing, so re-running the cell does not keep growing PATH.
if Add_Binarry_Path not in _current_path.split(os.pathsep):
    os.environ['PATH'] = (
        _current_path + os.pathsep + Add_Binarry_Path
        if _current_path
        else Add_Binarry_Path
    )

# Current working directory of the kernel (replaces shelling out to `pwd`).
current_foldr = os.getcwd()
current_foldr

## 安裝套件

In [None]:
## For colab
!pip install chromadb gdown langchain openai pypdf python-dotenv sentence-transformers -q

### LOAD LIBRARY

In [None]:
# Load library
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

### 文件處理

In [None]:
# Create the local folder that will receive the downloaded PDF
# (-p: no error if it already exists).
!mkdir -p data/pdf/
# Download the file with the given ID into data/pdf/ using gdown.
# NOTE(review): presumably this is the PDF loaded in the next cell -- verify
# that the saved filename matches the path used there.
!gdown 1AldhEWVCtcE50XARgSnXR0azZ965nNmT -O data/pdf/

In [None]:
# Document ingestion: load the PDF, then cut it into overlapping text chunks.
pdf_file = './data/pdf/e2729e76-29a0-4be5-9eef-67809b05d6b9.pdf'
loader = PyPDFLoader(pdf_file)
documents = loader.load()

# Chunks of up to 1000 characters, with 50 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)

# Quick sanity check: how many chunks were produced, plus one sample chunk.
print(len(texts))
print(texts[3:4])

### 片段文字向量化與暫時存入資料庫

In [None]:
# Embed every chunk with a sentence-transformers model and index the vectors
# in a Chroma store. No persist directory is passed in this call.
Embeddings_ID = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=Embeddings_ID)
vectortdb = Chroma.from_documents(documents=texts, embedding=embeddings)

# Alternative: pass a persist directory so the index is kept on disk.
#DB_PATH = 'vectorstore/db_chroma'
#vectortdb = Chroma.from_documents(documents=texts, embedding=embeddings, persist_directory=DB_PATH)

### 輸入文字向量化與暫存資料庫搜尋

In [None]:
import os

# Alternative: reload a previously persisted vector DB instead of using the
# store built in the cell above.
#Embeddings_ID="sentence-transformers/all-MiniLM-L6-v2"
#embeddings=HuggingFaceEmbeddings(model_name=Embeddings_ID)
#DB_PATH = 'vectorstore/db_chroma'
#vectortdb = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)

# Test search in the vector DB: fetch the 3 chunks most similar to the query.
query = "請說明本季季報內容？請依以下順序描述重點：收入、毛利率、營運支出、營運利潤率、淨利潤和每股盈餘"
source_documents = vectortdb.similarity_search(query, k=3)

# Print each hit with its originating file name and page number.
for doc in source_documents:
    page_content = doc.page_content
    page = doc.metadata["page"]
    source = doc.metadata["source"]
    # Show only the file name, not the full path stored in the metadata.
    file = os.path.basename(source)
    # page + 1: presumably the stored page index is 0-based -- TODO confirm.
    print("Source: " + file + ", Page " + str(page + 1))
    print(page_content)
    print("\n\n")

In [None]:
## Content: concatenate the text of every retrieved chunk, one per line.
# Joining over the whole result list (instead of indexing [0], [1], [2])
# avoids an IndexError when the search returns fewer than three documents.
content = "".join(doc.page_content + "\n" for doc in source_documents)
print(content)

In [None]:
## RAG: send the retrieved passages to a chat model and print its answer.

import os

from openai import OpenAI

# The OpenAI client is pointed at the NVIDIA endpoint given in base_url.
# The API key is read from the NVIDIA_API_KEY environment variable so the
# secret does not have to be pasted into the notebook; the original
# placeholder value is kept as the fallback.
client = OpenAI(
  base_url = "https://integrate.api.nvidia.com/v1",
  api_key = os.environ.get("NVIDIA_API_KEY", "nvapi-")
)

# Two user messages: the instruction first, then the retrieved context
# built in the previous cell (`content`).
response = client.chat.completions.create(
  #model="nvidia/nemotron-4-340b-instruct",
  model="microsoft/phi-3-medium-4k-instruct",
  messages=[
      {"role":"user","content":"請參考下方的資料, 說明本季季報內容？請依以下順序描述重點：收入、毛利率、營運支出、營運利潤率、淨利潤和每股盈餘"},
      {"role":"user","content": content}
  ],
  temperature=0.2,
  top_p=0.7,
  max_tokens=1024,
)
# Print the text of the first (and only requested) completion choice.
print(response.choices[0].message.content)