In [None]:
## INSTALL PACKAGE
!pip install -q markdown pinecone-client openai gdown xformers ctransformers tokenizers transformers accelerate langchain chainlit sentence_transformers chromadb unstructured PyPDF2 pypdf bitsandbytes faiss_cpu faiss_gpu huggingface_hub hf_transfer optimum -q
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  -q # Use cu117 if on CUDA 11.7

In [None]:
# 01: Configure
# The Pinecone credentials are read from the environment so that a real key never
# has to be typed into (and saved with) the notebook. When the variables are not
# set, the original literal values are used as fallbacks.
import os

pdf_file = 'Medical_Chatbot.pdf'  # path of the PDF that gets loaded and indexed
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'xxxxx')  # 'xxxxx' is only a placeholder
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')
index_name = "cjz-medical"  # Pinecone index the chunks are written to
Embeddings_ID = "sentence-transformers/all-MiniLM-L6-v2"  # Hugging Face id of the embedding model

In [None]:
!gdown 1pUDgs3YMnlr8See8Rld3L1ZRiTeeOlMM
#!mkdir pdfs
#!gdown 1pUDgs3YMnlr8See8Rld3L1ZRiTeeOlMM -O pdfs/Medical_Chatbot.pdf

In [None]:
!ls Medical_Chatbot.pdf

In [None]:
# 02: Load LIBRARY
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone

In [None]:
# 03: Load PDF
# Read the configured PDF; `data` holds the list returned by the loader.
loader = PyPDFLoader(pdf_file)
data = loader.load()
# Preview the first ten loaded entries (pages, per the original note).
data[:10]

In [None]:
# 04: Text splitter
# Split the loaded documents into chunks (chunk_size=500, no overlap).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=500,
)
docs = text_splitter.split_documents(data)
# Preview chunks 1000-1009.
docs[1000:1010]

In [None]:
# 05: Embedding model (384 dimensions)
# The model id is taken from `Embeddings_ID` in the configuration cell.
embeddings = HuggingFaceEmbeddings(
    model_name=Embeddings_ID,
)

In [None]:
# 06: Store the chunks in the Pinecone vector database
# Initialise the client with the configured key and environment, then embed the raw
# text of every chunk and write it to the index named `index_name`.
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)
chunk_texts = [chunk.page_content for chunk in docs]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

In [None]:
# 07. Vector verification
# 0.00100201822,-0.0879787,-0.0121345911,0.000141251716,-0.0305802971,-0.0635476932,-0.00554865832,0.0155437924,0.0310709625,-0.0551753715,-0.0272236336,0.0335728712,-0.0554190911,0.0753775463,-0.0810973197,0.0584557615,-0.0399320163,-0.0538939722,-0.0225356463,-0.0228182543,0.0983394831,-0.0244991,-0.0298304707,0.0355674885,0.0172437858,-0.0290708039,0.00247312849,-0.00694054458,0.00934407953,-0.00675766775,0.0333429798,-0.0124766016,-0.0394415967,0.0457644,-0.0280858278,-0.0427881218,-0.0202028789,-0.0481561273,-0.04873503,-0.0477370508,0.0865190178,-0.150480494,-0.059747465,0.0476341136,-0.0901464596,0.0377526321,-0.0655688867,-0.0507967845,-0.0195009783,0.00771954097,-0.0589987785,-0.047139097,-0.0696333125,0.0627996847,0.0106975567,0.0732211769,0.0296002813,-0.0103422487,0.0250978488,-0.13682501,-0.127584696,0.0958755761,-0.00386591605,-0.0296135414,0.0230881833,0.0138041703,-0.00896615628,-0.0221488737,0.0524216853,-0.0445768125,-0.0183927678,-0.00776625,0.0319657773,-0.00278262165,-0.0487341136,-0.0595503077,-0.0783658326,-0.00741075212,-0.0194750428,-0.0124614071,0.0757125393,0.0245649051,0.0455409102,0.0816792771,-0.0166955534,-0.00727742072,-0.0399827473,0.0791423321,-0.0169981532,0.0228592027,0.0322342776,0.0220089331,-0.0803845599,-0.0361780152,0.0310536,-0.0731141567,0.0447544456,-0.0957708806,0.0532552712,-0.0466087721,-0.0489482,0.0236297511,0.00609217,-0.00254002307,-0.049574744,0.0289640799,0.0697844326,0.0402957909,-0.0338448025,0.0117008,0.0293432549,-0.0149991047,0.0869607,-0.0211856663,0.0100638177,0.123417199,0.0284997802,0.0490373857,-0.0340036042,-0.0103725092,0.0208557956,0.0284522176,0.104265921,-0.0161122531,0.0369163714,0.163942218,0.021986492,2.20114697e-33,0.00960067566,0.00104967237,-0.033008378,0.0417723432,-0.00391326286,0.0338333957,0.0128404535,0.0195410717,0.0746719539,-0.0779181048,-0.0450207144,-0.0242992,-0.0512985587,0.0279173553,-0.0642742,-0.0334506929,0.0669692159,0.0240977444,-0.00223330781,0.0035340609,0.0273019671,-0.042
0355089,0.00951039512,0.0369071253,-0.0114566712,0.0268442761,-0.0680790916,0.093645364,-0.0670209825,-0.0118308933,0.0512493029,0.0372889452,-0.097853452,-0.0900082737,0.000475601584,0.0852474943,0.0702580512,0.0604681596,0.0181875732,0.0353608802,0.107898802,-0.020102283,0.127937973,-0.0304531362,0.027359169,-0.0426012538,-0.18722938,0.0118591515,-0.0162597187,0.076915212,-0.0327949077,-0.0150794284,-0.012617101,-0.0486724563,-0.00720037147,0.0281194896,-0.00927174557,-0.035107743,-0.0114448303,0.0261171851,0.0657774,0.0287979208,0.0377273075,-0.0133055691,0.00259780418,0.0280126166,0.00226550736,-0.0329276361,-0.0624877959,0.0838453323,-0.0382638201,-0.0379554629,0.0931681395,-0.0650517717,-0.00807021186,-0.0411833301,-0.0181588605,-0.0337772,0.0340428054,-0.0515089966,0.0788559,-0.0139547503,0.0716609,0.0195527915,0.0879508853,-0.0486497842,-0.0261932295,-0.0284969099,-0.0803705305,-0.00554771116,-0.0485168546,-0.118322894,-0.00429766951,-0.0592155978,0.0616180934,-2.57615189e-33,0.180445179,-0.0680550337,0.00672602095,0.0606214851,0.0976018086,0.0191470161,0.0243674498,0.10021738,0.0316511653,0.0886515453,0.00493130693,0.0803146511,-0.0365655608,-0.112197742,0.0133931348,0.0129120192,0.0258666985,-0.0653172359,0.056789469,-0.00234458642,-0.0555559434,0.015143333,-0.0284009166,0.00808381476,-0.0171454232,-0.0544834659,0.0282964967,-0.0324391425,0.0100439508,-0.0575333796,-0.0442775898,-0.00380633702,-0.00157593749,-0.039747674,-0.0605684,0.0551547967,-0.00835766178,-0.0311399177,0.017943494,0.0311389938,0.0334365331,-0.0431456603,-0.00318896235,-0.0283167362,-0.0234446414,0.021555528,-0.0174043961,-0.0651953295,-0.0309972782,0.0215819106,0.0116195921,0.0315252058,-0.0388435684,0.065880917,-0.0112401992,-0.00884662755,0.00454388,0.0496965908,-0.0522938706,0.0579095632,-0.0414180905,0.058839906,-0.0032678633,-0.0225830115,-0.0216255095,-0.0107437922,-0.0165638831,0.0784634501,0.0633835346,-0.0326286033,0.0205901638,-0.0887152,-0.0118591497,0.0399153717,0.017313698
3,-0.11658635,-0.0837596804,0.022702422,0.0342835672,-0.0765177086,-0.0364477895,-0.0388797484,-0.00506030722,0.0562029704,0.00842982344,0.00860879291,0.0440128632,0.00267004245,0.0113677988,-0.0444276184,0.0167592466,-0.0560447574,-0.0211058501,-0.0105939489,0.0526761524,-3.28363043e-8,-0.0394373201,-0.06284751,0.00941727869,0.114079416,0.0497311503,-0.00309879868,0.0341236,0.123353027,-0.0260487609,-0.0583149418,0.0293672904,-0.00690054428,0.0786150098,0.0166235138,0.0173619166,0.0679017529,0.00652776659,-0.0362875536,-0.0287146103,-0.0469377078,-0.0203419682,0.0336826481,-0.019795049,-0.0130497487,0.00362700527,0.0222900473,0.00331193884,0.0669610426,-0.10506095,-0.0362477191,0.0678471178,-0.00246920274,-0.0629017,0.031316936,0.0763968751,0.0349495336,0.03701986,-0.0240127575,0.0431582294,0.0214637164,-0.0396364108,-0.0612524934,0.0200236645,-0.0307636093,0.0193338394,0.0346818455,-0.0563787,0.0292163305,0.049440641,0.0346936919,0.0547613651,0.0251639839,-0.0485375375,-0.0115103517,0.00470818812,-0.013961493,0.0454141349,0.0281176493,0.0755603,-0.0431908332,0.140430063,0.0270475149,0.00777886901,-0.0391872264
# Text of a single chunk. Its embedding is printed in full, presumably so it can be
# compared with the reference values pasted in the comment above - TODO confirm.
text="""lase levels. The kidneys quickly move extra amylasefrom the blood into the urine. Urine levels rise six to 10hours after blood levels and stay high longer. Urine isusually collected throughout a 2- or 24-hour time period.Results are usually available the same day.
Preparation
In most cases, no special preparation is necessary for"""
# Embed the text with the model loaded in step 05 and print the resulting vector.
embedded_query = embeddings.embed_query(text)
print(embedded_query)

In [None]:
# 08. Vector search
# 0.00100201822,-0.0879787,-0.0121345911,0.000141251716,-0.0305802971,-0.0635476932,-0.00554865832,0.0155437924,0.0310709625,-0.0551753715,-0.0272236336,0.0335728712,-0.0554190911,0.0753775463,-0.0810973197,0.0584557615,-0.0399320163,-0.0538939722,-0.0225356463,-0.0228182543,0.0983394831,-0.0244991,-0.0298304707,0.0355674885,0.0172437858,-0.0290708039,0.00247312849,-0.00694054458,0.00934407953,-0.00675766775,0.0333429798,-0.0124766016,-0.0394415967,0.0457644,-0.0280858278,-0.0427881218,-0.0202028789,-0.0481561273,-0.04873503,-0.0477370508,0.0865190178,-0.150480494,-0.059747465,0.0476341136,-0.0901464596,0.0377526321,-0.0655688867,-0.0507967845,-0.0195009783,0.00771954097,-0.0589987785,-0.047139097,-0.0696333125,0.0627996847,0.0106975567,0.0732211769,0.0296002813,-0.0103422487,0.0250978488,-0.13682501,-0.127584696,0.0958755761,-0.00386591605,-0.0296135414,0.0230881833,0.0138041703,-0.00896615628,-0.0221488737,0.0524216853,-0.0445768125,-0.0183927678,-0.00776625,0.0319657773,-0.00278262165,-0.0487341136,-0.0595503077,-0.0783658326,-0.00741075212,-0.0194750428,-0.0124614071,0.0757125393,0.0245649051,0.0455409102,0.0816792771,-0.0166955534,-0.00727742072,-0.0399827473,0.0791423321,-0.0169981532,0.0228592027,0.0322342776,0.0220089331,-0.0803845599,-0.0361780152,0.0310536,-0.0731141567,0.0447544456,-0.0957708806,0.0532552712,-0.0466087721,-0.0489482,0.0236297511,0.00609217,-0.00254002307,-0.049574744,0.0289640799,0.0697844326,0.0402957909,-0.0338448025,0.0117008,0.0293432549,-0.0149991047,0.0869607,-0.0211856663,0.0100638177,0.123417199,0.0284997802,0.0490373857,-0.0340036042,-0.0103725092,0.0208557956,0.0284522176,0.104265921,-0.0161122531,0.0369163714,0.163942218,0.021986492,2.20114697e-33,0.00960067566,0.00104967237,-0.033008378,0.0417723432,-0.00391326286,0.0338333957,0.0128404535,0.0195410717,0.0746719539,-0.0779181048,-0.0450207144,-0.0242992,-0.0512985587,0.0279173553,-0.0642742,-0.0334506929,0.0669692159,0.0240977444,-0.00223330781,0.0035340609,0.0273019671,-0.042
0355089,0.00951039512,0.0369071253,-0.0114566712,0.0268442761,-0.0680790916,0.093645364,-0.0670209825,-0.0118308933,0.0512493029,0.0372889452,-0.097853452,-0.0900082737,0.000475601584,0.0852474943,0.0702580512,0.0604681596,0.0181875732,0.0353608802,0.107898802,-0.020102283,0.127937973,-0.0304531362,0.027359169,-0.0426012538,-0.18722938,0.0118591515,-0.0162597187,0.076915212,-0.0327949077,-0.0150794284,-0.012617101,-0.0486724563,-0.00720037147,0.0281194896,-0.00927174557,-0.035107743,-0.0114448303,0.0261171851,0.0657774,0.0287979208,0.0377273075,-0.0133055691,0.00259780418,0.0280126166,0.00226550736,-0.0329276361,-0.0624877959,0.0838453323,-0.0382638201,-0.0379554629,0.0931681395,-0.0650517717,-0.00807021186,-0.0411833301,-0.0181588605,-0.0337772,0.0340428054,-0.0515089966,0.0788559,-0.0139547503,0.0716609,0.0195527915,0.0879508853,-0.0486497842,-0.0261932295,-0.0284969099,-0.0803705305,-0.00554771116,-0.0485168546,-0.118322894,-0.00429766951,-0.0592155978,0.0616180934,-2.57615189e-33,0.180445179,-0.0680550337,0.00672602095,0.0606214851,0.0976018086,0.0191470161,0.0243674498,0.10021738,0.0316511653,0.0886515453,0.00493130693,0.0803146511,-0.0365655608,-0.112197742,0.0133931348,0.0129120192,0.0258666985,-0.0653172359,0.056789469,-0.00234458642,-0.0555559434,0.015143333,-0.0284009166,0.00808381476,-0.0171454232,-0.0544834659,0.0282964967,-0.0324391425,0.0100439508,-0.0575333796,-0.0442775898,-0.00380633702,-0.00157593749,-0.039747674,-0.0605684,0.0551547967,-0.00835766178,-0.0311399177,0.017943494,0.0311389938,0.0334365331,-0.0431456603,-0.00318896235,-0.0283167362,-0.0234446414,0.021555528,-0.0174043961,-0.0651953295,-0.0309972782,0.0215819106,0.0116195921,0.0315252058,-0.0388435684,0.065880917,-0.0112401992,-0.00884662755,0.00454388,0.0496965908,-0.0522938706,0.0579095632,-0.0414180905,0.058839906,-0.0032678633,-0.0225830115,-0.0216255095,-0.0107437922,-0.0165638831,0.0784634501,0.0633835346,-0.0326286033,0.0205901638,-0.0887152,-0.0118591497,0.0399153717,0.017313698
3,-0.11658635,-0.0837596804,0.022702422,0.0342835672,-0.0765177086,-0.0364477895,-0.0388797484,-0.00506030722,0.0562029704,0.00842982344,0.00860879291,0.0440128632,0.00267004245,0.0113677988,-0.0444276184,0.0167592466,-0.0560447574,-0.0211058501,-0.0105939489,0.0526761524,-3.28363043e-8,-0.0394373201,-0.06284751,0.00941727869,0.114079416,0.0497311503,-0.00309879868,0.0341236,0.123353027,-0.0260487609,-0.0583149418,0.0293672904,-0.00690054428,0.0786150098,0.0166235138,0.0173619166,0.0679017529,0.00652776659,-0.0362875536,-0.0287146103,-0.0469377078,-0.0203419682,0.0336826481,-0.019795049,-0.0130497487,0.00362700527,0.0222900473,0.00331193884,0.0669610426,-0.10506095,-0.0362477191,0.0678471178,-0.00246920274,-0.0629017,0.031316936,0.0763968751,0.0349495336,0.03701986,-0.0240127575,0.0431582294,0.0214637164,-0.0396364108,-0.0612524934,0.0200236645,-0.0307636093,0.0193338394,0.0346818455,-0.0563787,0.0292163305,0.049440641,0.0346936919,0.0547613651,0.0251639839,-0.0485375375,-0.0115103517,0.00470818812,-0.013961493,0.0454141349,0.0281176493,0.0755603,-0.0431908332,0.140430063,0.0270475149,0.00777886901,-0.0391872264
# Question to embed; the literal has no placeholders, so it is a plain string.
text = """What are Allergies?"""
# Embed the question with the model loaded in step 05 and print the resulting vector.
embedded_query = embeddings.embed_query(text)
print(embedded_query)