In [1]:
%pwd

'c:\\Users\\Lenovo ThinkPad X280\\Downloads\\End-to-end-Medical-Chatbot-Generative-AI\\research'

In [2]:
import os
import re

import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

# Move from the `research/` folder up to the project root.
# Guarded so that re-running this cell does not climb one directory higher
# on every execution (a bare os.chdir("../") is not idempotent).
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Lenovo ThinkPad X280\\Downloads\\End-to-end-Medical-Chatbot-Generative-AI'

### **I. Load PDF & Split**

In [5]:
def load_pdf_file(data):
    """Load every PDF found directly inside the directory `data`.

    Args:
        data: Path of the directory to scan for files matching ``*.pdf``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory, with
        each matching file read through ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [6]:

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (as returned by `load_pdf_file`).
        chunk_size: Target chunk size passed to the splitter. Defaults to 500,
            the value this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [None]:
# Resolve the data folder relative to the project root instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
# At this point the working directory is the project root, because an
# earlier cell moved out of `research/`.
project_dir = os.getcwd()
data_dir = os.path.join(project_dir, 'Data')

print("Trying to access:", data_dir)
if not os.path.exists(data_dir):
    # Nothing is loaded in this case: `extracted_data` and `text_chunks`
    # stay undefined, so later cells that need them cannot run.
    print(f"Directory {data_dir} does not exist!")
else:
    print(f"Directory {data_dir} exists!")
    extracted_data = load_pdf_file(data=data_dir)
    text_chunks = text_split(extracted_data)
    # Printed label is Indonesian for "Number of text chunks".
    print("Jumlah potongan teks:", len(text_chunks))

Trying to access: C:\Users\Lenovo ThinkPad X280\Downloads\End-to-end-Medical-Chatbot-Generative-AI\Data
Directory C:\Users\Lenovo ThinkPad X280\Downloads\End-to-end-Medical-Chatbot-Generative-AI\Data exists!
Jumlah potongan teks: 5859


### **II. Download Embedding & Create FAISS**

In [8]:
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embedding model used to vectorise text.

    Args:
        model_name: Identifier of the embedding model to load. Defaults to
            ``sentence-transformers/all-MiniLM-L6-v2``, the model this
            notebook has always used.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for `model_name`.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [9]:
# Instantiate the embedding model once; `embeddings` is reused by the FAISS cells below.
embeddings = download_hugging_face_embeddings()
# Sanity check: embed a sample string and report the length of the returned vector.
query_result = embeddings.embed_query("Hello world")
# Printed label is Indonesian for "Embedding dimension".
print("Dimensi embedding:", len(query_result))

  return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


Dimensi embedding: 384


In [10]:
# Build a FAISS vector store from the text chunks using `embeddings`, then
# save it to the local "faiss_index" path (relative to the current working directory).
# NOTE(review): this re-embeds every chunk on each run — consider skipping the
# build when a saved index already exists.
vectorstore = FAISS.from_documents(text_chunks, embeddings)
vectorstore.save_local("faiss_index")

In [11]:
# Reload the FAISS index that the previous cell saved to "faiss_index".
# SECURITY: `allow_dangerous_deserialization=True` is an explicit opt-in to
# unsafe deserialization (NOTE(review): presumably pickle-based — confirm in
# the LangChain docs). Only load index folders this notebook created itself,
# never ones obtained from an untrusted source.
vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
# Retriever configured with k=5, i.e. five results requested per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

### **III. Load LLM from Hugging Face**

In [None]:
# Use the first GPU when CUDA is available, otherwise run on the CPU (-1).
# A hard-coded device=0 always requests GPU 0, even on machines without one.
qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=0 if torch.cuda.is_available() else -1,
)

def hf_llm(prompt):
    """Generate a completion for `prompt` with the module-level `qa_pipeline`.

    The pipeline is called with sampling disabled and at most 256 new tokens.

    Args:
        prompt: The full prompt text to send to the model.

    Returns:
        The ``'generated_text'`` entry of the first item the pipeline returns.
    """
    generation_kwargs = {"max_new_tokens": 256, "do_sample": False}
    first_result = qa_pipeline(prompt, **generation_kwargs)[0]
    return first_result['generated_text']

Device set to use cpu


### **IV. Create Prompt**

In [13]:
# Prompt template with two placeholders: `{context}` (retrieved document text)
# and `{input}` (the user question). The adjacent string literals below are
# concatenated into a single template string.
system_prompt = (
    "You are a medical assistant specialized in dermatology. "
    "Using only the context provided below, answer the user question precisely. "
    "Do not use prior knowledge or guess. If the answer is not in the context, reply: 'Sorry, I couldn't find the answer in the documents.' "
    "Answer in maximum 3 short sentences."
    "\n\n"
    "Context:\n{context}"
    "\n\nQuestion: {input}"
)

# Chat-style template built from the same system prompt plus a human turn.
# NOTE(review): `{input}` appears both in `system_prompt` and in the human
# message, so the question would be rendered twice. In the visible cells
# `custom_hf_chain` fills `system_prompt` directly and `prompt` is not used.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

### **V. Cleaning Helper**

In [14]:
def clean_text(text):
    """Normalise whitespace in `text`.

    Newlines become spaces, leading and trailing whitespace is removed, and
    every remaining run of whitespace is collapsed to a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    flattened = text.replace('\n', ' ')
    trimmed = flattened.strip()
    return re.sub(r'\s+', ' ', trimmed)

### **VI. Chat History**

In [15]:
# In-memory log of the conversation; one dict per question/answer exchange.
chat_history = []


def save_chat_history(user_input, response):
    """Append one question/answer pair to the module-level `chat_history`.

    Args:
        user_input: The question that was asked.
        response: The answer that was produced for it.
    """
    entry = dict(question=user_input, answer=response)
    chat_history.append(entry)

### **VII. Final Chain**

In [16]:
def custom_hf_chain(input_text):
    """Answer `input_text` using retrieved document context and the HF model.

    Steps: retrieve documents for the question, clean and join their text
    into one context string, fill the `system_prompt` template, generate an
    answer with `hf_llm`, and record the exchange via `save_chat_history`.

    Args:
        input_text: The user question.

    Returns:
        The answer text returned by `hf_llm`.
    """
    docs = retriever.invoke(input_text)
    context = "\n\n".join(clean_text(doc.page_content) for doc in docs)

    # Fill both placeholders in a single pass over the template. Chained
    # str.replace calls would re-scan the already inserted context, so a
    # literal "{input}" inside a retrieved chunk would be overwritten with
    # the question. A function replacement also avoids backslash-escape
    # processing of the inserted text.
    substitutions = {"{context}": context, "{input}": input_text}
    prompt_filled = re.sub(
        r"\{context\}|\{input\}",
        lambda match: substitutions[match.group(0)],
        system_prompt,
    )

    answer = hf_llm(prompt_filled)
    save_chat_history(input_text, answer)
    return answer

### **VIII. Data Inference**

In [18]:
# Smoke test: send one sample question through the full chain (retrieval,
# prompt filling, generation, history logging) and print the answer.
response = custom_hf_chain("What is Acne?")
print("\nAnswer:\n", response)


Answer:
 a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.
