In [1]:
from openai import OpenAI
from PIL import Image
from io import BytesIO
from dotenv import load_dotenv
from langchain_postgres import PGVector
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import BaseMessage
from typing import List

import pymupdf
import base64
import os


print("CWD:", os.getcwd())

CWD: c:\Users\gito2\Downloads\RAG


In [2]:
load_dotenv()

api_key = os.getenv("GROQ_API_KEY")
print("API Key loaded:", api_key is not None) 

API Key loaded: True


In [3]:
client = OpenAI(
    api_key=os.getenv("GROQ_API_KEY"),
    base_url="https://api.groq.com/openai/v1"
)

In [4]:
def encode_image(pdf_path):
    base64_list = []
    pdf_document = pymupdf.open(pdf_path)
    for page in pdf_document:
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        omage_bytes = buffered.getvalue()
        base64_image = base64.b64encode(omage_bytes).decode('utf-8')
        base64_list.append(base64_image)
    return base64_list

In [5]:
def extract_pdf(base64_images):
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise ValueError("GROQ_API_KEY not found in environment variables.")

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.groq.com/openai/v1"
    )

    ocr_response = client.chat.completions.create(
        model="meta-llama/llama-4-maverick-17b-128e-instruct",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract the text from the image."},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_images}"}}
                ]
            }
        ],
        max_tokens=2000
    )

    return ocr_response.choices[0].message.content

In [6]:
path = "C:/Users/gito2/Downloads/RAG/data/pdf/PIIS0022522319384089.pdf"

In [7]:
base64_list = encode_image(path)
base64_list

['/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAMYAkADASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigA

In [8]:
def pdf_to_text(pdf_path, txt_output_path=None):
    doc = pymupdf.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    
    if txt_output_path:
        with open(txt_output_path, "w", encoding="utf-8") as f:
            f.write(text)
    
    return text

In [10]:
text = pdf_to_text("data/pdf/PIIS0022522319384089.pdf", "data/txt/PIIS0022522319384089.txt")
text1 = pdf_to_text("data/pdf/TAH2015.pdf", "data/txt/TAH2015.txt")
text2 = pdf_to_text("data/pdf/Living_With_a_Total_Artificial_Heart.pdf","data/txt/Living_With_a_Total_Artificial_Heart.txt")
text3 = pdf_to_text("data/pdf/Artificial_hearts.pdf", "data/txt/Artificial_hearts.txt")
text4 = pdf_to_text("data/pdf/16877accepted_Version.pdf", "data/txt/16877accepted_Version.txt")


In [11]:
print(text4)

 
The ongoing quest for the first total artificial heart as
destination therapy
Citation for published version (APA):
Vis, A., Arfaee, M., Khambati, H., Slaughter, M. S., Gummert, J. F., Overvelde, J. T. B., & Kluin, J. (2022). The
ongoing quest for the first total artificial heart as destination therapy. Nature Reviews. Cardiology, 19(12), 813-
828. https://doi.org/10.1038/s41569-022-00723-8
DOI:
10.1038/s41569-022-00723-8
Document status and date:
Published: 01/12/2022
Document Version:
Accepted manuscript including changes made at the peer-review stage
Please check the document version of this publication:
• A submitted manuscript is the version of the article upon submission and before peer-review. There can be
important differences between the submitted version and the official published version of record. People
interested in the research are advised to contact the author for the final version of the publication, or visit the
DOI to the publisher's website.
• The final author ver

In [12]:
loader = DirectoryLoader("data/txt", glob="**/*.txt")
docs = loader.load()
docs

[Document(metadata={'source': 'data\\txt\\16877accepted_Version.txt'}, page_content="The ongoing quest for the first total artificial heart as\n\ndestination therapy\n\nCitation for published version (APA):\n\nVis, A., Arfaee, M., Khambati, H., Slaughter, M. S., Gummert, J. F., Overvelde, J. T. B., & Kluin, J. (2022). The\n\nongoing quest for the first total artificial heart as destination therapy. Nature Reviews. Cardiology, 19(12), 813-\n\n828. https://doi.org/10.1038/s41569-022-00723-8\n\nDOI:\n\n10.1038/s41569-022-00723-8\n\nDocument status and date:\n\nPublished: 01/12/2022\n\nDocument Version:\n\nAccepted manuscript including changes made at the peer-review stage\n\nPlease check the document version of this publication:\n\nA submitted manuscript is the version of the article upon submission and before peer-review. There can be\n\nimportant differences between the submitted version and the official published version of record. People\n\ninterested in the research are advised to co

In [13]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 100
)

In [14]:
chunks = text_splitter.split_documents(docs)
chunks

[Document(metadata={'source': 'data\\txt\\16877accepted_Version.txt'}, page_content='The ongoing quest for the first total artificial heart as\n\ndestination therapy\n\nCitation for published version (APA):\n\nVis, A., Arfaee, M., Khambati, H., Slaughter, M. S., Gummert, J. F., Overvelde, J. T. B., & Kluin, J. (2022). The\n\nongoing quest for the first total artificial heart as destination therapy. Nature Reviews. Cardiology, 19(12), 813-\n\n828. https://doi.org/10.1038/s41569-022-00723-8\n\nDOI:\n\n10.1038/s41569-022-00723-8\n\nDocument status and date:\n\nPublished: 01/12/2022'),
 Document(metadata={'source': 'data\\txt\\16877accepted_Version.txt'}, page_content='DOI:\n\n10.1038/s41569-022-00723-8\n\nDocument status and date:\n\nPublished: 01/12/2022\n\nDocument Version:\n\nAccepted manuscript including changes made at the peer-review stage\n\nPlease check the document version of this publication:\n\nA submitted manuscript is the version of the article upon submission and before peer

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings 

embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [16]:
question = "the use of artificial heart ?" 
question_embedding = embedding.embed_query(question)

In [17]:
len(question_embedding)

1024

In [19]:
openai_api_key = os.getenv("OPENAI_API_KEY")  # now you can access it

In [34]:
from langchain_openai import OpenAI

llm = OpenAI(
    api_key=api_key,
    base_url="https://api.groq.com/openai/v1"
)

In [27]:
from langchain.vectorstores import Chroma

vectorstore = Chroma.from_documents(chunks, embedding=embedding, persist_directory="chroma_store")
retriever = vectorstore.as_retriever()

In [29]:
vectorstore = Chroma(
    persist_directory="chroma_db",
    embedding_function=embedding
)

  vectorstore = Chroma(


In [36]:
from langchain import HuggingFacePipeline
from transformers import pipeline

llm_pipeline = pipeline("text-generation", model="google/flan-t5-base", device=0)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)


config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'Dots1ForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconH1ForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'Gemma3nForConditionalGeneration', 'Gemma3nForCausa

In [38]:
query = "What is artificial heart ?"
response = qa.run(query)

print(response)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Qual Health Res. 2007;17:730Y743.

What’s New and Important

h This is the first study examining the experiences of

patients living with the total artificial heart to provide

clinicians and researchers with insight into these

patients’ unique experiences.

h It is important to recognize the psychological

aspects of being technologically dependent on an

artificial organ.

Living With a Total Artificial Heart E7

Intern. Organs 7, 378–387 (1961).

16. Liotta, D. et al. Artificial heart in the chest: preliminary report. Trans. Am. Soc. Artif. Intern. Organs 7,318–322 (1961).

17. Atsumi, K. et al. Artificial heart incorporated in the chest. Trans. Am. Soc. Artif. Intern. Organs 9, 292–298 (1963).

18. Pierce, W. S. et al. Total heart replacement by a single intrathoracic blood pump. J. Surg. Res. 5, 387–394 (1965).

Intern

Saya mengusulkan judul  artificial heart karena judul ini terus mengalami perkembangan dari teknologi pembuatan artificial heart atau biasa disebut jantung buatan saya menggunakan model embbedding sentence transformer karena cepat dan ringan untuk real-time aplikasi.vector database nya saya menggunakan chroma karena tidak memperlukan database eksternal,model dapat update knowledge dengan menambah dokumen atau update Dokumen Hapus lalu Tambah lagi.kekurangan dari model kurang akurat dalam menjawab pertanyaan dan hanya dapat pada dimasukan dokumen sedikit
