In [190]:
from langchain_community.llms import HuggingFaceHub
from langchain_community.chat_models import ChatHuggingFace
from langchain.prompts import ChatPromptTemplate
import os
import shutil
import fitz
from llama_index.core.node_parser import SentenceSplitter

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from IPython.display import display
import openai
import base64
from openai import OpenAI
from langchain import PromptTemplate
from tqdm import tqdm
import pandas as pd


In [None]:
os.environ["OPENAI_API_KEY"] = None

In [192]:
def clear_output_folder(folder_path):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
        print(f'Existing folder "{folder_path}" and its contents have been removed.')
    os.makedirs(folder_path, exist_ok=True)
    print(f'Folder "{folder_path}" is ready for new content.')

In [193]:
def extract_images_and_text(pdf_path, images_output_folder, text_output_file):
    os.makedirs(images_output_folder, exist_ok=True)

    doc = fitz.open(pdf_path)

    with open(text_output_file, "w", encoding="utf-8") as txt_file:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num) 

            text = page.get_text()
            txt_file.write(f"--- Page {page_num + 1} ---\n")
            txt_file.write(text)
            txt_file.write("\n\n")

            image_list = page.get_images(full=True)
            if image_list:
                for img_index, img in enumerate(image_list, start=1):
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                    image_filename = f"page{page_num + 1}_img{img_index}.{image_ext}"
                    image_path = os.path.join(images_output_folder, image_filename)
                    with open(image_path, "wb") as img_file:
                        img_file.write(image_bytes)
            else:
                print(f"No images found on page {page_num + 1}.")
        
        print(f"\nText extraction complete. Saved to '{text_output_file}'.")
        print(f"Image extraction complete. Images saved in '{images_output_folder}'.")

In [194]:
pdf_path = 'files/demo4.pdf'
images_output_folder = 'extracted_content'
text_output_file = f'{images_output_folder}/pages.txt'
clear_output_folder(images_output_folder)
extract_images_and_text(pdf_path, images_output_folder, text_output_file)

Existing folder "extracted_content" and its contents have been removed.
Folder "extracted_content" is ready for new content.
No images found on page 2.

Text extraction complete. Saved to 'extracted_content/pages.txt'.
Image extraction complete. Images saved in 'extracted_content'.


In [203]:
openai_client = OpenAI()

image_folder = 'extracted_content'
supported_formats = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
caption_pairs = []

for image_name in os.listdir(image_folder):
    if image_name.lower().endswith(supported_formats):
        image_path = os.path.join(image_folder, image_name)

        with open(image_path, "rb") as image_file:
            base64_image = base64.b64encode(image_file.read()).decode('utf-8')

        messages=[
            {
            "role": "system",
            "content": "You are an image reading expert. Tell me what you find from this image."
            },
            {
            "role": "user",
            "content": [
                {"type": "text", "text": "Explain this image."},
                {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                },
                },
            ],
            }
        ]

        response = openai_client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=messages,
            max_tokens=300,
        )
        summary = response.choices[0].message.content
        caption_pairs.append((summary, image_path))
        print(f"----------{image_path}---------\n")
        print(summary)
        print('\n')


----------extracted_content/page3_img1.jpeg---------

The image shows three close-up views of a person's mouth, featuring different oral lesions:

1. **Top Image**: The inside of the lip with small, raised bumps that appear to be inflamed.
2. **Middle Image**: The surface of the tongue with a small ulcer, indicated by an arrow.
3. **Bottom Image**: The lower lip with rounded indentations, possibly due to trauma or habitual biting.

These images appear to illustrate various types of oral mucosal conditions, possibly related to minor injuries or irritations such as aphthous ulcers or trauma from biting. It's recommended to consult a healthcare professional for an accurate diagnosis and treatment.


----------extracted_content/page5_img1.jpeg---------

This image shows a close-up of a person's lips. There is a noticeable scar on the upper lip, likely from a repaired cleft lip. Adjacent to the lips, there is a tattoo or marking with Greek letters. The skin texture and color are normal, asi

In [196]:
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

pdf_text = read_text_file(text_output_file)

In [197]:
def chunk_text(text, chunk_size=500, chunk_overlap=0):
    text_splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_text(text)
    print(f"Total text chunks created: {len(chunks)}")
    return chunks

pdf_chunks = chunk_text(pdf_text)

Total text chunks created: 3


In [None]:
def create_text_documents(chunks):
    text_documents = []
    for i, chunk in enumerate(chunks):
        cleaned_chunk = chunk.replace('\n', '')
        doc_id = f"text_{i}" 
        text_documents.append({
            "id": doc_id,
            "text": cleaned_chunk,
        })
    return text_documents

def create_image_documents(caption_pairs):
    caption_documents = []
    for _, (caption, image_path) in enumerate(caption_pairs):
        doc_id = image_path 
        caption_documents.append({
            "id": doc_id,
            "text": caption,
        })
    return caption_documents

pdf_docs = create_text_documents(pdf_chunks)
caption_docs = create_image_documents(caption_pairs)
docs = pdf_docs + caption_docs

[{'id': 'text_0', 'text': "--- Page 1 ---1.Normal Anatomic Variants3Fig. 1. Linea alba.Fig. 2. Normal pigmentation of thegingiva.Fig. 3. Leukoedema of the buccalmucosa.--- Page 2 ---2. Developmental AnomaliesFordyce's GranulesFordyce's granules are a developmental anomalycharacterized by collections of heterotopic seba-ceous glands in the oral mucosa. Clinically, thereare many small, slightly raised whitish-yellowspots that are well circumscribed and rarelycoalesce, forming plaques (Fig. 4). They occurmost often in the mucosal surface of the upper lip,commissures, and the buccal mucosa adjacent tothe molar teeth in a symmetrical bilateral pattern.They are a frequent finding in about 80% ofpersons of both sexes. These granules are asymp-tomatic and come to the patient's attention bychance. With advancing age, they may becomemore prominent but should not be a cause forconcern.The differential diagnosis includes lichen planus,candidosis, and leukoplakia.Laboratory test.Histopathologic exa

In [199]:
# notebook_login()

In [None]:
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = None
API_TOKEN = None

llm = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
    
)

chat_model = ChatHuggingFace(llm=llm)


QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.

Requirements:
- The factoid question must be answerable with a specific, concise piece of factual information from the context.
- The question must be in a style that could be typed into a search engine (i.e., do not mention "the passage" or "context" explicitly).
- The answer must directly address the question without additional commentary.

The context is given as follows:
{context}

Format your output **exactly** as follows (without extra text):
Factoid question: <your factoid question>
Answer: <the specific, concise answer>
"""

QA_generation_prompt = ChatPromptTemplate.from_template(QA_generation_prompt)
QA_generation_agent = QA_generation_prompt | chat_model

In [211]:
N_GENERATIONS = (
    5
)

print(f"Generating {N_GENERATIONS} QA couples...")
outputs = []
for context in tqdm(docs):
    for _ in range(N_GENERATIONS):
        output_QA_couple = QA_generation_agent.invoke({"context": context["text"]}).content

        QA_pairs = output_QA_couple.split("Factoid question: ")[2:]
        for QA_pair in QA_pairs:
            question = QA_pair.split("Answer:")[0]
            answer = QA_pair.split("Answer:")[1]

            outputs.append(
                {
                    "context": context["text"],
                    "question": question,
                    "answer": answer,
                    "source_doc": context["id"],
                }
            )


Generating 5 QA couples...


100%|██████████| 9/9 [00:11<00:00,  1.26s/it]


In [212]:
df = pd.DataFrame(outputs)
df_unique = df.drop_duplicates(subset=["question"], keep="first").reset_index(drop=True)
pd.set_option('display.max_colwidth', None)
display(df_unique)

Unnamed: 0,context,question,answer,source_doc
0,"--- Page 1 ---1.Normal Anatomic Variants3Fig. 1. Linea alba.Fig. 2. Normal pigmentation of thegingiva.Fig. 3. Leukoedema of the buccalmucosa.--- Page 2 ---2. Developmental AnomaliesFordyce's GranulesFordyce's granules are a developmental anomalycharacterized by collections of heterotopic seba-ceous glands in the oral mucosa. Clinically, thereare many small, slightly raised whitish-yellowspots that are well circumscribed and rarelycoalesce, forming plaques (Fig. 4). They occurmost often in the mucosal surface of the upper lip,commissures, and the buccal mucosa adjacent tothe molar teeth in a symmetrical bilateral pattern.They are a frequent finding in about 80% ofpersons of both sexes. These granules are asymp-tomatic and come to the patient's attention bychance. With advancing age, they may becomemore prominent but should not be a cause forconcern.The differential diagnosis includes lichen planus,candidosis, and leukoplakia.Laboratory test.Histopathologic examinationsupports the clinical diagnosis.Treatment. Surgical removal is recommended.Congenital Lip PitsCongenital lip pits represent a rare developmentalmalformation that may occur alone or in combina-tion with commissural pits, cleft lip, or cleftpalate. Clinically, they present as bilateral orunilateral depressions at the vermilion border ofthe lower lip (Fig. 6). A small amount of mucoussecretion may accumulate at the depth of the pit.The lip may be enlarged and swollen.Treatment of choice is surgical excision, but onlyfor esthetic purposes.Treatment. No treatment is required.Oral HairHair and hair follicles are extremely unusualwithin the oral cavity. Only five cases have beenreported so far. There is no satisfactory explana-tion for the occurrence of oral hair although adevelopmental anomaly is the most likely possibil-ity. All reported patients have been white males.",What are Fordyce's granules?\n,Fordyce's granules are a developmental anomaly characterized by collections of sebaceous glands appearing as yellowish-white papules on the buccal mucosa and mucosal surface of the lips.\n\n,text_0
1,"--- Page 1 ---1.Normal Anatomic Variants3Fig. 1. Linea alba.Fig. 2. Normal pigmentation of thegingiva.Fig. 3. Leukoedema of the buccalmucosa.--- Page 2 ---2. Developmental AnomaliesFordyce's GranulesFordyce's granules are a developmental anomalycharacterized by collections of heterotopic seba-ceous glands in the oral mucosa. Clinically, thereare many small, slightly raised whitish-yellowspots that are well circumscribed and rarelycoalesce, forming plaques (Fig. 4). They occurmost often in the mucosal surface of the upper lip,commissures, and the buccal mucosa adjacent tothe molar teeth in a symmetrical bilateral pattern.They are a frequent finding in about 80% ofpersons of both sexes. These granules are asymp-tomatic and come to the patient's attention bychance. With advancing age, they may becomemore prominent but should not be a cause forconcern.The differential diagnosis includes lichen planus,candidosis, and leukoplakia.Laboratory test.Histopathologic examinationsupports the clinical diagnosis.Treatment. Surgical removal is recommended.Congenital Lip PitsCongenital lip pits represent a rare developmentalmalformation that may occur alone or in combina-tion with commissural pits, cleft lip, or cleftpalate. Clinically, they present as bilateral orunilateral depressions at the vermilion border ofthe lower lip (Fig. 6). A small amount of mucoussecretion may accumulate at the depth of the pit.The lip may be enlarged and swollen.Treatment of choice is surgical excision, but onlyfor esthetic purposes.Treatment. No treatment is required.Oral HairHair and hair follicles are extremely unusualwithin the oral cavity. Only five cases have beenreported so far. There is no satisfactory explana-tion for the occurrence of oral hair although adevelopmental anomaly is the most likely possibil-ity. All reported patients have been white males.",How are congenital lip pits treated?\n,"Treatment for congenital lip pits is surgical excision, but only for esthetic purposes as they are generally harmless.\n\n",text_0
2,"--- Page 1 ---1.Normal Anatomic Variants3Fig. 1. Linea alba.Fig. 2. Normal pigmentation of thegingiva.Fig. 3. Leukoedema of the buccalmucosa.--- Page 2 ---2. Developmental AnomaliesFordyce's GranulesFordyce's granules are a developmental anomalycharacterized by collections of heterotopic seba-ceous glands in the oral mucosa. Clinically, thereare many small, slightly raised whitish-yellowspots that are well circumscribed and rarelycoalesce, forming plaques (Fig. 4). They occurmost often in the mucosal surface of the upper lip,commissures, and the buccal mucosa adjacent tothe molar teeth in a symmetrical bilateral pattern.They are a frequent finding in about 80% ofpersons of both sexes. These granules are asymp-tomatic and come to the patient's attention bychance. With advancing age, they may becomemore prominent but should not be a cause forconcern.The differential diagnosis includes lichen planus,candidosis, and leukoplakia.Laboratory test.Histopathologic examinationsupports the clinical diagnosis.Treatment. Surgical removal is recommended.Congenital Lip PitsCongenital lip pits represent a rare developmentalmalformation that may occur alone or in combina-tion with commissural pits, cleft lip, or cleftpalate. Clinically, they present as bilateral orunilateral depressions at the vermilion border ofthe lower lip (Fig. 6). A small amount of mucoussecretion may accumulate at the depth of the pit.The lip may be enlarged and swollen.Treatment of choice is surgical excision, but onlyfor esthetic purposes.Treatment. No treatment is required.Oral HairHair and hair follicles are extremely unusualwithin the oral cavity. Only five cases have beenreported so far. There is no satisfactory explana-tion for the occurrence of oral hair although adevelopmental anomaly is the most likely possibil-ity. All reported patients have been white males.",What is the treatment for oral hair?\n,"No treatment is required for oral hair as it is extremely unusual within the oral cavity, with only five cases reported so far.",text_0
3,"The buccal mucosa, gingiva, and tongue are thepreferred areas of hair growth.Oral hair presents as an asymptomatic blackhair 0.3-3.5 cm in length (Fig. 5). The patientsare usually anxious and nervous. The presence oforal hair and hair follicles may offer an explana-tion for the rare occurrence of keratoacanthomaintraorally.The differential diagnosis should be made fromtraumatically implanted hair and the presence ofhair in skin grafts after surgical procedures in theoral cavity.4--- Page 3 ---2. Developmental Anomalies5Fig. 4. Fordyce's granules in thebuccal mucosa.Fig. 5. Black hair on the tip of thetongue (arrow).Fig. 6. Congenital lip pits.",What are some common areas where oral hair grows?\n,"Buccal mucosa, gingiva, and tongue are common areas where oral hair grows.",text_1
4,"--- Page 4 ---62. Developmental AnomaliesFig. 7. Ankyloglossia.AnkyloglossiaCleft PalateAnkyloglossia, or tongue-tie, is a rare develop-mental disturbance in which the lingual frenum isshort or is attached close to the tip of the tongue(Fig. 7). In these cases the frenum is often thickand fibrous. Rarely, the condition may occur as aresult of fusion between the tongue and the floorof the mouth or the alveolar mucosa. The malfor-mation may cause speech difficulties.Treatment. Surgical clipping of the frenum cor-rects the problem.Cleft LipCleft palate is a developmental malformation dueto failure of the two embryonic palatal processesto fuse. The cause remains unknown, althoughheredity may play a role. Clinically, the patientsexhibit a defect at the midline of the palate thatmay vary in severity (Fig. 9). Bifid uvula repre-sents a minor expression of cleft palate and maybe seen alone or in combination with more severemalformations (Fig. 10).Cleft palate may occur alone or in combinationwith cleft lip. The incidence of cleft palate alonevaries between 0.29 and 0.56 per 1000 births. Itmay occur in the hard or soft palate or both.Serious speech, feeding, and psychologic prob-lems may occur.Cleft lip is a developmental malformation thatusually involves the upper lip and very rarely thelower lip (Fig. 8). It frequently coexists with cleftpalate and it rarely occurs alone. The incidence ofcleft lip alone or in combination with cleft palatevaries from 0.52 to 1.34 per 1000 births.The disorder may be unilateral or bilateral,complete or incomplete.Treatment. Plastic surgery as early as possiblecorrects the esthetic and functional problems.Treatment. Early surgical correction is recom-mended.--- Page 5 ---2. Developmental Anomalies7Fig. 8. Cleft lip.Fig. 9. Cleft palate.Fig. 10. Bifid uvula.",What is the incidence of cleft palate alone?\n,The incidence of cleft palate alone varies between 0.29 and 0.56 per 1000 births.,text_2
5,"The image shows three close-up views of a person's mouth, featuring different oral lesions:\n\n1. **Top Image**: The inside of the lip with small, raised bumps that appear to be inflamed.\n2. **Middle Image**: The surface of the tongue with a small ulcer, indicated by an arrow.\n3. **Bottom Image**: The lower lip with rounded indentations, possibly due to trauma or habitual biting.\n\nThese images appear to illustrate various types of oral mucosal conditions, possibly related to minor injuries or irritations such as aphthous ulcers or trauma from biting. It's recommended to consult a healthcare professional for an accurate diagnosis and treatment.","What conditions are illustrated in the top, middle, and bottom images of the given mouth image?\n","The top image shows Fordyce spots, a harmless condition of small, raised bumps on the lips. The middle image shows a small ulcer, possibly an aphthous ulcer. The bottom image displays rounded indentations, potentially caused by trauma or habitual biting. It is essential to consult a healthcare professional for accurate diagnosis and treatment.",extracted_content/page3_img1.jpeg
6,"This image shows a close-up of a person's lips. There is a noticeable scar on the upper lip, likely from a repaired cleft lip. Adjacent to the lips, there is a tattoo or marking with Greek letters. The skin texture and color are normal, aside from the scar and the marking.",What is the likely cause of the scar on this person's upper lip?\n,The scar on the upper lip is likely from a repaired cleft lip.,extracted_content/page5_img1.jpeg
7,"This image shows a close-up of the inside of a person's mouth, focusing on the tongue and the frenulum, which is a small fold of tissue connecting the underside of the tongue to the floor of the mouth. The teeth and gums are also visible. It appears to illustrate oral anatomy, particularly highlighting the connection between the tongue and the mouth floor.",What is the anatomical structure that connects the underside of the tongue to the floor of the mouth?\n,The frenulum,extracted_content/page4_img1.jpeg
8,"The image shows three stages of tobacco-induced changes in the oral cavity:\n\n1. **Top Image**: Leukoplakia, a white patch on the oral mucosa, possibly due to chronic irritation from tobacco use.\n2. **Middle Image**: Staining of the teeth and gums, likely from tobacco or heavy use of products like betel nut, characterized by brown to black discoloration.\n3. **Bottom Image**: A cleaner oral environment, suggesting a reversal or reduction in staining and irritation, possibly due to reduced exposure to irritants or improved oral hygiene.\n\nThe sequence illustrates the detrimental effects of tobacco on oral health and the potential for improvement with changes in habits.",What does the top image in the given context represent in terms of oral health?\n,"The top image represents leukoplakia, a white patch on the oral mucosa caused by chronic irritation from tobacco use.\n\n",extracted_content/page1_img1.jpeg
9,"The image shows three stages of tobacco-induced changes in the oral cavity:\n\n1. **Top Image**: Leukoplakia, a white patch on the oral mucosa, possibly due to chronic irritation from tobacco use.\n2. **Middle Image**: Staining of the teeth and gums, likely from tobacco or heavy use of products like betel nut, characterized by brown to black discoloration.\n3. **Bottom Image**: A cleaner oral environment, suggesting a reversal or reduction in staining and irritation, possibly due to reduced exposure to irritants or improved oral hygiene.\n\nThe sequence illustrates the detrimental effects of tobacco on oral health and the potential for improvement with changes in habits.",What does the middle image in the given context indicate about oral health?\n,"The middle image shows staining of the teeth and gums, likely from tobacco or heavy use of products like betel nut, characterized by brown to black discoloration.\n\n",extracted_content/page1_img1.jpeg
