<a href="https://colab.research.google.com/github/aihyvari/DLTK/blob/master/SPC_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install PyPDF2
!pip install gradio
!pip install fuzzywuzzy
!pip install nltk

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp31

In [2]:
import PyPDF2
import re
import gradio as gr
from fuzzywuzzy import fuzz
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to preprocess text (e.g., lemmatization)
def preprocess_text(text):
    tokens = word_tokenize(text, language="finnish")
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    return " ".join(lemmatized_tokens)

# Function to find keywords in the text with fuzzy matching
def find_keywords_in_text(text, keywords, threshold=80):
    found_keywords = []
    processed_text = preprocess_text(text)

    for keyword in keywords:
        lemmatized_keyword = preprocess_text(keyword)
        words = word_tokenize(processed_text, language="finnish")

        for word in words:
            if fuzz.ratio(word, lemmatized_keyword) >= threshold:
                found_keywords.append(keyword)
                break

    return found_keywords

# Function to extract text between sections from the PDF
def extract_section_from_pdf(pdf_path, start_section_pattern, end_section_pattern):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)

        section_text = ""
        section_found = False

        for i in range(num_pages):
            page = reader.pages[i]
            text = page.extract_text()

            if section_found:
                end_match = end_section_pattern.search(text)
                if end_match:
                    section_text += text[:end_match.start()]
                    break
                else:
                    section_text += text
            else:
                start_match = start_section_pattern.search(text)
                if start_match:
                    section_found = True
                    section_text += text[start_match.start():]
                    end_match = end_section_pattern.search(section_text)
                    if end_match:
                        section_text = section_text[:end_match.start()]
                        break

        return section_text.strip() if section_found else None

# Function to get the first 100 words from a text
def get_sample_text(text, word_limit=100):
    words = text.split()
    sample = " ".join(words[:word_limit])
    if len(words) > word_limit:
        sample += " ..."
    return sample

# Define the function that will be connected to the Gradio interface
def gradio_keyword_search(pdf_path, keyword_input):
    # Adjusted pattern to allow for optional spaces between the section number and the heading
   # start_section_pattern = re.compile(
    #r'4\s*[\.\-]?\s*8\s*[\.\-]?\s*(?:Haittavaikutukset|Adverse\s*Effects)?',
    #re.IGNORECASE | re.DOTALL
    #)
    start_section_pattern = re.compile(
    r'4\.8[\t\s\u00A0\u200B\u200C\u200D\uFEFF\-–—]*(H\s*aittavaikutukset)',
    re.IGNORECASE | re.DOTALL
    )
    end_section_pattern = re.compile(
    #r'4\.9\s*[\.\-]?\s*(?:Yliannostus|Overdose)?',
    r'4\.9[\t\s\u00A0\u200B\u200C\u200D\uFEFF\-–—]*(Y\s*liannostus)',
    re.IGNORECASE | re.DOTALL
    )

    extracted_text = extract_section_from_pdf(pdf_path, start_section_pattern, end_section_pattern)

    # Convert the user's keyword input into a list of keywords
    keywords = [kw.strip() for kw in keyword_input.split(",")]

    if extracted_text:
        sample_text = get_sample_text(extracted_text, word_limit=100)
        found_keywords = find_keywords_in_text(extracted_text, keywords)
        # Determine not found keywords
        not_found_keywords = [kw for kw in keywords if kw not in found_keywords]

        # Make the keywords stand out in the output
        return (f"Sample of extracted text (first 100 words):\n\n{sample_text}\n\n"
                "Note: Only a sample of the text is shown.\n\n"
                f"**Found Keywords:**\n\n" + ", ".join(found_keywords) + "\n\n"
                f"**Not Found Keywords:**\n\n" + ", ".join(not_found_keywords) + "\n\n"
                )
    else:
        return "Section 4.8 Haittavaikutukset not found."

# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_keyword_search,  # The function to be called
    inputs=[
        gr.File(),  # PDF file input
        gr.Textbox(lines=2, placeholder="Enter keywords separated by commas", label="Keywords (comma-separated)"),
    ],
    outputs="text",  # Output type
    title="Finnish SPC PDF Section Extraction and Keyword Search",  # Title of the app
    description="Upload a Finnish SPC PDF to extract the section between '4.8 Haittavaikutukset' " \
                "and '4.9 Yliannostus', then search for user-defined keywords with fuzzy matching. " \
                "Separate multiple keywords using commas."
)

# Launch the interface
iface.launch()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://0fda2c44add6ce1aa5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




From url

In [None]:
def gradio_keyword_search(pdf_input, keywords):
    # Add your processing logic here to handle both file and URL inputs
    pass

def process_input(pdf_input, keywords):
    if isinstance(pdf_input, str):
        # Handle PDF URL input
        pdf_url = pdf_input
        # Add logic to download and process the PDF from the URL
    else:
        # Handle PDF file input
        pdf_file = pdf_input
        # Add logic to process the uploaded PDF file
    return gradio_keyword_search(pdf_input, keywords)

iface = gr.Interface(
    fn=process_input,  # The function to be called
    inputs=[
        gr.Radio(choices=["Upload PDF File", "Enter PDF URL"], label="Input Method", type="value"),
        gr.File(),  # PDF file input
        gr.Textbox(placeholder="Enter the URL of the PDF", label="PDF URL"),
        gr.Textbox(lines=2, placeholder="Enter keywords separated by commas", label="Keywords (comma-separated)")
    ],
    outputs="text",  # Output type
    title="Finnish SPC PDF Section Extraction and Keyword Search",  # Title of the app
    description="Upload a Finnish SPC PDF or enter its URL to extract the section between '4.8 Haittavaikutukset' " \
                "and '4.9 Yliannostus', then search for user-defined keywords with fuzzy matching. " \
                "Separate multiple keywords using commas."
)

# Launch the interface
iface.launch(debug=True)



Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://083f423ee2ff9c0e32.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1935, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1520, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 8

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://a59af989ef10a83d4d.gradio.live
Killing tunnel 127.0.0.1:7861 <> https://ae00377f91deaf6a42.gradio.live
Killing tunnel 127.0.0.1:7862 <> https://a4d068548b797fd725.gradio.live
Killing tunnel 127.0.0.1:7863 <> https://f03b63735da8ed58b7.gradio.live
Killing tunnel 127.0.0.1:7864 <> https://b1521cf6a94c9a069f.gradio.live
Killing tunnel 127.0.0.1:7865 <> https://2ae2c9f4a92309232f.gradio.live
Killing tunnel 127.0.0.1:7866 <> https://e03ac53353f4a5552b.gradio.live
Killing tunnel 127.0.0.1:7867 <> https://d45e8cb196db786842.gradio.live
Killing tunnel 127.0.0.1:7868 <> https://1295ef5d7bcbaac006.gradio.live
Killing tunnel 127.0.0.1:7869 <> https://8b21df015facea0d74.gradio.live
Killing tunnel 127.0.0.1:7870 <> https://2e32edb631459da90c.gradio.live
Killing tunnel 127.0.0.1:7871 <> https://895a2ddcc1ddde1653.gradio.live
Killing tunnel 127.0.0.1:7872 <> https://a810a61e2a567d4c6f.gradio.live
Killing 

