In [1]:
from epo.tipdata.epab import EPABClient

In [2]:
epab = EPABClient(env="TEST")

In [3]:
epab.fields()

WidDatabaseFields(header='', input_data={'': [{'name': 'epab_doc_id', 'type': 'STRING', 'mode': 'REQUIRED', 'd…

In [4]:
statement = f"""
SELECT epab_doc_id, description
FROM `{epab.full_table_name}`
WHERE description.language="EN"
ORDER BY RAND()
LIMIT 5;"""

In [5]:
results = epab.sql_query(statement)

In [6]:
!pip install regex



In [7]:
import regex as re

In [12]:
import regex as re
import pandas as pd
from IPython.display import display
import html

# Example list of documents (each document is a dictionary)
docs = [{'epab_doc_id': item['epab_doc_id'], 'text': item['description']['text']} for item in results]

# List of keywords to search in headings (fuzzy matching will allow up to one error)
keywords = ["background", "field of invention"]

# Build a fuzzy matching regex pattern for the keywords.
# The regex module (not built-in re) allows fuzzy matching via {e<=1} (one error allowed).
keyword_pattern = r"(?:" + "|".join([f"({re.escape(kw)}){{e<=1}}" for kw in keywords]) + r")"

# Compile a regex pattern that finds a section starting with a heading
# that contains one of the keywords (using fuzzy matching).
# The pattern (with flags for case-insensitive, multiline and dot-all):
#   - Looks for a line starting at the beginning (^) that contains (some text + one keyword match)
#   - Then a newline followed by content (non-greedy) until a new section is encountered.
#     (A new section is assumed when a new line starts with a non-space character)
pattern = re.compile(
    r"(?ims)^(?P<section>(?P<heading>.*" + keyword_pattern + r".*)\n(?P<content>.*?))(?=^\S|\Z)"
)

extracted_docs = []
num_found = 0
total_docs = len(docs)

# Process each document
for doc in docs:
    text = doc['text']
    extracted_sections = []
    # Find all sections that match the pattern
    for match in pattern.finditer(text):
        section = match.group("section").strip()
        extracted_sections.append(section)
    # Combine sections (if more than one) with a double newline separator
    combined_text = "\n\n".join(extracted_sections)
    if combined_text:  # if at least one section was found, count this document as "found"
        num_found += 1
    extracted_docs.append({
        'epab_doc_id': doc['epab_doc_id'],
        'text': combined_text
    })

# Create a DataFrame with the extracted sections
df = pd.DataFrame(extracted_docs)

# Calculate and display the ratio of texts where at least one section was found
ratio = num_found / total_docs if total_docs > 0 else 0
print("Ratio of texts found:", ratio)

# Display the DataFrame
print("\nExtracted Sections DataFrame:")
print(df)


Ratio of texts found: 0.8

Extracted Sections DataFrame:
           epab_doc_id                                               text
0  EP0410387B220010711                                                   
1  EP0142374B119920826  <heading id="h0001"><u style="single">BACKGROU...
2  EP0035334A219810909  <heading id="h0001">INTRODUCTION</heading><p i...
3  EP1696584B120071226  <heading id="h0001"><b>BACKGROUND OF THE INVEN...
4  EP1151233B120050406  <heading id="h0001"><b>TECHNICAL FIELD</b></he...


In [14]:
import regex as re
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

# --- Step 1. Create the DataFrame with extracted sections and original text ---

# Example list of documents
docs = [{'epab_doc_id': item['epab_doc_id'], 'text': item['description']['text']} for item in results]


# Keywords to search for in headings (allowing fuzzy matching with up to one error)
keywords = ["background",  "prior art", "summary", "state of the art", "field of the invention", "technical field"]
keyword_pattern = r"(?:" + "|".join([f"({re.escape(kw)}){{e<=1}}" for kw in keywords]) + r")"

# Regex pattern: look for a heading containing one of the keywords and capture until the next heading.
pattern = re.compile(
    r"(?ims)^(?P<section>(?P<heading>.*" + keyword_pattern + r".*)\n(?P<content>.*?))(?=^\S|\Z)"
)

extracted_docs = []
for doc in docs:
    text = doc['text']
    extracted_sections = []
    for match in pattern.finditer(text):
        section = match.group("section").strip()
        extracted_sections.append(section)
    combined_text = "\n\n".join(extracted_sections)
    extracted_docs.append({
        'epab_doc_id': doc['epab_doc_id'],
        'extracted_text': combined_text,
        'original_text': text
    })

df = pd.DataFrame(extracted_docs)
total_docs = len(df)

# --- Step 2. Create the JupyterLab widget with two side-by-side displays and navigation buttons ---

# Global state variables.
current_index = 0
source_mode = False  # False: rendered view; True: source view (raw text with highlighted tags)

# Create two HTML widgets for the displays.
# Both are set to 50% width and a fixed height with a vertical scrollbar.
extracted_html = widgets.HTML(
    value="",
    layout=widgets.Layout(width='50%', height='300px', overflow_y='scroll')
)
original_html = widgets.HTML(
    value="",
    layout=widgets.Layout(width='100%', height='300px', overflow_y='scroll')
)

# Create a toggle button (placed above the right display).
toggle_button = widgets.Button(
    description="Switch to Source Display",
    layout=widgets.Layout(width='100%')
)

# Navigation buttons.
prev_button = widgets.Button(description="← Previous", layout=widgets.Layout(width='150px'))
next_button = widgets.Button(description="Next →", layout=widgets.Layout(width='150px'))

# Optional document indicator.
doc_indicator = widgets.HTML(value="")

# Function to update the displays based on the current document and display mode.
def update_text(index):
    if 0 <= index < total_docs:
        doc = df.iloc[index]
        # Left side: display extracted text (raw, preserving whitespace)
        extracted_html.value = f"<div style='white-space: pre-wrap;'>{doc['extracted_text'] or '(No extracted section found)'}</div>"
        
        # Right side: display original text.
        if source_mode:
            # Source view: escape HTML so that tags are visible.
            raw = doc['original_text']
            escaped = html.escape(raw)
            # Highlight HTML tags (non-greedy match) with a yellow background.
            highlighted = re.sub(r'(&lt;.*?&gt;)', r'<span style="background-color: yellow;">\1</span>', escaped)
            # Use a <pre> block to preserve formatting.
            original_html.value = f"<pre style='white-space: pre-wrap;'>{highlighted}</pre>"
        else:
            # Rendered view: display the original HTML.
            original_html.value = f"<div style='white-space: pre-wrap;'>{doc['original_text']}</div>"
        
        doc_indicator.value = f"<b>Document {index+1}/{total_docs}</b> (ID: {doc['epab_doc_id']})"

# Callback functions for the navigation and toggle buttons.
def on_prev_clicked(b):
    global current_index
    if current_index > 0:
        current_index -= 1
        update_text(current_index)

def on_next_clicked(b):
    global current_index
    if current_index < total_docs - 1:
        current_index += 1
        update_text(current_index)

def on_toggle_clicked(b):
    global source_mode
    source_mode = not source_mode
    toggle_button.description = "Switch to Rendered Display" if source_mode else "Switch to Source Display"
    update_text(current_index)

prev_button.on_click(on_prev_clicked)
next_button.on_click(on_next_clicked)
toggle_button.on_click(on_toggle_clicked)

# Assemble the layout.
# The right side now contains the toggle button above the original text display.
right_side = widgets.VBox([toggle_button, original_html], layout=widgets.Layout(width="50%"))
html_areas = widgets.HBox([extracted_html, right_side])
nav_buttons = widgets.HBox([prev_button, next_button])
widget_box = widgets.VBox([doc_indicator, html_areas, nav_buttons])

# Display the widget.
display(widget_box)

# Initialize with the first document (if available).
if total_docs > 0:
    update_text(current_index)

VBox(children=(HTML(value=''), HBox(children=(HTML(value='', layout=Layout(height='300px', width='50%')), VBox…