# Patent data extraction analysis

The purpose of this notebook is to test and compare different methods for extracting SDG-relevant text from patent descriptions.

In [1]:
!pip install autocorrect

Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.8/622.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25ldone
[?25h  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622364 sha256=d5c2ba240646037f456abe63124a9d845f1d2ca766f5d55fee33ce5057db478d
  Stored in directory: /home/jovyan/.cache/pip/wheels/5e/90/99/807a5ad861ce5d22c3c299a11df8cba9f31524f23ae6e645cb
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


In [2]:
from epo.tipdata.epab import EPABClient
epab = EPABClient(env="TEST")
import re
import pandas as pd
import ipywidgets as widgets
from ipywidgets import (
    HTML, IntText, Text, Dropdown, SelectMultiple, Checkbox, Button, VBox, HBox, Layout
)
from IPython.display import display
import pickle
import json
from autocorrect import Speller #TODO check if we do the spelling stuff here
import gzip

Patents data loading

First, we test different methods of loading representative sets of patents for further analysis.

In [3]:
# Log area: a text box that `log` appends to, wrapped in a VBox and shown immediately
log_output = widgets.Textarea(
    value='',
    placeholder='Logs will appear here.',
    description='Logs:',
    disabled=False,
    layout=Layout(width='100%', height='200px'),  # Adjust size as needed
)
log_widget = VBox([log_output])
display(log_widget)

def log(text):
    """
    Append `text` and a trailing newline to the log text area.
    """
    line = text + '\n'
    log_output.value = log_output.value + line

VBox(children=(Textarea(value='', description='Logs:', layout=Layout(height='200px', width='100%'), placeholde…

In [4]:
# Helper functions to load and save lists of dictionaries to a gzipped jsonl FILE
#-----------------------------
def save_list(data, filename):
    """
    Write a list of dictionaries to a gzip-compressed JSON Lines file.

    Parameters:
        data (list): Dictionaries to serialise; each becomes one JSON line.
        filename (str): Path of the gzip file to create or overwrite.
    """
    with gzip.open(filename, mode='wt', encoding='utf-8') as handle:
        # One JSON document per line, each terminated by a newline.
        handle.writelines(json.dumps(record) + "\n" for record in data)

def load_list(filename):
    """
    Read a gzip-compressed JSON Lines file back into a list.

    Parameters:
        filename (str): Path of the gzipped jsonl file.

    Returns:
        list: One parsed JSON value per line of the file, in file order.
    """
    with gzip.open(filename, mode='rt', encoding='utf-8') as handle:
        return [json.loads(row) for row in handle]


## REQUESTING PATENTS FROM EPO DATABASE

In [5]:
# Persist the request parameters so the same request can be looked up later
def save_data(params):
    """
    Write `params` as indented JSON to "<params['filename']>.conf" and log the path.
    """
    conf_filename = "{}.conf".format(params['filename'])
    with open(conf_filename, "w") as conf_file:
        json.dump(params, conf_file, indent=4)
    log(f"File saved to {conf_filename}")

In [6]:
def create_sql_request_random(params):
    """
    Build the SQL text that selects a random set of English descriptions.

    Reads params["nbr"] (row limit) and params["nbrextract"] (number of leading
    description characters to keep) and returns the statement as a string.
    """
    limit = params["nbr"]
    max_chars = params["nbrextract"]
    return f"""
          SELECT epab_doc_id, LEFT(description.text, {max_chars}) as description, publication.date as pubdate, publication.number as pubnbr
        FROM `{epab.full_table_name}`
        WHERE description.language="EN"
        ORDER BY RAND()
        LIMIT {limit};"""

def create_sql_request_ipc(params):
    """
    Build the SQL text for an IPC-stratified sample of English descriptions.

    The query flattens the `ipc` array, picks one random document per IPC
    symbol (rn = 1), and returns at most params["nbr"] of those rows in random
    order. Each description is truncated to params["nbrextract"] characters.

    Fixes over the previous version:
      * `LIMIT` is not allowed inside a window `OVER (...)` clause; the limit is
        now applied to the final result.
      * The description/publication columns are selected in the `flattened`
        CTE, so the sampling CTE can actually reference them.
      * Only English descriptions are sampled, like the other request builders.

    NOTE(review): "nbr" is interpreted as the total number of rows returned
    (one per IPC class) — confirm this is the intended meaning. A document
    listed under several IPC symbols can appear more than once.

    Parameters:
        params (dict): Must contain "nbr" and "nbrextract" (integer-like).

    Returns:
        str: The SQL statement.
    """
    # int() guards the values that are interpolated into the SQL text.
    num_files = int(params["nbr"])
    nbrextract = int(params["nbrextract"])
    query = f"""WITH flattened AS (
      SELECT
        epab_doc_id,
        ipc_struct.symbol AS ipc_class,
        LEFT(description.text, {nbrextract}) AS description,
        publication.date AS pubdate,
        publication.number AS pubnbr
      FROM `{epab.full_table_name}`,
           UNNEST(ipc) AS ipc_struct
      WHERE description.language="EN"
    ),
    stratified_sample AS (
      SELECT
        ipc_class,
        epab_doc_id,
        description,
        pubdate,
        pubnbr,
        ROW_NUMBER() OVER(PARTITION BY ipc_class ORDER BY RAND()) AS rn
      FROM flattened
    )
    SELECT
      ipc_class,
      epab_doc_id,
      pubdate, pubnbr, description
    FROM stratified_sample
    WHERE rn = 1
    ORDER BY RAND()
    LIMIT {num_files};"""
    return query

def create_sql_request_date(params):
    """
    Build the SQL script text for a publication-date-stratified sample.

    The script splits the span between the earliest publication date in the
    table and the current timestamp into 10 equal buckets, assigns every
    English-language document with a publication date to its bucket, and keeps
    up to num_files / 10 randomly ordered rows per bucket.

    Parameters:
        params (dict): Must contain "nbr" (interpolated as num_files) and
            "nbrextract" (number of leading description characters to keep).

    Returns:
        str: The SQL script (a DECLARE statement followed by the query).
    """
    nbrextract=params["nbrextract"]
    num_files=params["nbr"]
    # NOTE(review): the QUALIFY threshold is `num_files / 10`; for num_files < 10
    # this is below 1, so ROW_NUMBER() <= threshold never holds and no rows come
    # back — confirm whether a CEIL is wanted.
    # NOTE(review): DATE_DIFF is called on TIMESTAMP values — confirm the engine
    # accepts this or whether TIMESTAMP_DIFF is required.
    statement = f"""
   DECLARE num_files INT64 DEFAULT {num_files};

WITH DateRange AS (
  SELECT
    MIN(TIMESTAMP(PARSE_DATE('%Y%m%d', publication.date))) AS min_timestamp,
    CURRENT_TIMESTAMP() AS max_timestamp
  FROM
    `{epab.full_table_name}`
  WHERE publication.date IS NOT NULL -- Handle nulls
),
DateBuckets AS (
  SELECT
    TIMESTAMP(DATE_ADD(DATE(DateRange.min_timestamp), INTERVAL CAST(offset * (DATE_DIFF(DateRange.max_timestamp, DateRange.min_timestamp, DAY) / 10) AS INT64) DAY)) AS bucket_start,
    TIMESTAMP(DATE_ADD(DATE(DateRange.min_timestamp), INTERVAL CAST((offset + 1) * (DATE_DIFF(DateRange.max_timestamp, DateRange.min_timestamp, DAY) / 10) AS INT64) DAY)) AS bucket_end
  FROM
    DateRange,
    UNNEST(GENERATE_ARRAY(0, 9)) AS offset
),
GroupedData AS (
  SELECT
    epab_doc_id,
    LEFT(description.text, {nbrextract}) as description, publication.number as pubnbr,
    publication.date as pubdate,
    TIMESTAMP(PARSE_DATE('%Y%m%d', publication.date)) as publication_timestamp,
    bucket_start,
    bucket_end
  FROM
    `{epab.full_table_name}`
  JOIN
    DateBuckets
  ON
    TIMESTAMP(PARSE_DATE('%Y%m%d', publication.date)) >= bucket_start AND TIMESTAMP(PARSE_DATE('%Y%m%d', publication.date)) < bucket_end
  WHERE description.language="EN" AND publication.date IS NOT NULL
),
SampledData AS (
  SELECT
    epab_doc_id,
    description,
    pubdate, pubnbr,
    bucket_start
  FROM
    GroupedData
  QUALIFY ROW_NUMBER() OVER (PARTITION BY bucket_start ORDER BY RAND()) <= (num_files / 10)
)
SELECT
  epab_doc_id,
  description,
  pubdate, pubnbr
FROM
  SampledData;"""
    return statement

Enter the name of the file in which the raw patent descriptions returned by the database are saved. Saving them avoids sending the same request to the database repeatedly.

In [7]:
# main function to create and send the request to the database
def load_raw(params):
    """
    Build the SQL request for the chosen extraction method, run it, and save the result.

    Parameters:
        params (dict): Request parameters. Reads "extract_method" ("Random",
            "IPC" or "Date"), "filename" (target of save_list) and whatever the
            selected request builder needs ("nbr", "nbrextract").

    Returns:
        list: One dict per result row with keys 'id', 'original_text',
            'pubdate' and 'pubnbr'.

    Raises:
        ValueError: If "extract_method" has no request builder. Previously this
            surfaced as an unbound `statement` variable.
    """
    data_selec = params["extract_method"]
    if data_selec == "Random":
        statement = create_sql_request_random(params)
    elif data_selec == "IPC":
        statement = create_sql_request_ipc(params)
    elif data_selec == "Date":
        statement = create_sql_request_date(params)
    else:
        # Fail before touching the database when the method is not implemented.
        raise ValueError(
            f"Unsupported extraction method: {data_selec!r} "
            "(supported: 'Random', 'IPC', 'Date')"
        )
    results = epab.sql_query(statement)
    log("finished loading")
    #TODO add CPC and other interesting metrics parameters
    docs = [
        {
            'id': item['epab_doc_id'],
            'original_text': item['description'],
            'pubdate': item['pubdate'],
            'pubnbr': item['pubnbr'],
        }
        for item in results
    ]
    save_list(docs, params["filename"])
    return docs

In [8]:
# Shared style: a wider description column so the long labels fit
style = {'description_width': '150px'}  # Increase as needed

# Input widgets of the request form
file_name_widget = Text(
    value='', description='File Name:', placeholder='Enter file name', style=style
)
number_widget = IntText(value=10, description='Number of patents:', style=style)
numberextract_widget = IntText(value=5000, description='Number of characters:', style=style)
extraction_widget = Dropdown(
    options=["Random", "IPC", "Length", "Asian", "Date"],
    description='Extraction method:',
    style=style,
)

# Output area for the loaded table, and the button that starts the request
data_widget = widgets.Output()
start_button = Button(description='Start', button_style='success')

# Define the function to be executed when the button is clicked
def on_start_clicked(b):
    """
    Collect the form values, save them, run the request and display the result.

    The file name is validated first: with an empty name the request used to
    run against the database and only then fail when the result was saved.
    The output area is cleared before the new table is shown so that repeated
    clicks do not stack tables.

    Parameters:
        b: The clicked button (unused; required by the on_click signature).
    """
    filename = file_name_widget.value.strip()
    if not filename:
        log("Please enter a file name before starting.")
        return

    params = {
        "filename": filename,
        "nbr": number_widget.value,
        "nbrextract": numberextract_widget.value,
        "extract_method": extraction_widget.value
    }
    save_data(params)

    docs_frame = pd.DataFrame(load_raw(params))
    data_widget.clear_output()
    with data_widget:
        display(docs_frame)
    log("finished loading patents")

# Connect the button to its handler, then assemble and show the form
start_button.on_click(on_start_clicked)

form_items = VBox([
    file_name_widget,
    number_widget,
    numberextract_widget,
    extraction_widget,
    start_button,
    data_widget,
])
display(form_items)

VBox(children=(Text(value='', description='File Name:', placeholder='Enter file name', style=TextStyle(descrip…

In [9]:
# Keywords used by extract_text. Matching is a case-insensitive substring test (no fuzzy matching):
# keyword1 is looked up in <heading> titles, keyword2 in paragraph text when no heading matches.
keyword1 = ["background",  "prior art", "state of the art", "field of the invention", "technical field","summary","industrial applicability"]
keyword2=["background","herein described subject matter", "technology described herein", "subject of the invention", "belongs to the field", "invention is","invention relates to", "present invention refers to"]

The functions below extract the relevant sections of a description and split them into cleaned sentences.

In [17]:
import re


def clean_sentences(s):
    """
    Remove markup noise from a sentence that ends with a full stop.

    For strings ending in '.', removes "(...)" and "[...]" spans together with
    their delimiters, removes HTML/XML tags, and collapses whitespace runs to a
    single space. Strings that do not end with '.' are returned unchanged.
    """
    if s.endswith('.'):
        # Remove all content in parentheses or brackets, including the symbols
        s = re.sub(r'\([^)]*\)', '', s)  # remove ( ... )
        s = re.sub(r'\[[^\]]*\]', '', s)  # remove [ ... ]

        # Remove HTML tags
        s = re.sub(r'<[^>]+>', '', s)

        # Collapse multiple spaces and trim the ends
        s = re.sub(r'\s+', ' ', s).strip()
    return s


def extract_text(text, keyword1, keyword2, min_sentence_length=5):
    """
    Extract cleaned sentences from the relevant sections of a description.

    Sections are the text following each <heading ...> whose title contains one
    of `keyword1` (case-insensitive substring), up to the next <heading>. If no
    heading matches, the fallback takes every paragraph (split on "</p>") that
    contains one of `keyword2`, plus the paragraph after it.

    Fixes over the previous version:
      * Tags are replaced by spaces before the text is split into sentences.
        Previously a paragraph end such as ".</p>" was never a split point, so
        sentences of neighbouring paragraphs were merged and the last sentence
        of a section (ending in "</p>") was discarded.
      * A fallback paragraph is taken once even when it both matches a keyword
        and follows a matching paragraph.
      * The minimum word count is checked on the cleaned sentence, so tag
        attributes and bracketed references no longer count as words.
      * Empty or None `text` returns an empty list instead of raising.

    Parameters:
        text (str): Description markup.
        keyword1 (list): Keywords for heading-based extraction.
        keyword2 (list): Fallback keywords for paragraph-based extraction.
        min_sentence_length (int): Minimum number of words a sentence must have.

    Returns:
        list: Cleaned sentences ending with '.', in document order.
    """
    if not text:
        return []

    # Pattern to match headings and the content following them
    pattern = re.compile(r'(<heading[^>]*?>)(.*?)</heading>(.*?)(?=<heading|$)', re.DOTALL | re.IGNORECASE)

    heading_keywords = [kw.lower() for kw in keyword1]
    extracted_segments = [
        following_text.strip()
        for _tag, heading_text, following_text in pattern.findall(text)
        if any(kw in heading_text.lower() for kw in heading_keywords)
    ]

    if extracted_segments:
        combined_text = "\n".join(extracted_segments)
    else:
        # Fallback: paragraphs containing a keyword2 entry plus the paragraph after each
        paragraphs = text.split("</p>")
        paragraph_keywords = [kw.lower() for kw in keyword2]
        selected = {}  # dict used as an insertion-ordered set of paragraph indices
        for i, para in enumerate(paragraphs):
            if any(kw in para.lower() for kw in paragraph_keywords):
                selected[i] = None
                if i + 1 < len(paragraphs):
                    selected[i + 1] = None
        combined_text = "\n\n".join(paragraphs[i] for i in selected)

    # Turn tags into whitespace first so that paragraph boundaries become split points
    plain_text = re.sub(r'<[^>]+>', ' ', combined_text)

    sentences = []
    # Split into sentences using a simple punctuation rule
    for candidate in re.split(r'(?<=[.!?])\s+', plain_text):
        candidate = candidate.strip()
        if not candidate.endswith('.'):
            continue
        cleaned = clean_sentences(candidate)
        if len(cleaned.split()) >= min_sentence_length:
            sentences.append(cleaned)

    return sentences

def process_texts(texts, keyword1, keyword2, min_sentence_length=5):
    """
    Run extract_text on every record and return one row per extracted sentence.

    Args:
        texts (list of dict): Each record has an "id" and an "original_text".
            A missing or None "original_text" is treated as empty text instead
            of raising.
        keyword1 (list): Keywords for heading-based extraction.
        keyword2 (list): Fallback keywords for paragraph-based extraction.
        min_sentence_length (int): Minimum number of words a sentence must have.

    Returns:
        list of dict: Rows of the form {"id": ..., "sentence": ...}. A record
        without any extracted sentence yields a single row whose sentence is
        the empty string, so every id stays present in the output.
    """
    results = []
    for item in texts:
        source_text = item.get("original_text") or ""
        sentences = extract_text(source_text, keyword1, keyword2, min_sentence_length)
        if not sentences:
            # Placeholder row keeps the id visible downstream
            sentences = [""]
        results.extend({"id": item["id"], "sentence": sentence} for sentence in sentences)

    return results


def merge_sentence(processed_texts):
    """
    Join the sentences of each id into one newline-separated text.

    Args:
        processed_texts (list of dict): Rows with keys "id" and "sentence".

    Returns:
        list of dict: One {"id": ..., "text": ..., "status": ""} entry per id,
        in order of first appearance; "text" holds that id's sentences joined
        with "\n" in input order.
    """
    grouped = {}
    for row in processed_texts:
        grouped.setdefault(row["id"], []).append(row["sentence"])

    return [
        {"id": text_id, "text": "\n".join(parts), "status": ""}
        for text_id, parts in grouped.items()
    ]

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets

def compute_sentence_stats(df, plot_widget):
    """
    Compute statistics on the number of sentences per id and plot their distribution.

    Only non-empty sentences are counted. process_texts emits one row with an
    empty sentence for a document without any extracted sentence; counting that
    placeholder reported one sentence for such documents, so 'min' could never
    be 0.

    Parameters:
        df (pandas.DataFrame): DataFrame with columns 'id' and 'sentence'.
            An empty frame (or one lacking these columns) yields empty counts.
        plot_widget: Context manager the histogram is rendered in
            (an ipywidgets.Output in this notebook).

    Returns:
        tuple: (stats, plot_widget), where stats is a dict with the keys
            - 'mean': average number of sentences per id
            - 'min': minimum number of sentences of any id
            - 'max': maximum number of sentences of any id
            - 'square_mean': mean of the squared sentence counts per id
    """
    if df.empty or 'id' not in df.columns or 'sentence' not in df.columns:
        sentence_counts = pd.Series(dtype='int64')
    else:
        # True for real sentences, False for empty placeholders / missing values
        has_text = df['sentence'].fillna('').astype(str).str.strip().ne('')
        sentence_counts = has_text.groupby(df['id']).sum()

    # Compute the required statistics
    stats = {
        'mean': sentence_counts.mean(),
        'min': sentence_counts.min(),
        'max': sentence_counts.max(),
        'square_mean': np.mean(sentence_counts**2)
    }

    with plot_widget:
        plt.figure(figsize=(8, 5))
        # Plot the distribution (histogram) of sentence counts per id
        plt.hist(sentence_counts, bins=20, edgecolor='black')
        plt.title("Distribution of Sentence Counts per text_id")
        plt.xlabel("Number of Sentences")
        plt.ylabel("Frequency")
        plt.show()

    return stats, plot_widget

def analyze_sentence_data(data, plot_widget):
    """
    Convert `data` to a DataFrame and return compute_sentence_stats for it.
    """
    frame = pd.DataFrame(data)
    return compute_sentence_stats(frame, plot_widget)

def analyze_text_data(data,plot_widget):
    """
    Compute word-count statistics over the 'text' of each entry and plot a histogram.

    Parameters:
        data (list): A list of dictionaries, each with a 'text' string.
        plot_widget: Context manager the histogram is rendered in
            (an ipywidgets.Output in this notebook). Not used when `data`
            is empty.

    Returns:
        stats (dict): 'Nbr of entries', 'mean', 'square_mean', 'min' and 'max'
            of the per-entry word counts. For empty `data` the count is 0 and
            the other values are None (previously this raised a ValueError
            when taking the minimum of an empty array).
    """
    word_counts = np.array([len(entry['text'].split()) for entry in data], dtype=int)

    if word_counts.size == 0:
        # Nothing to summarise or plot
        return {
            'Nbr of entries': 0,
            'mean': None,
            'square_mean': None,
            'min': None,
            'max': None
        }

    # Compute statistics
    stats = {
        'Nbr of entries': int(word_counts.size),
        'mean': word_counts.mean(),
        'square_mean': np.mean(word_counts**2),
        'min': word_counts.min(),
        'max': word_counts.max()
    }

    with plot_widget:
        plt.figure(figsize=(8, 5))
        plt.hist(word_counts, bins=20, edgecolor='black')
        plt.title("Distribution of Word Counts")
        plt.xlabel("Number of Words")
        plt.ylabel("Frequency")
        plt.show()

    return stats



In [19]:
def merge_by_id(list1, list2):
    """
    Attach the source fields of `list2` to the entries of `list1`, matched on 'id'.

    Each output dict carries 'id' and 'text' from the `list1` entry and
    'pubdate', 'pubnbr' and 'original_text' from the `list2` entry with the
    same id (None for each when no such entry exists). Output order follows
    `list1`.
    """
    # One lookup per id holding the three source fields
    source_by_id = {
        src['id']: (src['original_text'], src['pubnbr'], src['pubdate'])
        for src in list2
    }
    not_found = (None, None, None)

    merged = []
    for entry in list1:
        original_text, pubnbr, pubdate = source_by_id.get(entry['id'], not_found)
        merged.append({
            'id': entry['id'],
            'text': entry['text'],
            'pubdate': pubdate,
            'pubnbr': pubnbr,
            'original_text': original_text,
        })

    return merged

## DATA extraction

This form configures the extraction process. The extraction parameters are saved in a ".ext" file, the extracted sentences in ".sen.gz", and the merged result of the extraction in ".dat.gz".

In [20]:
import os
import json
import ipywidgets as widgets
from IPython.display import display

# Create the left side: Update button and the list of parameter files
update_button = widgets.Button(description="Update")
file_list = widgets.Select(options=[], rows=10, description="Files:")

# Create the right side: inputs for the output filename, the raw source file and a
# free-text comment, plus the button that starts the extraction.
filename_input = widgets.Text(description="Filename:")
raw_source_input = widgets.Text(description="Raw source file:")
comment_input = widgets.Textarea(
    description="Comment:",
    layout=widgets.Layout(width='100%', height='80px')
)
select_button = widgets.Button(description="Extract")

def update_file_list(button):
    """
    Refresh the file selector with the '.ext' files of the current directory.

    `button` is not used; it is accepted so the function can be an on_click handler.
    """
    file_list.options = [name for name in os.listdir('.') if name.endswith('.ext')]

def load_file_content(change):
    """
    Fill the input widgets from the file selected in the file list.

    JSON content provides "filename", "raw_source_file" and "comment"
    (each defaulting to ""). Content that is not valid JSON is placed in the
    raw source input as-is, and the filename input is set to the selected
    file's name without its extension. On any error the message is printed
    and all three inputs are cleared.

    Args:
        change: The change object of the file_list.observe event; its `new`
            attribute is the selected file name.
    """
    selected_file = change.new
    if not selected_file:
        return

    try:
        with open(selected_file, 'r') as f:
            file_content = f.read()

        # Try JSON first; anything else is treated as raw text
        try:
            data = json.loads(file_content)
        except json.JSONDecodeError:
            raw_source_input.value = file_content
            filename_input.value = os.path.splitext(selected_file)[0]  # name without extension
            comment_input.value = ""
        else:
            filename_input.value = data.get("filename", "")
            raw_source_input.value = data.get("raw_source_file", "")
            comment_input.value = data.get("comment", "")

    except Exception as e:
        print(f"Error loading file: {e}")
        filename_input.value = ""
        raw_source_input.value = ""
        comment_input.value = ""

def on_select(button):
    """
    Run the extraction for the parameters entered in the form.

    Saves the parameters as JSON to "<filename>.ext", loads the raw source
    file, extracts sentences, and writes "<filename>.sen.gz" (one row per
    sentence) and "<filename>.dat.gz" (merged text per document joined with
    the source fields). Statistics are written to the result log and the
    histograms to the two plot areas.

    Changes over the previous version:
      * Nothing is run when the filename or raw source file is empty.
      * The plot areas are cleared first, so repeated runs do not stack figures.
      * analyze_sentence_data returns (stats, widget); only the stats are
        logged instead of the whole tuple including the widget repr.
      * A failure to save the parameter file is reported in the result log
        (the extraction still continues, as before).

    Parameters:
        button: The clicked button (unused; required by the on_click signature).
    """
    params = {
        "filename": filename_input.value,
        "raw_source_file": raw_source_input.value,
        "comment": comment_input.value,
        "selected_file": file_list.value
    }

    if not params["filename"] or not params["raw_source_file"]:
        log2("Please provide both a filename and a raw source file.")
        return

    save_filename = filename_input.value
    if not save_filename.endswith(".ext"):
        save_filename = save_filename + ".ext"

    try:
        with open(save_filename, "w") as f:
            json.dump(params, f, indent=4)
    except Exception as e:
        log2(f"Error saving file: {e}")

    texts = load_list(params["raw_source_file"])
    extracted = process_texts(texts, keyword1, keyword2)

    # Drop the figures of a previous run before drawing new ones
    plot_widget0.clear_output()
    plot_widget.clear_output()

    sentence_metrics, _ = analyze_sentence_data(extracted, plot_widget0)
    merged = merge_sentence(extracted)
    result = merge_by_id(merged, texts)
    metrics = analyze_text_data(merged, plot_widget)
    log2(str(sentence_metrics))
    save_list(extracted, params["filename"] + ".sen.gz")
    save_list(result, params["filename"] + ".dat.gz")
    metrics["filename"] = params["filename"] + ".met"
    #save_data(metrics)
    log2(str(metrics))
    log2("finished")

# Result log and the two plot areas shown below the form
log_output2 = widgets.Textarea(
    value='',
    placeholder='Results will appear here.',
    description='Logs:',
    disabled=False,
    layout=widgets.Layout(width='80%', height='100px'),  # Adjust size as needed
)
log_output2.value = ""
plot_widget = widgets.Output()
plot_widget0 = widgets.Output()

# Connect the widgets to their handlers
update_button.on_click(update_file_list)
file_list.observe(load_file_content, names='value')  # fires when file_list.value changes
select_button.on_click(on_select)

# Two-column form (file list | inputs) above the log and the plots
left_box = widgets.VBox([update_button, file_list])
right_box = widgets.VBox([filename_input, raw_source_input, comment_input, select_button])
ui1 = widgets.HBox([left_box, right_box])
ui = widgets.VBox([ui1, log_output2, plot_widget0, plot_widget])

# Display the UI in the notebook.
display(ui)

def log2(text):
    """
    Append `text` and a trailing newline to the result log text area.
    """
    line = text + '\n'
    log_output2.value = log_output2.value + line

# Populate the file list once at start-up
update_file_list(None)



VBox(children=(HBox(children=(VBox(children=(Button(description='Update', style=ButtonStyle()), Select(descrip…

In [14]:
def filter_fails(data):
    """
    Return the items whose 'extracted_text' is shorter than 50 or longer than
    800 characters; a missing 'extracted_text' counts as empty.

    NOTE(review): a second `filter_fails` (filtering on the 'difficult' flag) is
    defined in the review-UI cell of this notebook; whichever cell runs last
    replaces the other.
    """
    def _is_fail(item):
        length = len(item.get('extracted_text', ''))
        return not (50 <= length <= 800)

    return list(filter(_is_fail, data))

In [15]:
from bs4 import BeautifulSoup
def html_to_text(html):
    """
    Return the text of `html` with the markup removed, parsed with
    BeautifulSoup's "html.parser".
    """
    soup = BeautifulSoup(html, "html.parser")
    # separator=" " puts a single space between the extracted text fragments
    return soup.get_text(separator=" ")

In [16]:
import os
import ipywidgets as widgets
from IPython.display import display

def filter_fails(data):
    """
    Return the items of `data` whose 'difficult' entry is present and truthy.

    NOTE(review): no cell visible in this notebook sets a 'difficult' key —
    confirm where it is expected to come from.
    """
    return list(filter(lambda item: item.get('difficult'), data))

# Collect the files of the current directory whose name ends with "dat.gz"
# (the check has no leading dot). The list is built once, when this cell runs.
files = [f for f in os.listdir('.') if f.endswith('dat.gz')]

# Widget for listing files
file_selector = widgets.Select(
    options=files,
    description='Files:',
    rows=10,
    layout=widgets.Layout(width='200px')
)
# Publication number and date of the record currently shown
patentnbr_input = widgets.Text(description="Patent number:")
patentdate_input = widgets.Text(description="Patent date:")

# Review verdict for the record currently shown
status_dropdown = widgets.Dropdown(
    options=['OK', 'WRONG', 'NONE'],
    value='NONE',
    description='Status:'
)

# Text area to display the contents of the ".ext" parameter file that belongs to the selected data file
met_conf_text = widgets.Textarea(
    description='Config:',
    layout=widgets.Layout(width='500px', height='200px')
)

save_button = widgets.Button(description="SAVE")

# Callback: show the ".ext" parameter file that belongs to the selected data file
def on_file_select(change):
    """
    Display the content of the parameter file matching the selected data file.

    The parameter file name is the selected name with '.dat.gz' replaced by
    '.ext'. When that file does not exist, a "File not found" message is shown
    instead. Nothing happens when no file is selected.
    """
    selected_file = file_selector.value
    if not selected_file:
        return

    met_conf_filename = selected_file.replace('.dat.gz', '.ext')
    if not os.path.exists(met_conf_filename):
        content = f"File not found: {met_conf_filename}"
    else:
        with open(met_conf_filename, 'r') as f:
            content = f.read()
    met_conf_text.value = str(content)

file_selector.observe(on_file_select, names='value')

# State shared by the review callbacks
list_data = []  # list of dictionaries loaded by load_list
filtered_data = []
current_index = 0  # position of the record currently shown

# Button to load the list (using load_list function)
load_button = widgets.Button(description='Load')

# Checkbox to toggle difficult-only filtering
difficult_only_checkbox = widgets.Checkbox(value=False, description='Display difficult files only')

# Text areas showing the extracted text and the original text of the current record
extracted_text_area = widgets.Textarea(
    description='Extracted:', layout=widgets.Layout(width='500px', height='200px')
)
original_text_area = widgets.Textarea(
    description='Original:', layout=widgets.Layout(width='500px', height='200px')
)

def get_display_data():
    """
    Return the records to browse: filter_fails(list_data) when the
    'difficult only' checkbox is ticked, otherwise list_data itself.
    """
    if difficult_only_checkbox.value:
        return filter_fails(list_data)
    return list_data

# Update the text areas based on the current index
def update_text_areas():
    """
    Show the record at `current_index` of the displayed data in the review widgets.

    Text widgets only accept strings and the status dropdown only accepts one
    of its options, so missing (None) or non-string publication fields and an
    unknown status no longer raise: the fields are converted to text and the
    status falls back to 'NONE'. When there is no record to show, all fields
    are cleared and the status is reset to 'NONE'.
    """
    def _as_text(value):
        return '' if value is None else str(value)

    display_data = get_display_data()
    if display_data and 0 <= current_index < len(display_data):
        record = display_data[current_index]
        extracted_text_area.value = _as_text(record.get('text', ''))
        original_text_area.value = _as_text(record.get('original_text', ''))
        patentnbr_input.value = _as_text(record.get('pubnbr'))
        patentdate_input.value = _as_text(record.get('pubdate'))
        status = record.get('status', 'NONE')
        status_dropdown.value = status if status in status_dropdown.options else 'NONE'
    else:
        extracted_text_area.value = ''
        original_text_area.value = ''
        patentnbr_input.value = ''
        patentdate_input.value = ''
        status_dropdown.value = 'NONE'

def on_load_button_click(b):
    """
    Load the selected data file, show its first record and its parameter file.

    Without a selection, 'No file selected' is shown in the extracted-text area
    and the original-text area is cleared.
    """
    global list_data, current_index

    selected_file = file_selector.value
    if not selected_file:
        extracted_text_area.value = 'No file selected'
        original_text_area.value = ''
        return

    list_data = load_list(selected_file)
    current_index = 0
    update_text_areas()

    # Show the matching ".ext" parameter file, or a note that it is missing
    met_conf_filename = selected_file.replace('.dat.gz', '.ext')
    if not os.path.exists(met_conf_filename):
        content = f"File not found: {met_conf_filename}"
    else:
        with open(met_conf_filename, 'r') as f:
            content = f.read()
    met_conf_text.value = str(content)

def on_status_change(change):
    """
    Store the newly selected status on the record currently shown.

    Events other than a change of 'value' are ignored, as is a current index
    outside the displayed data.
    """
    if change['name'] != 'value' or change['type'] != 'change':
        return

    display_data = get_display_data()
    if display_data and 0 <= current_index < len(display_data):
        display_data[current_index]['status'] = change['new']

status_dropdown.observe(on_status_change, names='value')

load_button.on_click(on_load_button_click)

def _on_difficult_toggle(change):
    """
    Restart at the first record when the 'difficult only' filter is toggled.

    The displayed list changes with the filter; keeping the old index could
    point past the end of the new list and leave the view blank.
    """
    global current_index
    current_index = 0
    update_text_areas()

difficult_only_checkbox.observe(_on_difficult_toggle, names='value')

# Navigation buttons for moving through rows
left_arrow = widgets.Button(description='<')
right_arrow = widgets.Button(description='>')

def on_left_arrow_click(b):
    """Move to the previous record, unless the first one is already shown."""
    global current_index
    if current_index <= 0:
        return
    current_index -= 1
    update_text_areas()

def on_right_arrow_click(b):
    """Move to the next record, unless the last displayed one is already shown."""
    global current_index
    last_index = len(get_display_data()) - 1
    if current_index >= last_index:
        return
    current_index += 1
    update_text_areas()

# Connect each navigation button to its handler
for _button, _handler in ((left_arrow, on_left_arrow_click), (right_arrow, on_right_arrow_click)):
    _button.on_click(_handler)

# Define the button's click event handler
def on_save_click(b):
    """
    Save the records currently being browsed to "<selected file>.check".

    The previous version referenced a module-level `display_data` that is never
    defined (it only exists as a local variable in other callbacks), so every
    click raised a NameError. The records are now taken from
    get_display_data(), which includes the statuses set during review.
    Without a selected file nothing is saved.

    NOTE(review): with the 'difficult only' filter ticked, only the filtered
    records are written — confirm whether the full list should be saved instead.
    """
    if not file_selector.value:
        extracted_text_area.value = 'No file selected'
        return
    save_list(get_display_data(), file_selector.value + ".check")

# Connect the SAVE button to its handler
save_button.on_click(on_save_click)

# Build the rows of the review UI
file_list_box = widgets.VBox([file_selector, load_button])
file_and_config = widgets.HBox([file_list_box, met_conf_text])
text_areas_box = widgets.HBox([extracted_text_area, original_text_area])
nav_buttons = widgets.HBox([left_arrow, right_arrow])

# Stack the rows into one layout and show it
ui = widgets.VBox([
    file_and_config,
    difficult_only_checkbox,
    patentdate_input,
    patentnbr_input,
    status_dropdown,
    text_areas_box,
    nav_buttons,
    save_button,
])
display(ui)


VBox(children=(HBox(children=(VBox(children=(Select(description='Files:', layout=Layout(width='200px'), option…