# Patent data extraction analysis

The purpose of this notebook is to test and compare different methods for extracting SDG-relevant data from patents.

In [59]:
!pip install autocorrect



In [60]:
from epo.tipdata.epab import EPABClient
epab = EPABClient(env="PROD")
import re
import pandas as pd
import ipywidgets as widgets
from ipywidgets import (
    HTML, IntText, Text, Dropdown, SelectMultiple, Checkbox, Button, VBox, HBox, Layout
)
from IPython.display import display
import pickle
import json
from autocorrect import Speller #TODO check if we do the spelling stuff here
import gzip

Patents data loading

First we test here different methods to load representative sets of patents for further analysis

In [4]:
# Create the log display area
log_output = widgets.Textarea(
    value='',
    placeholder='Logs will appear here.',
    description='Logs:',
    disabled=False,
    layout=widgets.Layout(width='100%', height='200px')  # Adjust size as needed
)

# Create a VBox to hold the log display
log_widget = widgets.VBox([log_output])

# Display the log widget
display(log_widget)

def log(text):
    """Append `text`, followed by a newline, to the log Textarea widget."""
    # Plain string concatenation: a non-string argument raises TypeError.
    appended = log_output.value + (text + '\n')
    log_output.value = appended

VBox(children=(Textarea(value='', description='Logs:', layout=Layout(height='200px', width='100%'), placeholde…

In [5]:
# Helper functions to load and save lists of dictionaries to a gzipped jsonl FILE
#-----------------------------
def save_list(data, filename):
    """
    Write dictionaries to a gzip-compressed JSON Lines (jsonl) file.

    Every item is serialised with json.dumps and written on its own line,
    in iteration order. The file is opened in 'wt' mode, so existing content
    is replaced.

    Parameters:
        data (list): Dictionaries to be saved.
        filename (str): Target path, used exactly as given (no extension is added).
    """
    with gzip.open(filename, 'wt', encoding='utf-8') as handle:
        handle.writelines(json.dumps(record) + "\n" for record in data)

def load_list(filename):
    """
    Read a gzipped JSON Lines file whose first line is a metadata record.

    The first line is always interpreted as metadata; every following line is
    parsed as one data record. Lines that are not valid JSON are reported with
    a printed warning and skipped.

    Parameters:
        filename (str): Path of the gzipped jsonl file.

    Returns:
        tuple: (metadata, records) where
            - metadata (dict): Parsed first line, or {} when the file is empty
              or its first line cannot be decoded.
            - records (list): Parsed objects of all remaining valid lines.
    """
    header = {}
    records = []

    with gzip.open(filename, 'rt', encoding='utf-8') as handle:
        head_line = handle.readline()
        if head_line:
            try:
                header = json.loads(head_line)
            except json.JSONDecodeError as e:
                print(f"Warning: Could not decode first line as metadata: {e}")

        # The handle is now positioned after the header line.
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Warning: Could not decode line as JSON, skipping: {raw_line.strip()} - {e}")
            else:
                records.append(parsed)

    return header, records

def load_list2(filename):
    """
    Read every line of a gzipped JSON Lines file as a data record.

    Unlike load_list, no line is set aside as metadata. Lines that are not
    valid JSON are reported with a printed warning and skipped.

    Parameters:
        filename (str): Path of the gzipped jsonl file.

    Returns:
        list: Parsed objects, one per valid line, in file order.
    """
    records = []

    with gzip.open(filename, 'rt', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Warning: Could not decode line as JSON, skipping: {raw_line.strip()} - {e}")
            else:
                records.append(parsed)

    return records



## REQUESTING PATENTS FROM EPO DATABASE

In [6]:
# Define the save_data function to save the request parameters for further use
def save_data(params):
    """
    Store the request parameters as indented JSON in "<filename>.conf".

    Parameters:
        params (dict): Must contain the key 'filename'; the whole dictionary
            is written to the configuration file and the path is logged.
    """
    target = "{}.conf".format(params['filename'])
    with open(target, "w") as conf_file:
        json.dump(params, conf_file, indent=4)
    log("File saved to " + target)

In [7]:
def create_sql_request_sdg(params):
    """
    Build the SQL statement selecting every document whose description
    contains the phrase 'sustainable development goal' (case-insensitive).

    Parameters:
        params (dict): Accepted for symmetry with the other request builders;
            not used by this query.

    Returns:
        str: SQL text yielding epab_doc_id, description, pubdate and pubnbr.
    """
    table = epab.full_table_name
    return f"""
          SELECT epab_doc_id,  description.text as description, publication.date as pubdate, publication.number as pubnbr
        FROM `{table}`
        WHERE LOWER(description.text) LIKE '%sustainable development goal%'"""

def create_sql_request_ipc(params):
    """
    Build a SQL statement that draws a random, IPC-stratified sample.

    One random document is kept per IPC symbol (rn = 1); the resulting rows
    are then shuffled and capped at params["nbr"] rows in total.

    Fixes compared with the previous version of this query:
      * `LIMIT` is not allowed inside an `OVER (...)` window specification;
        the cap is now applied to the final SELECT.
      * description / publication fields are projected in the first CTE so
        that the second CTE can actually reference them.

    Parameters:
        params (dict): Needs 'nbr' (maximum number of rows returned) and
            'nbrextract' (number of leading description characters to load).

    Returns:
        str: SQL text yielding ipc_class, epab_doc_id, pubdate, pubnbr, description.
    """
    # int() ensures that only integers are interpolated into the SQL text.
    num_files = int(params["nbr"])
    nbrextract = int(params["nbrextract"])
    # NOTE(review): a document listed under several IPC symbols can be drawn
    # more than once (once per symbol) — deduplicate downstream if that matters.
    query = f"""WITH flattened AS (
      SELECT
        epab_doc_id,
        ipc_struct.symbol AS ipc_class,
        LEFT(description.text, {nbrextract}) AS description,
        publication.date AS pubdate,
        publication.number AS pubnbr
      FROM `{epab.full_table_name}`,
           UNNEST(ipc) AS ipc_struct
    ),
    stratified_sample AS (
      SELECT
        ipc_class,
        epab_doc_id,
        description,
        pubdate,
        pubnbr,
        ROW_NUMBER() OVER (PARTITION BY ipc_class ORDER BY RAND()) AS rn
      FROM flattened
    )
    SELECT
      ipc_class,
      epab_doc_id,
      pubdate, pubnbr, description
    FROM stratified_sample
    WHERE rn = 1
    ORDER BY RAND()
    LIMIT {num_files};"""
    return query

def create_sql_request_date(params):
    """
    Build a SQL script that samples documents evenly across publication dates.

    The script text does the following:
      * declares a script variable num_files from params["nbr"];
      * DateRange: earliest parsed publication date and the current timestamp;
      * DateBuckets: ten consecutive buckets spanning that range;
      * GroupedData: rows with description.language="EN" and a non-null
        publication date, joined to the bucket containing their date, with the
        description cut to params["nbrextract"] characters;
      * SampledData: per bucket, rows whose random row number is
        <= num_files / 10.

    Parameters:
        params (dict): Needs 'nbrextract' and 'nbr'.

    Returns:
        str: SQL text yielding epab_doc_id, description, pubdate, pubnbr.
    """
    nbrextract=params["nbrextract"]
    num_files=params["nbr"]
    # NOTE(review): presumably publication.date is a 'YYYYMMDD' string, as the
    # PARSE_DATE('%Y%m%d', ...) calls imply — confirm against the table schema.
    # NOTE(review): DATE_DIFF is called with TIMESTAMP arguments below; confirm
    # the target SQL engine accepts that (otherwise TIMESTAMP_DIFF is needed).
    statement = f"""
   DECLARE num_files INT64 DEFAULT {num_files};

WITH DateRange AS (
  SELECT
    MIN(TIMESTAMP(PARSE_DATE('%Y%m%d', publication.date))) AS min_timestamp,
    CURRENT_TIMESTAMP() AS max_timestamp
  FROM
    `{epab.full_table_name}`
  WHERE publication.date IS NOT NULL -- Handle nulls
),
DateBuckets AS (
  SELECT
    TIMESTAMP(DATE_ADD(DATE(DateRange.min_timestamp), INTERVAL CAST(offset * (DATE_DIFF(DateRange.max_timestamp, DateRange.min_timestamp, DAY) / 10) AS INT64) DAY)) AS bucket_start,
    TIMESTAMP(DATE_ADD(DATE(DateRange.min_timestamp), INTERVAL CAST((offset + 1) * (DATE_DIFF(DateRange.max_timestamp, DateRange.min_timestamp, DAY) / 10) AS INT64) DAY)) AS bucket_end
  FROM
    DateRange,
    UNNEST(GENERATE_ARRAY(0, 9)) AS offset
),
GroupedData AS (
  SELECT
    epab_doc_id,
    LEFT(description.text, {nbrextract}) as description, publication.number as pubnbr,
    publication.date as pubdate,
    TIMESTAMP(PARSE_DATE('%Y%m%d', publication.date)) as publication_timestamp,
    bucket_start,
    bucket_end
  FROM
    `{epab.full_table_name}`
  JOIN
    DateBuckets
  ON
    TIMESTAMP(PARSE_DATE('%Y%m%d', publication.date)) >= bucket_start AND TIMESTAMP(PARSE_DATE('%Y%m%d', publication.date)) < bucket_end
  WHERE description.language="EN" AND publication.date IS NOT NULL
),
SampledData AS (
  SELECT
    epab_doc_id,
    description,
    pubdate, pubnbr,
    bucket_start
  FROM
    GroupedData
  QUALIFY ROW_NUMBER() OVER (PARTITION BY bucket_start ORDER BY RAND()) <= (num_files / 10)
)
SELECT
  epab_doc_id,
  description,
  pubdate, pubnbr
FROM
  SampledData;"""
    return statement



Enter the filename under which the raw patent descriptions requested from the database are saved. Caching them in a file avoids sending the same requests to the database repeatedly.

In [8]:
# main function to create and send the request to the database
def load_raw(params):
    """
    Build the SQL request for the chosen extraction method, run it and cache
    the resulting documents in a gzipped jsonl file.

    Parameters:
        params (dict): Needs 'extract_method' ("Random", "IPC", "Date" or
            "SDG"), 'filename' (path of the cache file) and whatever the
            selected request builder reads ('nbr', 'nbrextract').

    Returns:
        list: One dict per document with keys 'id', 'original_text',
        'pubdate' and 'pubnbr'.

    Raises:
        ValueError: If 'extract_method' has no request builder. Previously
            such a value left `statement` unassigned and the function failed
            later with an UnboundLocalError.
    """
    data_selec = params["extract_method"]
    # if/elif keeps name lookup lazy: only the selected builder must exist.
    if data_selec == "Random":
        # NOTE(review): create_sql_request_random is not defined in the cells
        # next to the other builders — confirm it exists before using "Random".
        statement = create_sql_request_random(params)
        print(statement)
    elif data_selec == "IPC":
        statement = create_sql_request_ipc(params)
    elif data_selec == "Date":
        statement = create_sql_request_date(params)
    elif data_selec == "SDG":
        statement = create_sql_request_sdg(params)
    else:
        raise ValueError(
            f"Unsupported extraction method {data_selec!r}; "
            "expected one of 'Random', 'IPC', 'Date', 'SDG'"
        )

    results = epab.sql_query(statement)
    log("finished loading")
    #TODO add CPC and other interesting metrics parameters
    docs = [
        {
            'id': item['epab_doc_id'],
            'original_text': item['description'],
            'pubdate': item['pubdate'],
            'pubnbr': item['pubnbr'],
        }
        for item in results
    ]
    save_list(docs, params["filename"])
    return docs

Two files are created:
- a `.conf` file holding the request parameters as JSON: "filename" (the general name without extension), "nbr" (the number of patents to query from the database), "nbrextract" (the number of characters to load from each description) and "extract_method";
- a file with the raw data (id, original_text, pubdate, pubnbr), saved as compressed JSON Lines (`.dat.gz`).

In [9]:
# Define a style with a wider description width
style = {'description_width': '150px'}  # Increase as needed

# Create form widgets with the updated style
file_name_widget = widgets.Text(
    value='',
    description='File Name:',
    placeholder='Enter file name',
    style=style
)

number_widget = widgets.IntText(
    value=10,
    description='Number of patents:',
    style=style
)

numberextract_widget = widgets.IntText(
    value=5000,
    description='Number of characters:',
    style=style
)

extraction_widget = widgets.Dropdown(
    options=["Random", "IPC", "Length", "Asian", "Date", "SDG"],
    description='Extraction method:',
    style=style
)

data_widget=widgets.Output()

start_button = widgets.Button(
    description='Start',
    button_style='success'
)

# Define the function to be executed when the button is clicked
def on_start_clicked(b):
    """
    Button callback: collect the form values, save them as a .conf file,
    query the database and show the loaded patents in the output widget.

    Nothing is saved or requested when the file name is empty; otherwise the
    (possibly expensive) database request would run first and only fail
    afterwards, when the result is written to disk.
    """
    params = {
        "filename": file_name_widget.value,
        "nbr": number_widget.value,
        "nbrextract": numberextract_widget.value,
        "extract_method": extraction_widget.value
    }
    if not params["filename"].strip():
        log("please enter a file name before starting")
        return
    save_data(params)

    docs = load_raw(params)
    docs_frame = pd.DataFrame(docs)
    # Rich display inside the output widget replaces the former extra print().
    with data_widget:
        display(docs_frame)
    log("finished loading patents")

start_button.on_click(on_start_clicked)

# Display the form

form_items = widgets.VBox([
    file_name_widget,
    number_widget,
    numberextract_widget,
    extraction_widget,
    start_button,
    data_widget
])

display(form_items)

VBox(children=(Text(value='', description='File Name:', placeholder='Enter file name', style=TextStyle(descrip…

In [61]:
# Heading keywords: a heading containing one of these starts the extraction.
keyword1 = [
    "scope of the invention",
    "Description of the Related Art",
    "TECHNICAL SCOPE",
    "Description of Related Art",
    "REVEALING THE INVENTION",
    "background of the invention",
    "background of the disclosure",
    "field of the invention",
    "field of invention",
    "technical field",
    "summary",
    "industrial applicability",
    "Field of art",
    "background",
    "introduction",
    "background art",
]
# Paragraph keywords: used when no heading matches.
# Fix: a missing comma used to merge "It is well known" and
# "technology described herein" into one never-matching string.
keyword2 = [
    "background",
    "The present invention regards",
    "herein described subject matter",
    "It is well known",
    "technology described herein",
    "field of the disclosure",
    "field of the invention",
    "subject of the invention",
    "belongs to the field",
    "invention is",
    "invention relates to",
    "present invention refers to",
    "aspect of the invention",
    "technical field",
]
keyword3 = [
    "field of the disclosure",
    "background",
    "state of the art",
    "prior art",
]

New code below for extraction of text based on sentences

In [62]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.8.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.13-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp312-cp312-manylinux_2_17_x86

In [63]:
import re
import xml.etree.ElementTree as ET
import spacy
from typing import List, Optional

nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7ac581ac3dd0>

In [107]:
import re
from bs4 import BeautifulSoup, Tag
from typing import List
import spacy

# Load the English spaCy pipeline "en_core_web_sm" into the module-level `nlp`.
# If the model is not installed, spacy.load raises OSError; in that case the
# model is downloaded once and loading is retried.
# (Manual installation: python -m spacy download en_core_web_sm)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spacy model 'en_core_web_sm'...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def remove_keywords(text: str, keywords: List[str]) -> str:
    """
    Delete every whole-word, case-insensitive occurrence of the keywords.

    Returns "" when `text` is not a string, and `text` unchanged when there is
    no usable (non-empty string) keyword. Surrounding whitespace is left as is.
    """
    if not isinstance(text, str):
        return ""
    if not keywords or not any(keywords):
        return text

    # Keep only non-empty strings and neutralise regex metacharacters.
    escaped = []
    for candidate in keywords:
        if candidate and isinstance(candidate, str):
            escaped.append(re.escape(str(candidate)))
    if not escaped:
        return text

    matcher = re.compile(r'\b(?:' + '|'.join(escaped) + r')\b', flags=re.IGNORECASE)
    return matcher.sub('', text)

def extract_text_simple(
    html_text: str,
    keyword_headings: List[str],
    keyword_paragraphs: List[str],
    keyword_fallback: List[str],
    max_words: int = 600
) -> str:
    """
    Extract an introductory excerpt (at most min(500, max_words) words) from
    the HTML/XML text of a patent description.

    Strategy, in order:
      1) Heading based: take the first <heading> / <h1>-<h6> element whose
         text contains one of `keyword_headings` and collect the text of all
         sibling elements that follow it.
      2) Paragraph based (only if step 1 collected nothing): starting at the
         first <p> containing one of `keyword_paragraphs`, collect that
         paragraph and every later <p>.
      3) Sentence fallback (if fewer than 100 words were collected so far):
         append the first sentences of the whole text (sentences of at most
         15 characters are skipped) until about the word target is reached;
         one final sentence may overshoot the target by up to 100 words.

    Afterwards heading keywords are removed (heading mode only), "(...)",
    "[...]" and "<...>" spans are stripped, whitespace is normalised and the
    result is truncated to the word target.

    Parameters:
        html_text: Raw description markup.
        keyword_headings: Keywords matched (case-insensitively) in headings.
        keyword_paragraphs: Keywords matched (case-insensitively) in paragraphs.
        keyword_fallback: Currently unused; kept for interface compatibility.
        max_words: Upper bound for the word target (the target itself never
            exceeds 500 words).

    Returns:
        The cleaned excerpt as a single string.
    """
    word_target = min(500, max_words)   # hard cap used by fallback and final cut
    overshoot_allowance = 100           # extra words tolerated for the last sentence
    min_section_words = 100             # below this the sentence fallback kicks in
    min_sentence_chars = 15             # shorter "sentences" are treated as noise

    soup = BeautifulSoup(html_text, 'html.parser')

    # Lowercased keyword lists for case-insensitive substring matching
    kws_h = [kw.lower() for kw in keyword_headings]
    kws_p = [kw.lower() for kw in keyword_paragraphs]

    def clean_text(elem):
        return elem.get_text(separator=' ', strip=True)

    # 1) Heading-based extraction
    collected = []
    mode = "headings"
    headings = soup.find_all(lambda tag: tag.name == 'heading' or re.fullmatch(r'h[1-6]', tag.name))
    for h in headings:
        if any(kw in h.get_text().lower() for kw in kws_h):
            collected.extend(clean_text(sib) for sib in h.find_next_siblings())
            break  # Only the first matching heading is used

    # 2) Paragraph-based extraction
    if not collected:
        mode = "paragraphs"
        collecting = False
        for p in soup.find_all('p'):
            p_text = clean_text(p)
            if not collecting and any(kw in p_text.lower() for kw in kws_p):
                collecting = True
            if collecting:
                collected.append(p_text)

    # 3) Sentence fallback when too little text was found
    if len(' '.join(collected).split()) < min_section_words:
        mode = "fallback"
        # Fragments have no <body>; use the parsed text of the whole soup
        # instead of the raw markup so that tags are not counted as words.
        root = soup.body if soup.body is not None else soup
        full_text = root.get_text(separator=' ', strip=True)
        # spaCy refuses texts longer than nlp.max_length; only the beginning
        # of the text is needed here anyway.
        doc = nlp(full_text[:nlp.max_length])
        word_total = 0
        for sent in doc.sents:
            sentence = sent.text.strip()
            if len(sentence) <= min_sentence_chars:
                continue
            # Count the tokens of the already-parsed sentence (no second nlp run).
            n_words = sum(1 for token in sent if not token.is_space and not token.is_punct)
            if word_total > 0 and word_total + n_words > word_target:
                # Accept one last sentence if the overshoot stays moderate, then stop.
                if word_total + n_words <= word_target + overshoot_allowance:
                    collected.append(sentence)
                break
            collected.append(sentence)
            word_total += n_words

    combined_text = ' '.join(collected)

    # Heading keywords are stripped only when the text came from a heading section.
    if mode == "headings":
        for kw in kws_h:
            combined_text = re.sub(re.escape(kw), '', combined_text, flags=re.IGNORECASE)

    # --- Block-level cleaning ---
    combined_text = re.sub(r'\([^)]*\)', '', combined_text)   # remove (...)
    combined_text = re.sub(r'\[[^\]]*\]', '', combined_text)  # remove [...]
    combined_text = re.sub(r'<[^>]+>', '', combined_text)     # remove <...> tags (extra safeguard)
    combined_text = re.sub(r'\s+', ' ', combined_text).strip()  # Normalize whitespace

    words = combined_text.split()
    if len(words) > word_target:
        return " ".join(words[:word_target])
    return combined_text


def process_texts(texts, keyword1, keyword2, min_sentence_length=5):
    """
    Add a 'cleaned_text' entry to every record.

    For each record, extract_text_simple is applied to record["original_text"]
    with `keyword1` as heading keywords and `keyword2` as both paragraph and
    fallback keywords. The input records are not modified; new dictionaries
    are returned. `min_sentence_length` is accepted but not used here.
    """
    return [
        record | {"cleaned_text": extract_text_simple(record["original_text"], keyword1, keyword2, keyword2)}
        for record in texts
    ]

In [110]:
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets


def analyze_text_data(data,plot_widget):
    """
    Compute word-count statistics of the extracted texts and plot a histogram.

    Parameters:
        data (list): Dictionaries that each hold the extracted text under the
                     key 'cleaned_text'.
        plot_widget (ipywidgets.Output): Output widget in which the histogram
                     is rendered. It is not touched when `data` is empty.

    Returns:
        stats (dict): 'Nbr of entries' plus 'mean', 'square_mean' (mean of the
        squared counts), 'min' and 'max' of the per-text word counts as plain
        Python numbers. For empty input the four statistics are None and no
        plot is drawn (previously this case raised a ValueError).
    """
    word_counts = np.array([len(entry['cleaned_text'].split()) for entry in data], dtype=int)

    if word_counts.size == 0:
        return {
            'Nbr of entries': 0,
            'mean': None,
            'square_mean': None,
            'min': None,
            'max': None
        }

    # Plain Python numbers keep the result JSON-serialisable.
    stats = {
        'Nbr of entries': int(word_counts.size),
        'mean': float(word_counts.mean()),
        'square_mean': float(np.mean(word_counts**2)),
        'min': int(word_counts.min()),
        'max': int(word_counts.max())
    }

    with plot_widget:
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.hist(word_counts, bins=20, edgecolor='black')
        ax.set_title("Distribution of Word Counts")
        ax.set_xlabel("Number of Words")
        ax.set_ylabel("Frequency")
        plt.show()

    return stats



## DATA extraction

This section provides the form used to configure and run the extraction process.
The extraction parameters ("filename", "raw_source_file", "comment") are saved in a ".ext" file,
and the result of the extraction is saved in a ".dat.gz" file.

In [111]:
import os
import json
import ipywidgets as widgets
from IPython.display import display

# Create the left side: Update button and file list
update_button = widgets.Button(description="Update")
file_list = widgets.Select(options=[], rows=10, description="Files:")

# Create the right side: Inputs for filename, raw source, comment, and cut-off length.
filename_input = widgets.Text(description="Filename:")
raw_source_input = widgets.Text(description="Raw source file:")
comment_input = widgets.Textarea(
    description="Comment:",
    layout=widgets.Layout(width='100%', height='80px')
)
select_button = widgets.Button(description="Extract")

def update_file_list(button):
    """
    Refreshes the file list with the files of the current directory whose
    name ends in '.dat.gz'.

    `button` is the widget passed by on_click; it is not used (the function
    is also called directly with None).
    """
    files = [f for f in os.listdir('.') if f.endswith('.dat.gz')]
    file_list.options = files


def on_select(button):
    """
    Button callback running the extraction on the selected raw file.

    Steps: gather the form parameters, save them as JSON in "<filename>.ext",
    load the raw records, run process_texts, compute the word-count metrics
    and write the extracted records to "<filename>.sen.gz" and
    "<filename>.dat.gz".

    Fixes compared with the previous version:
      * the second save referenced the undefined name `result`; both files
        now receive the extracted records;
      * files written by save_list carry no metadata line, so the "metadata"
        returned by load_list is put back as first record when it looks like
        a data record;
      * the output name is taken from the "Filename" field (falling back to
        the "Raw source file" field, which was used before);
      * the parameters are actually written to disk.
    """
    output_name = filename_input.value or raw_source_input.value
    params = {
        "filename": output_name,
        "raw_source_file": file_list.value,
        "comment": comment_input.value,
        "selected_file": file_list.value
    }
    if not params["raw_source_file"]:
        log2("no raw source file selected")
        return
    if not output_name:
        log2("no output filename given")
        return

    with open(output_name + ".ext", "w") as ext_file:
        json.dump(params, ext_file, indent=4)

    print("start extraction")
    metadata, texts = load_list(params["raw_source_file"])
    if isinstance(metadata, dict) and "original_text" in metadata:
        # First line was a data record, not metadata: do not lose it.
        texts.insert(0, metadata)
        metadata = {}
    print(metadata)
    print(len(texts))
    extracted = process_texts(texts, keyword1, keyword2)
    print("processing finished")
    metrics = analyze_text_data(extracted, plot_widget)
    save_list(extracted, params["filename"] + ".sen.gz")
    save_list(extracted, params["filename"] + ".dat.gz")
    metrics["filename"] = params["filename"] + ".met"
    #save_data(metrics)
    log2(str(metrics))
    log2("finished")

# Set up event handlers
update_button.on_click(update_file_list)
select_button.on_click(on_select)

# Organize the layout with two columns.
left_box = widgets.VBox([update_button, file_list])
right_box = widgets.VBox([
    filename_input,
    raw_source_input,
    comment_input,
    select_button
])
ui1 = widgets.HBox([left_box, right_box])
log_output2 = widgets.Textarea(
    value='',
    placeholder='Results will appear here.',
    description='Logs:',
    disabled=False,
    layout=widgets.Layout(width='80%', height='100px')  # Adjust size as needed
)
plot_widget = widgets.Output()
plot_widget0 = widgets.Output()
log_output2.value=""
ui=widgets.VBox([ui1,log_output2,plot_widget0,plot_widget])

# Display the UI in the notebook.
display(ui)

def log2(text):
    """Append `text`, followed by a newline, to the results Textarea widget."""
    # Plain string concatenation: a non-string argument raises TypeError.
    appended = log_output2.value + (text + '\n')
    log_output2.value = appended

    
# Initial update of the file list
update_file_list(None)



VBox(children=(HBox(children=(VBox(children=(Button(description='Update', style=ButtonStyle()), Select(descrip…

In [104]:
def filter_fails(data):
    """
    Return the records whose extracted text is suspiciously short (< 50
    characters) or long (> 800 characters).

    The text is read from 'extracted_text' when that key exists and from
    'cleaned_text' (the key written by process_texts) otherwise. Previously
    only 'extracted_text' was consulted, so records carrying 'cleaned_text'
    were all reported as failures.

    Note: a later cell of this notebook defines another function with the
    same name, which replaces this one once that cell has been run.
    """
    def _text_length(item):
        return len(item.get('extracted_text', item.get('cleaned_text', '')))

    return [item for item in data if not 50 <= _text_length(item) <= 800]

In [114]:
from bs4 import BeautifulSoup
def html_to_text(html):
    """Return the text content of an HTML string, with text pieces joined by a single space."""
    parsed = BeautifulSoup(html, "html.parser")
    # separator=" " puts one space between adjacent pieces of text
    return parsed.get_text(separator=" ")

In [119]:
import os
import ipywidgets as widgets
from IPython.display import display

def filter_fails(data):
    """Return, in input order, the records whose 'difficult' entry is truthy."""
    flagged = []
    for record in data:
        if record.get('difficult'):
            flagged.append(record)
    return flagged

# Get list of files ending with ".dat.gz" from current directory
files = [f for f in os.listdir('.') if f.endswith('dat.gz')]

# Widget for listing files
file_selector = widgets.Select(
    options=files,
    description='Files:',
    rows=10,
    layout=widgets.Layout(width='200px')
)
patentid_input = widgets.Text(description="Patent id:")
patentnbr_input = widgets.Text(description="Patent number:")
patentdate_input = widgets.Text(description="Patent date:")

status_dropdown = widgets.Dropdown(
    options=['OK', 'WRONG', 'NONE'],
    value='NONE',
    description='Status:'
)

# Text area to display the contents of the corresponding ".met.conf" file
met_conf_text = widgets.Textarea(
    description='Config:',
    layout=widgets.Layout(width='500px', height='200px')
)

save_button = widgets.Button(description="SAVE")

# Placeholder callback for a file selection: it only reads the selected file
# name into a local variable. It does not load or display any config file and
# is not registered as an observer in this cell.
# TODO: load the matching config into met_conf_text, or remove this stub.
def on_file_select(change):
    selected_file = file_selector.value


# Button to load the list (using load_list function)
load_button = widgets.Button(description='Load')
list_data = []  # Will store the list of dictionaries loaded by load_list
filtered_data = []
current_index = 0  # To keep track of the current row

# Checkbox to toggle difficult-only filtering
difficult_only_checkbox = widgets.Checkbox(
    value=False,
    description='Display difficult files only'
)

# Text areas for displaying the extracted_text and original_text fields
extracted_text_area = widgets.Textarea(
    description='Extracted:',
    layout=widgets.Layout(width='500px', height='200px')
)
original_text_area = widgets.Textarea(
    description='Original:',
    layout=widgets.Layout(width='500px', height='200px')
)

def get_display_data():
    """Return the records to browse: only the 'difficult' ones when the checkbox is ticked, else all loaded records."""
    if difficult_only_checkbox.value:
        return filter_fails(list_data)
    return list_data

# Update the text areas based on the current index
def update_text_areas():
    """
    Show the record at `current_index` in the viewer widgets.

    When there is no record at that index, every field is cleared. The patent
    id and the status used to keep the values of the previously shown record
    in that case. Missing or None field values are shown as empty strings, and
    other values are converted with str() because Text widgets only accept
    strings.
    """
    def _as_text(value):
        return '' if value is None else str(value)

    display_data = get_display_data()
    if display_data and 0 <= current_index < len(display_data):
        record = display_data[current_index]
        extracted_text_area.value = _as_text(record.get('cleaned_text', ''))
        original_text_area.value = _as_text(record.get('original_text', ''))
        patentid_input.value = _as_text(record.get('id'))
        patentnbr_input.value = _as_text(record.get('pubnbr'))
        patentdate_input.value = _as_text(record.get('pubdate'))
        status_dropdown.value = record.get('status', 'NONE')
    else:
        extracted_text_area.value = ''
        original_text_area.value = ''
        patentid_input.value = ''
        patentnbr_input.value = ''
        patentdate_input.value = ''
        status_dropdown.value = 'NONE'

def on_load_button_click(b):
    """
    Button callback: load the selected file and show its first record.

    Only records whose 'cleaned_text' has fewer than 300 words are kept for
    browsing. Records without a 'cleaned_text' entry count as zero words
    instead of aborting the load with a KeyError.
    """
    global list_data, current_index
    if file_selector.value:
        loaded = load_list2(file_selector.value)
        list_data = [
            item for item in loaded
            if len((item.get("cleaned_text") or "").split()) < 300
        ]
        current_index = 0
        update_text_areas()
    else:
        extracted_text_area.value = 'No file selected'
        original_text_area.value = ''

def on_status_change(change):
    """Observer: store the newly selected status in the record currently shown."""
    # React to value-change notifications only
    if change['name'] != 'value' or change['type'] != 'change':
        return
    rows = get_display_data()
    if rows and 0 <= current_index < len(rows):
        rows[current_index]['status'] = change['new']

status_dropdown.observe(on_status_change, names='value')

load_button.on_click(on_load_button_click)
difficult_only_checkbox.observe(lambda change: update_text_areas(), names='value')

# Navigation buttons for moving through rows
left_arrow = widgets.Button(description='<')
right_arrow = widgets.Button(description='>')

def on_left_arrow_click(b):
    """Button callback: step back to the previous record, if there is one."""
    global current_index
    if current_index <= 0:
        return
    current_index = current_index - 1
    update_text_areas()

def on_right_arrow_click(b):
    """Button callback: advance to the next record, if there is one."""
    global current_index
    last_index = len(get_display_data()) - 1
    if current_index >= last_index:
        return
    current_index = current_index + 1
    update_text_areas()

left_arrow.on_click(on_left_arrow_click)
right_arrow.on_click(on_right_arrow_click)

# Define the button's click event handler
def on_save_click(b):
    """
    Button callback: write the records currently available for browsing
    (including their edited 'status') to "<selected file>.check".

    The previous version referenced `display_data`, which only exists as a
    local variable inside other functions, and therefore raised a NameError.
    Nothing is written when no file is selected.
    """
    selected = file_selector.value
    if not selected:
        return
    save_list(get_display_data(), selected + ".check")

# Attach the event handler to the button's click event
save_button.on_click(on_save_click)

# Layout the widgets
file_list_box = widgets.VBox([file_selector, load_button])
file_and_config = widgets.HBox([file_list_box, met_conf_text])
text_areas_box = widgets.HBox([extracted_text_area, original_text_area])
nav_buttons = widgets.HBox([left_arrow, right_arrow])

# Combine all into one UI layout
ui = widgets.VBox([file_and_config, difficult_only_checkbox, patentid_input,patentdate_input, patentnbr_input,    status_dropdown, text_areas_box, nav_buttons,save_button])
display(ui)


VBox(children=(HBox(children=(VBox(children=(Select(description='Files:', layout=Layout(width='200px'), option…

In [36]:
test1='<p id="p0001" num="0001">The invention relates to a method and a device for determining the presence and mass flow rate of milk flowing in intermittent slugs in a pipe with air, as for example, milk flowing in a milk line in animal milking apparatus as the milk is being drawn in pulses from the animal, although the invention is not limited to such a method and a device. Further, the invention relates to a method and a device for determining the presence or absence of milk flowing in a pipeline, and the invention also relates to a method and device for detecting the commencement and ending of milking of an animal. The invention further relates to a method and a device for detecting connection and disconnection of teat cups of a milking cluster to the teats of an animal, and the invention also relates to a method and a device for detecting disconnection ("kick-off") of a milking cluster from the teats of an animal during milking thereof.</p><p id="p0002" num="0002">Milking apparatus for milking animals is well known, and the action of milking apparatus whereby milk is drawn under vacuum from the teats of an animal in pulsed flow whereby the milk is drawn in time spaced apart slugs of milk with air intermediate the milk slugs is known. Because two-phase flow is involved, in other words, liquid and gaseous phase flow whereby the liquid phase comprises milk and the gaseous phase comprises air, it is relatively difficult to determine the rate of milk flow flowing through a pipeline where the milk is drawn under vacuum from the teats in such a pulsed flow manner. Furthermore, it is desirable to be able to detect both the presence and absence of milk flowing in a pipeline, and it is also desirable to be able to detect the commencement and completion of milking of an animal. 
Further, it is desirable to detect connection and disconnection of a milking cluster from the teats of an animal, and in particular, it is desirable to be able to detect disconnection of a milking cluster from the teats of an animal during milking, as a result of kick-off.</p><p id="p0003" num="0003">PCT Specification No. <patcit id="pcit0001" dnum="WO8905974A"><text>WO 89/05974 of Hope </text></patcit>discloses a state of the art method for acoustically determining one or more properties of a multi-phase medium flowing turbulently in a pipeline, for example, oil with sand entrained therein with the oil flowing in slugs with gas therebetween.</p><p id="p0004" num="0004">The invention is defined by the method of appended Claim 1 and the device of appended Claim 15.</p><p id="p0005" num="0005">Preferred embodiments of the invention are defined by the dependent claims.</p><p id="p0006" num="0006">Preferably, disconnection of the milking cluster from the teats of the animal is determined in response to the computed energy parameter value exceeding the predefined energy parameter value.</p><p id="p0007" num="0007">Advantageously, the absence of milk flowing in the pipeline is determined in response to the computed energy parameter value being less than the second predefined energy parameter value.</p><p id="p0008" num="0008">in one aspect of the invention a plurality of time spaced energy parameter values of the monitored signal within the predefined frequency bandwidth are computed, and the presence of milk flowing in the pipeline is determined in response to the time spaced computed energy parameter values being indicative of a liquid phase medium flowing with pulsed flow.</p><p id="p0009" num="0009">In another aspect of the invention the time spaced computed energy parameter values are consecutively computed energy parameter values.</p><p id="p0010" num="0010">Preferably, the commencement of milk flowing in the pipeline is determined in response to the computed 
energy parameter value transitioning from one of a value indicative of the teat cups of the milking cluster being attached to the teats of the animal without milk flowing in the pipeline, and a value indicative of air being drawn under vacuum into the pipeline through the teat cups to a value indicative of milk flowing in the pipeline.</p><p id="p0011" num="0011">Advantageously, the ceasing of milk to flow in the pipeline is determined in response to the computed energy parameter value transitioning from a value indicative of milk flowing in the pipeline to a value less than the second predefined energy parameter value.</p><p id="p0012" num="0012">Preferably, disconnection of the milking cluster from the teats of an animal during milking is determined in response to the computed energy parameter value transitioning from a value indicative of milk flowing in the pipeline to a value greater than the first predefined energy parameter value.</p><p id="p0013" num="0013">Advantageously, connection of the milking cluster to the teats of the animal is determined in response to the computed energy parameter value transitioning from a value indicative of air being drawn into the pipeline through the '

In [28]:
import re

def find_sdg_numbers(text):
    """
    Finds all unique Sustainable Development Goal (SDG) numbers in a given text.

    A number is picked up when it directly follows the keyword "goal" or "SDG"
    (case-insensitive, optional whitespace in between), so "sustainable
    development goal 7", "Goal 7" and "SDG7" are all recognised. Only numbers
    that identify one of the 17 SDGs are kept, so e.g. the year in "goal 2030"
    is not reported as an SDG.

    Args:
        text (str): The input document text. None or an empty string yields
            an empty set.

    Returns:
        set: A set of unique SDG numbers (ints between 1 and 17) found in the text.
    """
    # Nothing to scan: avoids a TypeError from the regex engine on None.
    if not text:
        return set()

    # (goal|SDG) - capturing group 1: the keyword that precedes the number.
    # \s*        - zero or more whitespace characters between keyword and number.
    # (\d+)      - capturing group 2: the number itself (one or more digits).
    # re.IGNORECASE makes the keyword match case-insensitive.
    pattern = re.compile(r"(goal|SDG)\s*(\d+)", re.IGNORECASE)

    sdg_numbers = set()  # a set keeps each number only once
    for match in pattern.finditer(text):
        sdg_number = int(match.group(2))
        # There are exactly 17 SDGs; anything else is a year, a reference
        # numeral, etc. that merely happens to follow the keyword.
        if 1 <= sdg_number <= 17:
            sdg_numbers.add(sdg_number)

    return sdg_numbers

In [50]:
# Load the records to scan; each record is read below through its
# "original_text" and "pubnbr" keys.
data = load_list2("toto")

# Scan every record for explicit SDG references and count the records that
# contain at least one.
count = 0
for record in data:
    result = find_sdg_numbers(record["original_text"])
    # f-string formatting also works when "pubnbr" is not a str.
    print(f'{record["pubnbr"]} - {result}')
    if result:
        count += 1

# Share of records with at least one SDG reference. An empty dataset reports
# 0.0 instead of raising ZeroDivisionError.
total = len(data)
print(count / total if total else 0.0)

3586956 - set()
3687640 - set()
3699845 - set()
3756376 - set()
3769176 - set()
3775904 - set()
3810320 - set()
3873794 - set()
3888043 - set()
3910755 - set()
3947291 - set()
3957222 - {3}
3960055 - {3}
3960214 - {3}
3996242 - set()
4004858 - set()
4007011 - {11, 3, 12, 7}
4015170 - set()
4034801 - {7}
4035691 - {3}
4044078 - set()
4054052 - set()
4053256 - {12}
4054739 - set()
4063011 - set()
4064499 - set()
4064500 - set()
4066648 - set()
4075568 - set()
4073311 - set()
4074775 - set()
4079716 - set()
3910755 - set()
4084989 - set()
4086162 - set()
4095117 - set()
4096056 - set()
4097670 - set()
4102514 - set()
4106015 - set()
4104942 - set()
4104065 - set()
4123521 - set()
4123522 - set()
4123278 - {12}
4128033 - set()
4131123 - set()
4144891 - {7}
4152996 - {9, 3, 12, 17}
4156092 - {12}
4160729 - {11, 3, 12, 7}
4163079 - {9, 11, 12, 7}
4163437 - set()
4163226 - set()
4168565 - set()
4174407 - {3, 12, 14}
4177147 - set()
4177152 - set()
4177150 - set()
4177155 - set()
4177154 - set

In [51]:
# Number of records currently held in `data` (the denominator of the ratio printed by the scan cell).
len(data)

339

In [52]:
import re

# Long-form statement of each of the 17 Sustainable Development Goals, keyed by
# goal number (1-17).
# NOTE(review): not referenced by the functions visible in this part of the
# notebook, which iterate SDG_DESCRIPTIONS instead — confirm whether it is still needed.
OFFICIAL_SDG_DESCRIPTIONS = {
    1: "End poverty in all its forms everywhere.",
    2: "End hunger, achieve food security and improved nutrition and promote sustainable agriculture.",
    3: "Ensure healthy lives and promote well-being for all at all ages.",
    4: "Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all.",
    5: "Achieve gender equality and empower all women and girls.",
    6: "Ensure availability and sustainable management of water and sanitation for all.",
    7: "Ensure access to affordable, reliable, sustainable and modern energy for all.",
    8: "Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all.",
    9: "Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation.",
    10: "Reduce inequality within and among countries.",
    11: "Make cities and human settlements inclusive, safe, resilient and sustainable.",
    12: "Ensure sustainable consumption and production patterns.",
    13: "Take urgent action to combat climate change and its impacts.",
    14: "Conserve and sustainably use the oceans, seas and marine resources for sustainable development.",
    15: "Protect, restore and promote sustainable use of terrestrial ecosystems, sustainably manage forests, combat desertification, and halt and reverse land degradation and halt biodiversity loss.",
    16: "Promote peaceful and inclusive societies for sustainable development, provide access to justice for all and build effective, accountable and inclusive institutions at all levels.",
    17: "Strengthen the means of implementation and revitalize the Global Partnership for Sustainable Development."
    }
# Short title of each of the 17 Sustainable Development Goals, keyed by goal
# number (1-17) for easy lookup.
# These titles are the phrases find_sdg_numbers_from_description searches for
# in a document's text.
SDG_DESCRIPTIONS = {
    1: "No Poverty",
    2: "Zero Hunger",
    3: "Good Health and Well-being",
    4: "Quality Education",
    5: "Gender Equality",
    6: "Clean Water and Sanitation",
    7: "Affordable and Clean Energy",
    8: "Decent Work and Economic Growth",
    9: "Industry, Innovation and Infrastructure",
    10: "Reduced Inequalities",
    11: "Sustainable Cities and Communities",
    12: "Responsible Consumption and Production",
    13: "Climate Action",
    14: "Life Below Water",
    15: "Life on Land",
    16: "Peace, Justice and Strong Institutions",
    17: "Partnerships for the Goals"
}

def find_sdg_numbers_direct(text):
    """
    Finds all unique Sustainable Development Goal (SDG) numbers that are
    explicitly mentioned in a given text.

    A number is picked up when it directly follows the keyword "goal" or "SDG"
    (case-insensitive, optional whitespace in between), which covers both the
    "sustainable development goal XX" and the "SDG XX" spelling. Only numbers
    that identify one of the 17 SDGs are kept, so e.g. the year in "goal 2030"
    is not reported as an SDG.

    Args:
        text (str): The input document text. None or an empty string yields
            an empty set.

    Returns:
        set: A set of unique SDG numbers (ints between 1 and 17) directly
            mentioned in the text.
    """
    # Nothing to scan: avoids a TypeError from the regex engine on None.
    if not text:
        return set()

    # Group 1 is the keyword, group 2 the number that follows it.
    pattern = re.compile(r"(goal|SDG)\s*(\d+)", re.IGNORECASE)

    sdg_numbers = set()
    for match in pattern.finditer(text):
        sdg_number = int(match.group(2))
        # Discard numerals that cannot be an SDG (there are exactly 17 goals).
        if 1 <= sdg_number <= 17:
            sdg_numbers.add(sdg_number)
    return sdg_numbers

def find_sdg_numbers_from_description(full_description_text, descriptions=None):
    """
    Finds SDG numbers by searching for the text of SDG descriptions within
    a given full description text.

    Each description is matched case-insensitively as a whole phrase: it must
    not be glued to surrounding word characters (so "Life on Land" is not found
    inside "wildlife on land"), and any run of whitespace, including line
    breaks, is accepted between its words.

    Args:
        full_description_text (str): The full description text of a document.
            None or an empty string yields an empty list.
        descriptions (dict, optional): Mapping of SDG number to the phrase to
            search for. Defaults to the module-level SDG_DESCRIPTIONS.

    Returns:
        list: A list of SDG numbers found based on their description text,
              grouped in the iteration order of the mapping.
              (Can contain duplicates if a description is found multiple times).
    """
    # Nothing to scan: avoids a TypeError from the regex engine on None.
    if not full_description_text:
        return []
    if descriptions is None:
        descriptions = SDG_DESCRIPTIONS

    found_sdgs_by_description = []
    for sdg_num, sdg_desc in descriptions.items():
        # Escape every word separately (descriptions may contain regex
        # metacharacters) and allow any whitespace run between the words.
        words = [re.escape(word) for word in sdg_desc.split()]
        if not words:
            # A blank description would match everywhere; ignore it.
            continue
        # Lookarounds instead of \b: they also behave correctly when a
        # description starts or ends with punctuation (e.g. a trailing ".").
        desc_pattern = re.compile(
            r"(?<!\w)" + r"\s+".join(words) + r"(?!\w)",
            re.IGNORECASE,
        )

        # One entry per occurrence, preserving the documented duplicates.
        occurrences = sum(1 for _ in desc_pattern.finditer(full_description_text))
        found_sdgs_by_description.extend([sdg_num] * occurrences)

    return found_sdgs_by_description

In [57]:
def process_documents_for_sdgs(documents_data):
    """
    Extracts the SDG numbers referenced by each document.

    For every document the text is first scanned for explicit SDG references
    (find_sdg_numbers_direct). Only when none are found is the same text
    scanned for SDG description phrases (find_sdg_numbers_from_description).

    Args:
        documents_data (iterable of dict): Documents to process. Each document
            is read through its 'pubnbr' key (defaults to 'N/A' when absent)
            and its 'original_text' key (a missing or None value is treated
            as empty text).

    Returns:
        list: One dict per document, in input order, with the keys
            'id' (the document's 'pubnbr'),
            'found_sdg_numbers_direct' (sorted list of explicit SDG numbers),
            'found_sdg_numbers_from_description' (sorted list of SDG numbers
            found via description phrases; may contain duplicates; empty when
            explicit references were found) and
            'all_found_sdg_numbers_unique' (sorted list of the union of both).
    """
    processed_results = []
    for doc in documents_data:
        doc_id = doc.get('pubnbr', 'N/A')
        # `or ''` also covers a key that is present but holds None, which
        # would otherwise make the regex search raise a TypeError.
        text = doc.get('original_text') or ''

        # 1. Explicit references ("SDG 7", "goal 12", ...).
        direct_sdgs = find_sdg_numbers_direct(text)

        # 2. Fall back to description phrases only when step 1 found nothing.
        description_sdgs = []
        if not direct_sdgs:
            description_sdgs = find_sdg_numbers_from_description(text)

        # Union of both sources, without duplicates.
        all_unique_sdgs = set(direct_sdgs) | set(description_sdgs)

        processed_results.append({
            'id': doc_id,
            'found_sdg_numbers_direct': sorted(direct_sdgs),
            'found_sdg_numbers_from_description': sorted(description_sdgs),
            'all_found_sdg_numbers_unique': sorted(all_unique_sdgs)
        })

    return processed_results

In [58]:
# Run the SDG extraction over all loaded records.
processed_docs = process_documents_for_sdgs(data)

# Emit one five-line report per document; a single print call joined with
# newlines produces the same text as five consecutive prints.
for doc in processed_docs:
    print(
        f"--- Document ID: {doc['id']} ---",
        f"Directly found SDGs: {doc['found_sdg_numbers_direct']}",
        f"SDGs found by description (if no direct): {doc['found_sdg_numbers_from_description']}",
        f"All unique SDGs for this document: {doc['all_found_sdg_numbers_unique']}",
        "-" * 40,
        sep="\n",
    )

--- Document ID: 3586956 ---
Directly found SDGs: []
SDGs found by description (if no direct): []
All unique SDGs for this document: []
----------------------------------------
--- Document ID: 3687640 ---
Directly found SDGs: []
SDGs found by description (if no direct): []
All unique SDGs for this document: []
----------------------------------------
--- Document ID: 3699845 ---
Directly found SDGs: []
SDGs found by description (if no direct): []
All unique SDGs for this document: []
----------------------------------------
--- Document ID: 3756376 ---
Directly found SDGs: []
SDGs found by description (if no direct): []
All unique SDGs for this document: []
----------------------------------------
--- Document ID: 3769176 ---
Directly found SDGs: []
SDGs found by description (if no direct): []
All unique SDGs for this document: []
----------------------------------------
--- Document ID: 3775904 ---
Directly found SDGs: []
SDGs found by description (if no direct): []
All unique SDGs f

In [49]:
# NOTE(review): this cell carries an older execution count (In [49]) than the cells above it, and its
# saved output does not match the 339 records reported earlier — it appears to be stale; re-run the
# notebook from a fresh kernel to confirm.
len(data)

4090033