## Python Script Split and Parse

In [1]:
import fitz  # PyMuPDF
import json
import re


In [2]:

def clean_strange_characters(text):
    # Remove non-ASCII characters
    return re.sub(r'[^\x00-\x7F]+', '', text)

def extract_text_from_pdf(pdf_path):
    text_by_page = []
    text = ''
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
            text_by_page += [page.get_text()]
    return text_by_page, text

def parse_text_to_hierarchy(text):
    data = {"articles": [], "annex": {"lists": [], "tables": []}}
    current_article = None
    in_annex = False

    lines = text.split('\n')
    for line in lines:
        if re.match(r'^Article \d+', line):
            if current_article:
                data["articles"].append(current_article)
            current_article = {"title": line, "rules": []}
        elif line.strip().lower() == 'annex':
            in_annex = True
            if current_article:
                data["articles"].append(current_article)
            current_article = None
        elif in_annex:
            if 'table' in line.lower():
                data["annex"]["tables"].append(line)
            elif 'list' in line.lower():
                data["annex"]["lists"].append(line)
        else:
            if current_article and line.strip():
                current_article["rules"].append(line)
    
    if current_article:
        data["articles"].append(current_article)
    
    return data

def save_to_json(data, json_path):
    with open(json_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

pdf_path = 'RoHS.pdf'
# json_path = 'output.json'

text_by_page, text = extract_text_from_pdf(pdf_path)
# data = parse_text_to_hierarchy(text)
# save_to_json(data, json_path)

# print(f"PDF content successfully saved to {json_path}")

### Extract Amends list

In [3]:
def extract_amends_from_text(page_text):
    amend_pattern = re.compile(r"►M\d+.*?(?=►M\d+|$)", re.DOTALL)

    matches = amend_pattern.findall(page_text)

    
    return matches

amends = []
for i in range(4):
    amends += extract_amends_from_text(text_by_page[i])
amends


['►M1 \nCommission Delegated Directive 2012/50/EU of 10 October 2012 \nL 348 \n16 \n18.12.2012 \n',
 '►M2 \nCommission Delegated Directive 2012/51/EU of 10 October 2012 \nL 348 \n18 \n18.12.2012 \n',
 '►M3 \nCommission Delegated Directive 2014/1/EU of 18 October 2013 \nL 4 \n45 \n9.1.2014 \n',
 '►M4 \nCommission Delegated Directive 2014/2/EU of 18 October 2013 \nL 4 \n47 \n9.1.2014 \n',
 '►M5 \nCommission Delegated Directive 2014/3/EU of 18 October 2013 \nL 4 \n49 \n9.1.2014 \n',
 '►M6 \nCommission Delegated Directive 2014/4/EU of 18 October 2013 \nL 4 \n51 \n9.1.2014 \n',
 '►M7 \nCommission Delegated Directive 2014/5/EU of 18 October 2013 \nL 4 \n53 \n9.1.2014 \n',
 '►M8 \nCommission Delegated Directive 2014/6/EU of 18 October 2013 \nL 4 \n55 \n9.1.2014 \n',
 '►M9 \nCommission Delegated Directive 2014/7/EU of 18 October 2013 \nL 4 \n57 \n9.1.2014 \n',
 '►M10 \nCommission Delegated Directive 2014/8/EU of 18 October 2013 \nL 4 \n59 \n9.1.2014 \n',
 '►M11 \nCommission Delegated Directive

In [4]:
def parse_amend_list(amends):
    amend_dict = dict()
    for amend in amends:
        clean_amend = clean_strange_characters(amend)
        split_amend = clean_amend.strip().split('\n')
        amend_dict[split_amend[0]] = {'Institute': split_amend[1], 
                                      'No': split_amend[2],
                                      'Page': split_amend[3]}
    return amend_dict

amend_json_path = './amend_list.json'

amend_dict = parse_amend_list(amends)
save_to_json(amend_dict, amend_json_path)



### Extract Articles
1. double check whether the content matches with document.(especially last one)

In [5]:
text_by_page[9]

' \n02011L0065 — EN — 01.02.2024 — 020.001 — 10 \n— their elimination or substitution via design changes or materials \nand components which do not require any of the materials or \nsubstances listed in Annex II is scientifically or technically \nimpracticable, \n— the reliability of substitutes is not ensured, \n— the total negative environmental, health and consumer safety \nimpacts caused by substitution are likely to outweigh the total \nenvironmental, health and consumer safety benefits thereof. \nDecisions on the inclusion of materials and components of EEE in \nthe lists in Annexes III and IV and on the duration of any \nexemptions shall take into account the availability of substitutes \nand the socioeconomic impact of substitution. Decisions on the \nduration of any exemptions shall take into account any potential \nadverse impacts on innovation. Life-cycle thinking on the overall \nimpacts of the exemption shall apply, where relevant; \n(b) deletion of materials and component

In [6]:
def extract_28_articles(text):
    article_pattern = re.compile(r'\n(Article\s+\d+)\s*\n([^\n]+)\n(.*?)(?=\nArticle\s+\d+\s*\n[^\n]+\n|\nANNEX\s+I\s|$)', re.DOTALL)
    matches = article_pattern.findall(text)
    articles = matches[:28]
    return articles

def parse_plain_text(content):
    return content.replace('\n', '').strip()

    
    # return ordered_list

def parse_ordered_list(content):
    # Pattern to match main list items (e.g., "1.", "2.", etc.)
    list_pattern = re.compile(r'(\d+)\.\s*\n(.*?)(?=\n\d+\.\s|\Z)', re.DOTALL)
    matches = list_pattern.findall(content)
    ordered_list = {}

    for num, item in matches:
        # Pattern to match sublist items (e.g., "(a)", "(b)", etc.)
        sublist_pattern = re.compile(r'\(([a-z]{1,2})\)\s*(.*?)(?=\n\([a-z]{1,2}\)\s|\Z)', re.DOTALL)
        submatches = sublist_pattern.findall(item)
        
        if submatches:
            # Extract content before the first sublist item as the main content of the item
            main_content = re.split(r'\(([a-z]{1,2})\)', item, 1)[0].strip()
            sublist = {sub[0]: sub[1].strip().replace('\n', ' ') for sub in submatches}
            ordered_list[num] = {
                'Content': main_content,
                'Sublist': sublist
            }
        else:
            ordered_list[num] = {'Content': item.strip().replace('\n', ' ')}
    
    return ordered_list

def parse_parenthesized_list(content):
    # list_pattern = re.compile(r'\((\d+)\)\s*(.*?)(?=\n\(\d+\)\s|\n\d+\.\s*\n|$)', re.DOTALL)
    list_pattern = re.compile(r'\n\((\d+)\)\s*(.*?)(?=\n\(\d+\)\s|$)', re.DOTALL)
    matches = list_pattern.findall(content)
    return {num: item.replace('\n', '').strip() for num, item in matches}

def parse_alphabetical_sublist(content):
    # Extract initial content before the first sublist item
    initial_content = re.split(r'\n\([a-z]{1,2}\)\s', content, 1)[0].strip()
    sublist_pattern = re.compile(r'\n\(([a-z]{1,2})\)\s*(.*?)(?=\n\([a-z]{1,2}\)\s|\Z)', re.DOTALL)
    submatches = sublist_pattern.findall(content)
    sublist = {sub[0]: sub[1].strip().replace('\n', ' ') for sub in submatches}
    return {'Content': initial_content, 'Sublist': sublist}

def parse_hierarchical_content(content):
    # if re.search(r'\(\d+\)\s', content):
    if re.search(r'\n\(\d+\)\s', content):
        return parse_parenthesized_list(content)
    elif re.search(r'\d+\.\s*\n', content):
        return parse_ordered_list(content)
    elif re.search(r'\n\([a-z]{1,2}\)\s', content):
        return parse_alphabetical_sublist(content)
    else:
        return parse_plain_text(content)


def parse_articles(articles):
    article_dict = dict()
    for article in articles:
        article_dict[article[0]] = {'Title': article[1], 'Content': parse_hierarchical_content(article[2])}
    return article_dict

In [7]:
articles = extract_28_articles(clean_strange_characters(text))

article_dict = parse_articles(articles)
save_to_json(article_dict, './articles.json')


In [8]:
parse_hierarchical_content(articles[4][2])

{'1': {'Content': 'For the purposes of adapting Annexes III and IV to scientific and \ntechnical progress, and in order to achieve the objectives set out in \nArticle 1, the Commission shall adopt by means of individual \ndelegated acts in accordance with Article 20 and subject to the \nconditions laid down in Articles 21 and 22, the following measures:',
  'Sublist': {'a': 'inclusion of materials and components of EEE for specific appli cations in the lists in Annexes III and IV, provided that such  inclusion does not weaken the environmental and health protection  afforded by Regulation (EC) No 1907/2006 and where any of the  following conditions is fulfilled:  B   02011L0065  EN  01.02.2024  020.001  10   their elimination or substitution via design changes or materials  and components which do not require any of the materials or  substances listed in Annex II is scientifically or technically  impracticable,   the reliability of substitutes is not ensured,   the total negative envir

In [9]:
text_by_page[5]

' \n02011L0065 — EN — 01.02.2024 — 020.001 — 6 \n(i) photovoltaic panels intended to be used in a system that is designed, \nassembled and installed by professionals for permanent use at a \ndefined location to produce energy from solar light for public, \ncommercial, industrial and residential applications; \n(j) equipment specifically designed solely for the purposes of research \nand development only made available on a business-to-business \nbasis; \n▼M37 \n(k) pipe organs. \n▼B \nArticle 3 \nDefinitions \nFor the purposes of this Directive, the following definitions shall apply: \n(1) ‘electrical and electronic equipment’ or ‘EEE’ means equipment \nwhich is dependent on electric currents or electromagnetic fields in \norder to work properly and equipment for the generation, transfer \nand measurement of such currents and fields and designed for use \nwith a voltage rating not exceeding 1 000 volts for alternating \ncurrent and 1 500 volts for direct current; \n(2) for the purposes

### Extrac Annex

In [10]:
import pdfplumber
import camelot
import json
import pandas as pd

In [11]:
!pip uninstall camelot

[33mDEPRECATION: Loading egg at /home/yuxuan/anaconda3/lib/python3.11/site-packages/lltm_cpp-0.0.0-py3.11-linux-x86_64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m

In [12]:
text_by_page[20]


' \n02011L0065 — EN — 01.02.2024 — 020.001 — 21 \nANNEX I \nCategories of EEE covered by this Directive \n1. Large household appliances. \n2. Small household appliances. \n3. IT and telecommunications equipment. \n4. Consumer equipment. \n5. Lighting equipment. \n6. Electrical and electronic tools. \n7. Toys, leisure and sports equipment. \n8. Medical devices. \n9. Monitoring and control instruments including industrial monitoring and \ncontrol instruments. \n10. Automatic dispensers. \n11. Other EEE not covered by any of the categories above. \n▼B\n'

In [13]:
def extract_8_annex(text):
    article_pattern = re.compile(r'(ANNEX\s+[IVXLCDM]+)\s*\n(.*?)(?=\nANNEX\s+[IVXLCDM]+|$)', re.DOTALL)
    matches = article_pattern.findall(text)
    annexs = matches[:28]
    return annexs
annexs = extract_8_annex(clean_strange_characters(text))

In [14]:
def extract_rules(text):
    """
    Extracts all rules from the given text that follow the format \n1. \n2. \n3., etc.

    Parameters:
    text (str): The text from which to extract the rules.

    Returns:
    list: A list of extracted rules.
    """
    # Regex pattern to match items in the format \n\d+\.\s[^\n]+
    pattern = r"\n\d+\.\s[^\n]+"
    
    # Find all matches
    matches = re.findall(pattern, text)
    
    # Clean up the matches to remove leading newline characters and extra spaces
    rules = [match.strip() for match in matches]
    
    return rules

In [15]:
annex_table_lst = [2, 6, 7]
annexs[4][1]

'Applications for granting, renewing and revoking exemptions as referred to \nin Article 5 \nApplications for exemptions, renewal of exemptions or, mutatis mutandis, for \nrevoking an exemption may be submitted by a manufacturer, the authorised \nrepresentative of a manufacturer, or any economic operator in the supply chain \nand shall include at least the following: \n(a) the name, address and contact details of the applicant; \n(b) information on the material or component and the specific uses of the \nsubstance in the material and component for which an exemption, or its \nrevocation, is requested and its particular characteristics; \n(c) verifiable and referenced justification for an exemption, or its revocation, in \nline with the conditions established in Article 5; \n(d) an analysis of possible alternative substances, materials or designs on a life- \ncycle basis, including, when available, information about independent \nresearch, peer-review studies and development activities 

In [16]:
def parse_annexs(annexs):
    annex_dict = dict()
    for annex in annexs:
        annex_dict[annex[0]] = annex[1]
    return annex_dict

annex_dict = parse_annexs(annexs)

save_to_json(annex_dict, "./annexs.json")

In [17]:
def extract_text_and_tables_from_pdf(pdf_path):
    pdf_data = {}
    
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            page_text = page.extract_text()
            pdf_data[f'Page_{i + 1}'] = {'text': page_text, 'tables': []}
            
            tables = page.extract_tables()
            for table in tables:
                pdf_data[f'Page_{i + 1}']['tables'].append(table)
                
    return pdf_data

extract_text_lst = extract_text_and_tables_from_pdf(pdf_path)
for page in extract_text_lst:
    if len(extract_text_lst[page]['tables']) != 0:
        print(page)
        print(extract_text_lst[page]['tables'])

Page_23
[[['Mercury in single capped (compact) fluorescent\nlamps not exceeding (per burner):'], ['For general lighting purposes < 30 W: 2,5 mg'], ['For general lighting purposes ≥ 30 W and < 50 W:\n3,5 mg'], ['For general lighting purposes ≥ 50 W and\n< 150 W: 5 mg'], ['For general lighting purposes ≥ 150 W: 15 mg'], ['For general lighting purposes with circular or\nsquare structural shape and tube diameter ≤\n17 mm: 5 mg'], ['For lamps designed to emit mainly light in the\nultraviolet spectrum: 5 mg'], ['For special purposes: 5 mg'], ['For general lighting purposes < 30 W with a\nlifetime equal or above 20 000 h: 3,5 mg'], ['Mercury in double-capped linear fluorescent lamps\nfor general lighting purposes not exceeding (per\nlamp):'], ['Tri-band phosphor with normal lifetime and a tube\ndiameter < 9 mm (e.g. T2): 4 mg'], ['Tri-band phosphor with normal lifetime and a tube\ndiameter ≥ 9 mm and ≤ 17 mm (e.g. T5): 3 mg'], ['Tri-band phosphor with normal lifetime and a tube\ndiameter > 17

In [18]:
def extract_tables_from_pdf(pdf_path):
    tables_data = {}
    tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')
    
    for table in tables:
        page_number = table.page
        table_data = table.df.applymap(clean_strange_characters).values.tolist()  # Clean each cell in the table
        if f'Page_{page_number}' not in tables_data:
            tables_data[f'Page_{page_number}'] = {'tables': []}
        tables_data[f'Page_{page_number}']['tables'].append(table_data)
        
    return tables_data

page_table_dict = extract_tables_from_pdf(pdf_path)

In [19]:
page_table_dict['Page_44']

{'tables': [[['Directive \n2002/95/EC \nof \nthe',
    '(OJ  L  37,  13.2.2003,  p.  19).'],
   ['European \nParliament \nand \nof \nthe', ''],
   ['Council', ''],
   ['Commission  Decision 2005/618/', '(OJ  L  214,  19.8.2005,  p.  65).'],
   ['EC', ''],
   ['Commission  Decision 2005/717/', '(OJ  L  271,  15.10.2005,  p.  48).'],
   ['EC', ''],
   ['Commission  Decision 2005/747/', '(OJ  L  280,  25.10.2005,  p.  18).'],
   ['EC', ''],
   ['Commission  Decision 2006/310/', '(OJ  L  115,  28.4.2006,  p.  38).'],
   ['EC', ''],
   ['Commission  Decision 2006/690/', '(OJ  L  283,  14.10.2006,  p.  47).'],
   ['EC', ''],
   ['Commission  Decision 2006/691/', '(OJ  L  283,  14.10.2006,  p.  48).'],
   ['EC', ''],
   ['Commission  Decision 2006/692/', '(OJ  L  283,  14.10.2006,  p.  50).'],
   ['EC', ''],
   ['Directive \n2008/35/EC \nof \nthe', '(OJ  L  81,  20.3.2008,  p.  67).'],
   ['European  Parliament  and  of  the', ''],
   ['Council', ''],
   ['Commission  Decision 2008/385/', '(O

In [20]:
def organize_tables_into_annexes(tables_data):
    annexes = {
        'ANNEX III': [],
        'ANNEX VII': [],
        'ANNEX VIII': []
    }

    # Define the pages for each annex
    annex_III_pages = range(23, 36)
    annex_VII_pages = [44]
    annex_VIII_pages = [45]

    # Map tables to their respective annexes
    for page_number in annex_III_pages:
        key = f'Page_{page_number}'
        if key in tables_data:
            annexes['ANNEX III'].extend(tables_data[key]['tables'])

    for page_number in annex_VII_pages:
        key = f'Page_{page_number}'
        if key in tables_data:
            annexes['ANNEX VII'].extend(tables_data[key]['tables'])

    for page_number in annex_VIII_pages:
        key = f'Page_{page_number}'
        if key in tables_data:
            annexes['ANNEX VIII'].extend(tables_data[key]['tables'])
    
    return annexes

annex_table_dict = organize_tables_into_annexes(page_table_dict)

In [21]:
annex_table_dict['ANNEX VII']

[[['Directive \n2002/95/EC \nof \nthe', '(OJ  L  37,  13.2.2003,  p.  19).'],
  ['European \nParliament \nand \nof \nthe', ''],
  ['Council', ''],
  ['Commission  Decision 2005/618/', '(OJ  L  214,  19.8.2005,  p.  65).'],
  ['EC', ''],
  ['Commission  Decision 2005/717/', '(OJ  L  271,  15.10.2005,  p.  48).'],
  ['EC', ''],
  ['Commission  Decision 2005/747/', '(OJ  L  280,  25.10.2005,  p.  18).'],
  ['EC', ''],
  ['Commission  Decision 2006/310/', '(OJ  L  115,  28.4.2006,  p.  38).'],
  ['EC', ''],
  ['Commission  Decision 2006/690/', '(OJ  L  283,  14.10.2006,  p.  47).'],
  ['EC', ''],
  ['Commission  Decision 2006/691/', '(OJ  L  283,  14.10.2006,  p.  48).'],
  ['EC', ''],
  ['Commission  Decision 2006/692/', '(OJ  L  283,  14.10.2006,  p.  50).'],
  ['EC', ''],
  ['Directive \n2008/35/EC \nof \nthe', '(OJ  L  81,  20.3.2008,  p.  67).'],
  ['European  Parliament  and  of  the', ''],
  ['Council', ''],
  ['Commission  Decision 2008/385/', '(OJ  L  136,  24.5.2008,  p.  9).'],


In [22]:
def annex_III_to_dataframe(annex_table_dict):
    # Extract the tables from "ANNEX III"
    annex_III_tables = annex_table_dict['ANNEX VIII']
    
    # Assuming each table is a dictionary or a list of dictionaries representing rows
    # If tables are list of lists, adjust the code accordingly
    all_rows = []
    for table in annex_III_tables:
        if isinstance(table, list):
            all_rows.extend(table)
        elif isinstance(table, dict):
            all_rows.append(table)

    # Create DataFrame from the list of rows
    df = pd.DataFrame(all_rows)
    return df
annex_III_to_dataframe(annex_table_dict)

Unnamed: 0,0,1
0,,Correlation table
1,Directive 2002/95/EC,This Directive
2,Article 1,Article 1
3,Article 2(1),"Article 2(1), 2(2), Annex I"
4,Article 2(2),Article 2(3)
5,Article 2(3),"Article 2(4), introductory wording"
6,,Article 2(4)
7,Article 3(a),"Article 3(1),(2)"
8,Article 3(b),
9,,Article 3(6)-(28)


In [23]:
# def split_text_into_rules(annex_text):
#     rules = {}
#     other_text = annex_text
#     # Match rules with one or two digits followed by an optional letter and a dot
#     rule_pattern = re.compile(r'\n(\d{1,2}[a-zA-Z]?)\.\s')
#     # Match sublists with a single letter followed by a dot
#     sublist_pattern = re.compile(r'\n\(([a-zA-Z])\)\s')

#     split_content = rule_pattern.split(annex_text)
#     other_text = clean_strange_characters(split_content[0])  # Initial other text before any rules
#     for i in range(1, len(split_content), 2):
#         rule_number = split_content[i]
#         rule_content = split_content[i + 1]
#         sublist = {}
#         sublist_split = sublist_pattern.split(rule_content)
#         rule_text = clean_strange_characters(sublist_split[0])  # Initial rule text before any sublist
#         sublist_counter = {}
#         for j in range(1, len(sublist_split), 2):
#             sublist_letter = sublist_split[j]
#             sublist_content = clean_strange_characters(sublist_split[j + 1])
#             if sublist_letter in sublist:
#                 if sublist_letter not in sublist_counter:
#                     sublist_counter[sublist_letter] = 1
#                 sublist_counter[sublist_letter] += 1
#                 unique_key = f"{sublist_letter}_{sublist_counter[sublist_letter]}"
#             else:
#                 unique_key = sublist_letter
#                 sublist_counter[sublist_letter] = 1
#             sublist[unique_key] = sublist_content
#         rules[rule_number] = {'content': rule_text, 'sublist': sublist}
    
#     return other_text, rules


def split_text_into_rules(annex_text):
    rules = {}
    other_text = annex_text

    # Match rules with one or two digits followed by an optional letter and a dot
    rule_pattern = re.compile(r'\n(\d{1,2}[a-zA-Z]?)\.\s')
    # Match sublists with a single letter followed by a dot
    sublist_pattern = re.compile(r'\n\(([a-zA-Z])\)\s')

    split_content = rule_pattern.split(annex_text)
    if len(split_content) > 1:  # If there are rules detected
        other_text = clean_strange_characters(split_content[0])  # Initial other text before any rules
        for i in range(1, len(split_content), 2):
            rule_number = split_content[i]
            rule_content = split_content[i + 1]
            sublist = {}
            sublist_split = sublist_pattern.split(rule_content)
            rule_text = clean_strange_characters(sublist_split[0])  # Initial rule text before any sublist
            sublist_counter = {}
            for j in range(1, len(sublist_split), 2):
                sublist_letter = sublist_split[j]
                sublist_content = clean_strange_characters(sublist_split[j + 1])
                if sublist_letter in sublist:
                    if sublist_letter not in sublist_counter:
                        sublist_counter[sublist_letter] = 1
                    sublist_counter[sublist_letter] += 1
                    unique_key = f"{sublist_letter}_{sublist_counter[sublist_letter]}"
                else:
                    unique_key = sublist_letter
                    sublist_counter[sublist_letter] = 1
                sublist[unique_key] = sublist_content
            rules[rule_number] = {'content': rule_text, 'sublist': sublist}
    else:  # No rules detected, treat the entire text as sublists
        split_content = sublist_pattern.split(annex_text)
        other_text = clean_strange_characters(split_content[0])  # Initial other text before any sublist
        sublist = {}
        sublist_counter = {}
        for i in range(1, len(split_content), 2):
            sublist_letter = split_content[i]
            sublist_content = clean_strange_characters(split_content[i + 1])
            if sublist_letter in sublist:
                if sublist_letter not in sublist_counter:
                    sublist_counter[sublist_letter] = 1
                sublist_counter[sublist_letter] += 1
                unique_key = f"{sublist_letter}_{sublist_counter[sublist_letter]}"
            else:
                unique_key = sublist_letter
                sublist_counter[sublist_letter] = 1
            sublist[unique_key] = sublist_content
        rules['no_rule'] = {'content': '', 'sublist': sublist}

    return other_text, rules



def process_annexes(annexes_text, annex_table_dict):
    annexes_data = {}
    
    for annex_name, annex_text in annexes_text.items():
        if annex_name not in annex_table_dict:
            other_text, rules = split_text_into_rules(annex_text)
            annexes_data[annex_name] = {
                'content': other_text,
                'rules': rules
            }

    for annex_name, annex_text in annex_table_dict.items():
        annexes_data[annex_name] = annex_text
    
    return annexes_data
    
annex_text_table_dict = process_annexes(annex_dict, annex_table_dict)

In [24]:
annex_dict['ANNEX IV']

'Applications exempted from the restriction in Article 4(1) specific to medical \ndevices and monitoring and control instruments \nEquipment utilising or detecting ionising radiation \n1. Lead, cadmium and mercury in detectors for ionising radiation. \n2. Lead bearings in X-ray tubes. \n3. Lead in electromagnetic radiation amplification devices: micro-channel plate \nand capillary plate. \n4. Lead in glass frit of X-ray tubes and image intensifiers and lead in glass frit \nbinder for assembly of gas lasers and for vacuum tubes that convert elec\ntromagnetic radiation into electrons. \n5. Lead in shielding for ionising radiation. \n6. Lead in X-ray test objects. \n7. Lead stearate X-ray diffraction crystals. \n8. Radioactive cadmium isotope source for portable X-ray fluorescence \nspectrometers. \nSensors, detectors and electrodes \n1a. Lead and cadmium in ion selective electrodes including glass of pH \nelectrodes. \n1b. Lead anodes in electrochemical oxygen sensors. \n1c. Lead, cadmiu

In [25]:
save_to_json(annex_text_table_dict, './annexs.json')

In [26]:
import re

# Define the pattern to match rules with one or two digits followed by an optional letter and a dot
rule_pattern = re.compile(r'\n(\d{1,2}[a-zA-Z]?)\.\s')

# Define the pattern to match sublists with a single letter in parentheses followed by a space
sublist_single_letter_pattern = re.compile(r'\n\(([a-zA-Z])\)\s')

# Define the pattern to match sublists with Roman numerals (i, ii, iii) in parentheses followed by a space
sublist_roman_numeral_pattern = re.compile(r'\n\((i{1,3})\)\s')

# Sample text for demonstration
sample_text = """
39. Lead in micro-channel plates (MCPs) used in equipment where at least one
of the following properties is present:
(a) a compact size of the detector for electrons or ions, where the space for
the detector is limited to a maximum of 3 mm/MCP (detector thickness
+ space for installation of the MCP), a maximum of 6 mm in total, and
an alternative design yielding more space for the detector is
scientifically and technically impracticable;
(b) a two-dimensional spatial resolution for detecting electrons or ions,
where at least one of the following applies:
(i) a response time shorter than 25 ns;
(ii) a sample detection area larger than 149 mm^2;
(iii) a multiplication factor larger than 1.3 * 10^3.
(c) a response time shorter than 5 ns for detecting electrons or ions;
(d) a sample detection area larger than 314 mm^2 for detecting electrons or
ions;
(e) a multiplication factor larger than 4.0 * 10^7.
The exemption expires on the following dates:
(a) 21 July 2021 for medical devices and monitoring and control
instruments;
(b) 21 July 2023 for in-vitro diagnostic medical devices;
(c) 21 July 2024 for industrial monitoring and control instruments.
40.
"""

# Find all rule matches
rule_matches = rule_pattern.findall(sample_text)
print("Matched rules:", rule_matches)

# Find all single letter sublist matches
sublist_single_letter_matches = sublist_single_letter_pattern.findall(sample_text)
print("Matched single letter sublists:", sublist_single_letter_matches)

# Find all Roman numeral sublist matches
sublist_roman_numeral_matches = sublist_roman_numeral_pattern.findall(sample_text)
print("Matched Roman numeral sublists:", sublist_roman_numeral_matches)

# Combine all sublist matches
all_sublist_matches = sublist_single_letter_matches + sublist_roman_numeral_matches
print("All matched sublists:", all_sublist_matches)


Matched rules: ['39', '40']
Matched single letter sublists: ['a', 'b', 'i', 'c', 'd', 'e', 'a', 'b', 'c']
Matched Roman numeral sublists: ['i', 'ii', 'iii']
All matched sublists: ['a', 'b', 'i', 'c', 'd', 'e', 'a', 'b', 'c', 'i', 'ii', 'iii']


## Langchain and OpenAI API to Extract Text and Table from PDF

In [27]:
from openai import OpenAI
client = OpenAI()

completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
    {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
  ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content="In the realm of code where wonders reside,\nThere's a concept that takes programmers for a ride.\nIt's called recursion, a magical spell,\nThat allows functions to call themselves oh so well.\n\nLike a fractal unfolding, infinite in scope,\nRecursion dives deep, giving code new hope.\nOne step at a time, it travels the path,\nRepeating and looping like an endless math.\n\nWith base cases to halt the endless loop,\nRecursion unwinds like a dancer in a troupe.\nIt breaks down problems into smaller parts,\nAnd solves them with elegance, in all its arts.\n\nA function that's recursive is a thing of beauty,\nIt dances and sings, a symphony of duty.\nInfinite power in a loop so divine,\nRecursion in programming, a masterpiece to define.", role='assistant', function_call=None, tool_calls=None)
