In [1]:
##########################################################################################################
# Extracts the summary and the content of CELEX documents from the EUR-Lex website and stores them in Elasticsearch (localhost) or OpenSearch (Uni Heidelberg)
# 
# Code will:
# 1. Extract the list of CELEX Numbers from the "provided URL"
#    "provided URL": Legal Acts for Energy Domain (https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile)
#
# 2. For each language configured in the code (see `langs` in get_document_information)
#   2a. Extract the Summary Information for the Celex Numbers (Raw HTML and Document Content)
#   2b. Extract the Content of the Celex Document (Raw HTML and Document Content)
#
# Features:
# Code is customised for:
# 1. Any domain of legal acts from EUR-LEX website (https://eur-lex.europa.eu/browse/directories/legislation.html)
# 2. Any Language
##########################################################################################################

In [2]:
# Importing all required packages

import logging
import os
import re
import urllib.request
from time import sleep, time

import requests
import pandas as pd
from bs4 import BeautifulSoup

# For Uni Heidelberg Server
from opensearchpy import OpenSearch

# For Localhost
from elasticsearch import Elasticsearch

In [3]:

#####################################################################################################
# Directory Creation
# For logging the progress of the script and the list of Celex Numbers extracted
#####################################################################################################
working_dir = os.getcwd()
directory = os.path.join(working_dir, 'Scrapped_Data')

# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

# CSV file that receives the list of extracted Celex numbers
filename_celex = os.path.join(directory, 'Celex_Numbers.csv')

# Log file that records the progress of the metadata extraction
extraction_logs = os.path.join(directory, 'Logs_Extracting_MetaData.log')

### Celex Number Extraction

In [4]:
def pages_extraction(provided_url):
    """
    Function extracts the number of pages that needs to be considered for extracting the Celex Numbers

    Args:
        provided_url (string): URL of the Domain specific Legal Acts, for example: Energy, Agriculture, Taxation, and other
                                Legal Acts: https://eur-lex.europa.eu/browse/directories/legislation.html
                                Energy Legal Acts: https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile

    Returns:
        integer: Page number found in the pagination link plus one, i.e. an exclusive
                 upper bound for range(1, ...). get_celex reports this value minus one
                 as the total number of pages. Returns 2 when fewer than two
                 pagination links are found.

    Raises:
        ValueError: If the pagination link does not contain a 'page=<number>' parameter
    """
    # Read inside a context manager so the HTTP response is always closed
    with urllib.request.urlopen(provided_url) as response:
        input_soup = BeautifulSoup(response.read(), 'lxml')

    page_number_indexes = input_soup.find_all('a', class_='btn btn-primary btn-sm')

    if len(page_number_indexes) < 2:
        # Without pagination links there is nothing to read the page count from.
        # NOTE(review): presumably this means the result list fits on one page - confirm
        logging.warning("No pagination links found for %s; assuming a single page", provided_url)
        last_page_number = 2
    else:
        # The second matching link is the one whose 'page' parameter is used as the last page
        last_page_number_url = page_number_indexes[1].attrs['href']
        page_match = re.search(r'page=(\d+)', last_page_number_url, re.IGNORECASE)
        if page_match is None:
            raise ValueError(f"No 'page' parameter found in pagination link: {last_page_number_url}")
        last_page_number = int(page_match.group(1)) + 1

    # Pause between requests to the website
    sleep(1)
    return last_page_number

In [5]:
def get_celex(pages, provided_url):
    """
    Function extracts all the Celex Number from the provided URL

    The user is asked interactively how many result pages should be scraped.
    The prompt is repeated until a non-negative whole number within the
    available range is entered.

    Args:
        pages (integer): Exclusive upper bound of the page numbers (total number of pages + 1),
                         as returned by pages_extraction
        provided_url (string): URL of the Domain specific Legal Acts, for example: Energy, Agriculture, Taxation, and other
                                Legal Acts: https://eur-lex.europa.eu/browse/directories/legislation.html
                                Energy Legal Acts: https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile

    Returns:
        list: List of Celex Number extracted from the provided URL.
              The same list is also written to the CSV file `filename_celex`.
    """
    list_celex = []
    print(f'Total Number of Pages present in the provided URL: {pages - 1}')
    while True:
        page_number = input('Number of Pages that needs to be considered for Document Extraction: ')
        try:
            pages_considered = int(page_number) + 1
        except ValueError:
            # Re-prompt instead of crashing on input such as 'abc' or an empty line
            print("The value entered is not a whole number. Kindly enter a number")
            continue
        if pages_considered < 1:
            print("Number of pages cannot be negative. Kindly enter the value within range")
            continue
        if pages_considered > pages:
            print("Number of pages entered is greater than the toal number of pages present. Kindly enter the value within range")
            continue
        break

    for i in range(1, pages_considered):
        # Pause between requests, then read the result page and close the response
        sleep(1)
        with urllib.request.urlopen(provided_url + '&page=' + str(i)) as response:
            page_html = response.read()

        # Scrapping the Page
        soup = BeautifulSoup(page_html, 'lxml')

        # Fetching celex numbers by parsing the html tag hierarchy and checking for the label 'CELEX number'.
        try:
            for tag in soup.find_all("div", attrs={"class": "col-sm-6"}):
                titles = tag.find_all("dt")
                values = tag.find_all("dd")
                for title, value in zip(titles, values):
                    # Compare without surrounding whitespace so the match does not
                    # depend on the exact trailing blank of the label
                    if title.text.strip() == 'CELEX number:':
                        list_celex.append(value.text)
        except Exception:
            # Keep going with the remaining pages, but leave a trace of the failure
            logging.exception("Extraction of Celex Numbers failed for page %s", i)

    # Saving all the CELEX Numbers into CSV File (for further scraping)
    pd.DataFrame(list_celex, columns=['celex_id']).to_csv(filename_celex, index=False)

    return list_celex

In [6]:
def celex_main(provided_url):
    """
    Orchestrates the Celex Number extraction for one listing URL.

    First determines the page bound of the listing with pages_extraction and
    then collects the Celex Numbers with get_celex. Start and end of the
    extraction are written to the log.

    Args:
        provided_url (string): URL of the Domain specific Legal Acts, for example: Energy, Agriculture, Taxation, and other
                                Legal Acts: https://eur-lex.europa.eu/browse/directories/legislation.html
                                Energy Legal Acts: https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile

    Returns:
        list: List of Celex numbers extracted from the provided URL
    """
    logging.info("Execution of Extraction of Celex Number - Started")

    # The page bound feeds directly into the Celex Number collection
    celex_numbers = get_celex(pages_extraction(provided_url), provided_url)

    logging.info("Execution of Extraction of Celex Number - Ended")
    return celex_numbers

### Celex Number Document Information Extraction

In [7]:
def get_document_content(celex_id, lang):
    """
    Function to extract the Content of the Celex document in the provided language

    The HTML version is requested first. If the website reports that it does
    not exist, the PDF version is requested instead. If neither is available,
    or a request fails, both values of the result are set to "NA".

    Args:
        celex_id (string): Celex number whose content needs to be extracted
        lang (string): Language of the Celex document

    Returns:
        dictionary: 'rawDocument': Raw HTML of the Celex document ("NA" for PDF documents)
                    'documentContent': Text of the Celex document (raw PDF bytes for PDF documents)
    """
    request_timeout = 60  # seconds; keeps a stalled connection from blocking the run forever
    missing_marker = 'The requested document does not exist.'

    # Result used when neither HTML nor PDF can be retrieved
    content_dict = {'rawDocument': "NA", 'documentContent': "NA"}

    # Tracking dictionary: which type of document (HTML / PDF / None) was found for the language
    tracking_dict = {'celex_id': celex_id, lang: "None"}

    # Preparing URLs based on given number & Language.
    url_html = f'https://eur-lex.europa.eu/legal-content/{lang}/TXT/HTML/?uri=CELEX:{celex_id}'
    url_pdf = f'https://eur-lex.europa.eu/legal-content/{lang}/TXT/PDF/?uri=CELEX:{celex_id}'

    try:
        # First try to get HTML information
        content_html = requests.get(url_html, timeout=request_timeout).text
        if missing_marker in content_html:
            # If there is no HTML available, then try to get PDF info.
            content_pdf = requests.get(url_pdf, timeout=request_timeout)
            if missing_marker not in content_pdf.text:
                # NOTE(review): documentContent holds raw PDF bytes here, not text.
                # Bytes are presumably not JSON serialisable when the result is indexed
                # later - confirm and add a PDF text extraction step if needed.
                content_dict = {'rawDocument': "NA", 'documentContent': content_pdf.content}
                tracking_dict[lang] = "PDF"
        else:
            parsed_html = BeautifulSoup(content_html, "html.parser")
            doc_container = None
            if "docHtml" in content_html:
                doc_container = parsed_html.find("div", {"id": "docHtml"})
            # Fall back to the whole page when there is no <div id="docHtml">,
            # including the case where "docHtml" only appears elsewhere in the page
            text_source = doc_container if doc_container is not None else parsed_html
            content_dict = {'rawDocument': content_html, 'documentContent': text_source.text}
            tracking_dict[lang] = "HTML"
    except Exception as error:
        # Best effort: a failed download is recorded as "NA". KeyboardInterrupt is
        # deliberately not caught so that the run can still be stopped by the user.
        logging.warning("Could not retrieve %s in %s: %s", celex_id, lang, error)

    logging.info(tracking_dict)
    sleep(1)

    return content_dict

In [8]:
def get_document_summary(lang, document_page):
    """
    Function extracts the summary of the Celex document

    Looks up the HTML download link of the summary for the given language on
    the summary overview page, downloads that page and returns its raw HTML
    together with its text content.

    Args:
        lang (string): Language of the summary that needs to be extracted
        document_page (BeautifulSoup): Parsed summary overview page of the document

    Returns:
        dictionary: 'rawSummary': Raw HTML of the Summary in the provided language
                    'summaryContent': Summary content of the document in the provided language
                    Both values are 'NA' when no usable summary link is found on the page.
    """
    not_available = {'rawSummary': 'NA', 'summaryContent': 'NA'}

    language_id = f'format_language_table_HTML_{lang}'
    summary_link = document_page.find('a', attrs={'id': language_id, 'class': 'piwik_download'}, href=True)
    if summary_link is None:
        # No download link for this language: report "not available" instead of failing
        logging.warning("No summary link with id %s found", language_id)
        return not_available

    href = summary_link['href']
    path_start = href.find("legal-content")
    if path_start == -1:
        # Slicing with -1 would keep only the last character and build a useless URL
        logging.warning("Unexpected summary link format: %s", href)
        return not_available

    # Drop whatever precedes 'legal-content' and build an absolute URL
    summary_url = 'https://eur-lex.europa.eu/' + href[path_start:]

    summary_html = requests.get(summary_url, timeout=60).text
    return {
        'rawSummary': summary_html,
        'summaryContent': BeautifulSoup(summary_html, "html.parser").text,
    }

In [9]:
def get_document_information(celex_list, langs=None):
    """
    Orchestrator function to extract the summary and document content for the provided Celex Number

    Args:
        celex_list (list): List of Celex number for which the summary and contents needs to be extracted
        langs (list, optional): Language codes to extract, for example
                ['BG', 'CS', 'DA', 'DE', 'EL', 'EN', 'ES', 'ET', 'FI', 'FR', 'GA', 'HR',
                 'HU', 'IT', 'LT', 'LV', 'MT', 'NL', 'PL', 'PT', 'RO', 'SK', 'SL', 'SV'].
                Defaults to ['EN', 'DE']. elastic_search_insert reads the 'EN' and 'DE'
                entries, so keep both when the result is passed on to it.

    Returns:
        list: One dictionary per Celex Number with the key '_id' (the Celex Number) and
                one key per language, each holding 'summaryInformation' and
                'documentInformation'
    """
    if langs is None:
        langs = ['EN', 'DE']

    details = []
    logging.info("Execution of Extraction of Summary for respective Celex Document - Started")

    # For Each CELEX_Number preparing the URL and extracting Info from Website
    for celex_id in celex_list:
        celex_document_information = {'_id': celex_id}

        for lang in langs:
            # Preparing URL for the summary of the Celex number
            document_url = f'https://eur-lex.europa.eu/legal-content/{lang}/LSU/?uri=CELEX:{celex_id}'
            document_request = requests.get(document_url)

            if 'No legislative summaries' in document_request.text:
                summary_data = {'rawSummary': 'NA', 'summaryContent': 'NA'}
            else:
                # Parse the overview page and follow the link to the summary itself
                document_page = BeautifulSoup(document_request.text, "html.parser")
                summary_data = get_document_summary(lang, document_page)

            celex_document_information[lang] = {
                'summaryInformation': summary_data,
                'documentInformation': get_document_content(celex_id, lang),
            }

            logging.info(f'Completed Extracting Information of {celex_id} for {lang}')
            sleep(1)

        details.append(celex_document_information)

    # Logged once, after every Celex Number has been processed
    logging.info("Execution of Extraction of Summary for respective Celex Document - Ended")

    return details

In [10]:
def elastic_search_mapping():
    """
    Builds the settings and mapping for the ElasticSearch or OpenSearch Index.

    For this project the mapping was created from JSON using
    https://json-to-es-mapping.netlify.app/

    The index uses one shard and no replicas. The fields "english" and "german"
    share the same layout: two nested groups, "documentInformation"
    (rawDocument, documentContent) and "summaryInformation"
    (rawSummary, summaryContent), whose fields are all of type keyword.

    NOTE(review): every field is a `keyword` with `ignore_above: 256`; values
    longer than 256 characters are presumably not indexed for search - confirm
    that this is intended for full documents and summaries.

    Returns:
        dictionary: Settings and mapping for the ElasticSearch or OpenSearch Index
    """
    def keyword_field():
        # A fresh dictionary per field, so no two fields share one object
        return {"type": "keyword", "ignore_above": 256}

    def nested_group(*field_names):
        # Nested object whose fields are all keyword fields
        return {
            "type": "nested",
            "properties": {field_name: keyword_field() for field_name in field_names},
        }

    def language_section():
        # Layout shared by every language entry of the index
        return {
            "type": "nested",
            "properties": {
                "documentInformation": nested_group("rawDocument", "documentContent"),
                "summaryInformation": nested_group("rawSummary", "summaryContent"),
            },
        }

    return {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
        },
        "mappings": {
            "properties": {
                language: language_section() for language in ("english", "german")
            },
        },
    }

In [11]:
def elastic_search_create(es_index, index_name, es_mapping):
    """
    Creation of the Index if not present in the cluster

    If the index is already present the function takes no action,
    otherwise the index is created with the provided mapping.

    Args:
        es_index: ElasticSearch or OpenSearch connection
        index_name (string): Name of the index that needs to be created
        es_mapping (dictionary): Settings and mapping of the index that needs to be created

    Returns:
        None
    """
    if es_index.indices.exists(index=index_name):
        return

    # ignore=[400, 404] asks the client not to raise for these HTTP status codes
    response = es_index.indices.create(index=index_name, ignore=[400, 404], body=es_mapping)

    # Because those status codes are ignored, a rejected request would otherwise go
    # unnoticed; report it when the client hands back an error body.
    if isinstance(response, dict) and 'error' in response:
        logging.warning("Creation of index %s was rejected: %s", index_name, response['error'])

In [12]:
def elastic_search_insert(es_index, index_name, celex_information):
    """
    Insert the documents in the ElasticSearch or OpenSearch Index

    Each entry is stored under its Celex Number as unique ID (_id). The 'EN'
    entry is written to the field "english" and the 'DE' entry to the field
    "german". A failing insert is retried up to five times with a growing
    pause (5, 10, 15, 20, 25 seconds); after that the document is skipped and
    the function continues with the next one.

    Args:
        es_index: ElasticSearch or OpenSearch connection
        index_name (string): Name of the index that receives the documents
        celex_information (list): Dictionaries with the keys '_id', 'EN' and 'DE',
                                  as produced by get_document_information

    Returns:
        None
    """
    max_retries = 5
    # Index field name -> language key of the extracted information
    language_fields = (("english", "EN"), ("german", "DE"))

    for celex_document in celex_information:
        doc = {
            field_name: {
                "documentInformation": {
                    "rawDocument": celex_document[lang]['documentInformation']['rawDocument'],
                    "documentContent": celex_document[lang]['documentInformation']['documentContent'],
                },
                "summaryInformation": {
                    "rawSummary": celex_document[lang]['summaryInformation']['rawSummary'],
                    "summaryContent": celex_document[lang]['summaryInformation']['summaryContent'],
                },
            }
            for field_name, lang in language_fields
        }
        _id = celex_document['_id']

        retries = 0
        while True:
            try:
                es_index.index(index=index_name, body=doc, id=_id)
                break
            except Exception as error:
                if retries == max_retries:
                    # Record the reason as well, otherwise the failure cannot be diagnosed
                    logging.error("Indexing document %s failed: %s", _id, error)
                    print('Indexing document \'{}\' failed after {} retries. Aborting!'.format(_id, max_retries))
                    break
                retries += 1
                logging.warning("Indexing document %s failed (retry %s of %s): %s", _id, retries, max_retries, error)
                # Wait longer after every failed attempt
                sleep(retries * 5)

In [13]:
def elastic_search_existing_check(es_index, index_name, celex_list):
    """
    Check which documents are not yet present in the index

    Args:
        es_index: ElasticSearch or OpenSearch connection
        index_name (string): Name of the index that is checked
        celex_list (list): List of the celex number for which the summary and content needs to be extracted

    Returns:
        list: All the celex number of celex_list, in their original order, that are
              not present in the ElasticSearch or OpenSearch index
    """
    return [
        celex_id
        for celex_id in celex_list
        if not es_index.exists(index=index_name, id=celex_id)
    ]

In [18]:
if __name__ == '__main__':
    # Imported here because it is only needed for the interactive password prompt
    from getpass import getpass

    # Configuring the File name to logging Level
    logging.basicConfig(filename=extraction_logs, level=logging.INFO)

    # Input from the User:
    # URL of the Domain specific Legal Acts, for example: Energy, Agriculture, Taxation, and other
    # For Example:
    #   1. From Legal Acts: https://eur-lex.europa.eu/browse/directories/legislation.html
    #   2. If Legal Acts for Energy Domain is required
    #   3. Provided URL will be: https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile

    provided_url = input('Provide the URL: ')

    start_time = time()
    logging.info("Current date and time: " + str(start_time))

    # Elastic Search Index
    index_name = input('Provide the Elastic Search Index Name: ')

    # Ask for the server until one of the two supported names is entered
    while True:
        # Instance of Elastic Search
        server = input('Server of the Elastic Search Index (localhost or Uniheidelberg): ')
        if server.lower() == "localhost":
            es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
            break
        elif server.lower() == "uniheidelberg":
            user_name = input('University Heidelberg Elastic Search Username: ')
            # getpass does not echo the password, so it does not end up in the cell output
            password = getpass('University Heidelberg Elastic Search Password: ')
            # NOTE(review): ssl_assert_hostname=False switches off the host name check of the
            # server certificate - confirm that this is really required for this server.
            es = OpenSearch(
                hosts=[{'host': 'elastic-dbs.ifi.uni-heidelberg.de', 'port': 443}],
                http_auth=(user_name, password),
                use_ssl=True,
                verify_certs=True,
                ssl_assert_hostname=False,
                ssl_show_warn=False
            )
            break

    es_index_mapping = elastic_search_mapping()
    elastic_search_create(es, index_name, es_index_mapping)

    # Calling the Function for the given CELEX_Numbers
    list_celex_number = celex_main(provided_url)

    # Only Celex Numbers that are not yet stored in the index are extracted
    non_existing_celex_number = elastic_search_existing_check(es, index_name, list_celex_number)

    # Calling the Function to extract the metadata for the list of celex numbers
    celex_information = get_document_information(non_existing_celex_number)

    elastic_search_insert(es, index_name, celex_information)

    end_time = time()
    logging.info("Current date and time: " + str(end_time))
    # Elapsed seconds: end minus start (the reverse order yields a negative duration)
    logging.info("Time for Execution of Script: " + str(end_time - start_time))

KeyboardInterrupt: Interrupted by user