In [2]:
##########################################################################################################
# Extract the summaries and contents of CELEX-numbered documents from the EUR-Lex website and store them
# in Elasticsearch (localhost) or OpenSearch (Uni Heidelberg)
#
# The code will:
# 1. Extract the list of CELEX numbers from the provided URL
#    Example provided URL: Legal Acts for the Energy domain (https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile)
#
# 2. For each language configured in the code
#   2a. Extract the summary of every CELEX document (raw HTML and text content)
#   2b. Extract the content of every CELEX document (raw HTML and text content)
#
# Features:
# The code can be customised for:
# 1. Any domain of legal acts from the EUR-Lex website (https://eur-lex.europa.eu/browse/directories/legislation.html)
# 2. Any language
##########################################################################################################

In [3]:
# Importing all required packages

from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import re
import os
import logging
import requests
from time import sleep, time

# For Uni Heidelberg Server
from opensearchpy import OpenSearch

# For Localhost
from elasticsearch import Elasticsearch

In [4]:
#####################################################################################################
# Directory Creation
# Output directory for the progress log of the script and the CSV of extracted Celex Numbers
#####################################################################################################
working_dir = os.getcwd()
directory = os.path.join(working_dir, 'Scrapped_Data')

# exist_ok=True keeps this cell safe to re-run and avoids the check-then-create race
os.makedirs(directory, exist_ok=True)

# CSV file that receives the extracted Celex Numbers
filename_celex = os.path.join(directory, 'Celex_Numbers.csv')

# File used to log the metadata extraction
extraction_logs = os.path.join(directory, 'Logs_Extracting_MetaData.log')

### Celex Number Extraction

In [5]:
def pages(provided_url):
    """Return the number of result pages of a EUR-Lex listing, plus one.

    The value is offset by one so that it can be used directly as the
    exclusive upper bound of ``range(1, ...)`` when iterating over pages.

    Input:
        provided_url: URL of the domain specific legal acts, for example Energy:
            https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile
            (domains are listed on https://eur-lex.europa.eu/browse/directories/legislation.html)

    Output:
        last_page_number: number taken from the ``page=`` parameter of the
            second pagination button, plus one. When that button or its page
            number cannot be found, 2 is returned (i.e. one page).
    """
    # Close the HTTP response deterministically instead of leaking the connection
    with urllib.request.urlopen(provided_url) as response:
        input_soup = BeautifulSoup(response.read(), 'lxml')

    page_number_indexes = input_soup.find_all('a', class_='btn btn-primary btn-sm')

    # The second matching button is read as the "last page" link.
    # NOTE(review): a listing without these buttons is assumed to consist of a
    # single page - confirm against the EUR-Lex markup.
    if len(page_number_indexes) < 2:
        logging.warning("No pagination links found in %s - assuming a single page", provided_url)
        return 2

    last_page_number_url = page_number_indexes[1].attrs.get('href', '')
    # Raw string: '\d' inside a normal string literal is an invalid escape sequence
    page_match = re.search(r'page=(\d+)', last_page_number_url, re.IGNORECASE)
    if page_match is None:
        logging.warning("No page number found in pagination link %r - assuming a single page",
                        last_page_number_url)
        return 2

    return int(page_match.group(1)) + 1

In [6]:
def get_celex(pages, provided_url):
    """Extract the Celex Numbers listed on the first pages of the provided URL.

    The user is asked interactively how many pages to scrape. Every collected
    Celex Number is also written to the CSV file ``filename_celex``.

    Input:
        pages: number of pages available plus one (the total shown to the user
            is ``pages - 1``); upper limit for the number of pages to scrape.
        provided_url: URL of the domain specific legal acts, for example Energy:
            https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile

    Output:
        CELEX_Numbers: list of Celex Numbers extracted from the scraped pages
    """
    CELEX_Numbers = []
    print(f'Total Number of Pages present in the provided URL: {pages - 1}')

    # Keep asking until the answer is a whole number not above the available pages
    while True:
        page_number = input('Number of Pages that needs to be considered for Document Extraction: ')
        try:
            pages_considered = int(page_number) + 1
        except ValueError:
            # A non-numeric answer used to abort the whole run; ask again instead
            print("Please enter the number of pages as a whole number")
            continue
        if pages_considered > pages:
            print("Number of pages entered is greater than the toal number of pageas present")
            continue
        break

    for page_index in range(1, pages_considered):
        # Pause between page requests
        sleep(1)

        # Read the result page and close the connection once it is downloaded
        with urllib.request.urlopen(provided_url + '&page=' + str(page_index)) as response:
            soup = BeautifulSoup(response.read(), 'lxml')

        # Celex Numbers are the <dd> values paired with a <dt> labelled 'CELEX number: '
        try:
            for column in soup.find_all("div", attrs={"class": "col-sm-6"}):
                labels = column.find_all("dt")
                values = column.find_all("dd")
                for label, value in zip(labels, values):
                    if label.text == 'CELEX number: ':
                        CELEX_Numbers.append(value.text)
        except Exception:
            # Still best effort (continue with the next page), but no longer silent
            logging.exception("Could not parse the Celex Numbers of page %s", page_index)

    # Saving all the CELEX Numbers into a CSV file (for further scraping)
    pd.DataFrame(CELEX_Numbers, columns=['celex_id']).to_csv(filename_celex, index=False)

    return CELEX_Numbers

In [7]:
def celex_main(provided_url):
    """Orchestrate the Celex Number extraction for one listing URL.

    First determines how many pages the listing has, then collects the
    Celex Numbers from it. Start and end of the step are written to the log.

    Input:
        provided_url: URL of the domain specific legal acts, for example Energy:
            https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile
            (domains are listed on https://eur-lex.europa.eu/browse/directories/legislation.html)

    Output:
        List of the Celex Numbers returned by ``get_celex`` for the URL
    """
    logging.info("Execution of Extraction of Celex Number - Started")

    # The page count is resolved first and handed to the extraction as its upper limit
    collected_numbers = get_celex(pages(provided_url), provided_url)

    logging.info("Execution of Extraction of Celex Number - Ended")
    return collected_numbers

### Celex Number Document Information Extraction

In [8]:
def get_document_content(celex_id, lang):
    """Download the content of one Celex document in one language.

    The HTML rendition is tried first; when EUR-Lex reports that it does not
    exist, the PDF rendition is tried. Which rendition was found (HTML / PDF /
    None) is written to the log, followed by a two second pause.

    Input:
        celex_id: Celex Number whose content needs to be extracted
        lang: language code of the Celex document (e.g. 'EN')

    Output:
        dict with:
            rawDocument: raw HTML of the page, or "NA" when only a PDF or
                nothing is available
            documentContent: text of the document (HTML), the raw PDF bytes
                (PDF), or "NA" when nothing is available or an error occurred
    """
    missing_marker = 'The requested document does not exist.'

    # Preparing URLs based on the given number & language
    urlHTML = f'https://eur-lex.europa.eu/legal-content/{lang}/TXT/HTML/?uri=CELEX:{celex_id}'
    urlPDF = f'https://eur-lex.europa.eu/legal-content/{lang}/TXT/PDF/?uri=CELEX:{celex_id}'

    # Defaults describe "nothing available"; they are overwritten on success
    document = "NA"
    documentContent = "NA"
    documentType = "None"

    try:
        contentHTML = requests.get(urlHTML).text
        if missing_marker not in contentHTML:
            parsedPage = BeautifulSoup(contentHTML, "html.parser")
            # Prefer the dedicated document container; when the page only mentions
            # "docHtml" without having such a <div>, use the whole page text
            # instead of discarding an available document
            container = parsedPage.find("div", {"id": "docHtml"}) if "docHtml" in contentHTML else None
            documentContent = (container if container is not None else parsedPage).text
            document = contentHTML
            documentType = "HTML"
        else:
            # No HTML available, so try the PDF rendition
            contentPDF = requests.get(urlPDF)
            if missing_marker not in contentPDF.text:
                # NOTE(review): these are raw bytes, not text - presumably they
                # cannot be JSON-serialised when the document is indexed; confirm.
                documentContent = contentPDF.content
                documentType = "PDF"
    except Exception:
        # Only ordinary errors are absorbed; KeyboardInterrupt/SystemExit propagate
        logging.exception("Extraction of %s (%s) failed", celex_id, lang)
        document = "NA"
        documentContent = "NA"
        documentType = "None"

    # Track which type of document was found for this language
    logging.info({'celex_id': celex_id, lang: documentType})
    sleep(2)

    return {'rawDocument': document, 'documentContent': documentContent}

In [9]:
def get_document_summary(lang, documentPage):
    """Extract the summary of a Celex document in the requested language.

    The summary overview page is searched for the HTML download link of the
    language; the linked page is then downloaded.

    Input:
        lang: language code of the Celex document (e.g. 'EN')
        documentPage: parsed summary overview page of the Celex document
            (an object offering BeautifulSoup's ``find``)

    Output:
        dict with:
            rawSummary: raw HTML of the summary in the provided language
            summaryContent: text extracted from that HTML
        Both values are 'NA' when the page has no usable link for the language.
    """
    languageId = f'format_language_table_HTML_{lang}'
    summaryLink = documentPage.find('a', attrs={'id': languageId, 'class': 'piwik_download'}, href=True)

    # find() returns None when there is no link for this language; subscripting
    # it would raise a TypeError
    if summaryLink is None:
        logging.info("No %s summary link found", lang)
        return {'rawSummary': 'NA', 'summaryContent': 'NA'}

    href = summaryLink['href']
    pathStart = href.find("legal-content")
    # Without the marker the slice below would start at index -1 and build a bogus URL
    if pathStart == -1:
        logging.info("Unexpected %s summary link %r", lang, href)
        return {'rawSummary': 'NA', 'summaryContent': 'NA'}

    # Only the part from "legal-content" onwards is kept and appended to the site root
    summaryURL = 'https://eur-lex.europa.eu/' + href[pathStart:]

    summaryHTML = requests.get(summaryURL).text
    return {
        'rawSummary': summaryHTML,
        'summaryContent': BeautifulSoup(summaryHTML, "html.parser").text,
    }

In [10]:
def get_document_information(celexList, langs=None):
    """Extract summary and content of every Celex document from the Eur-lex website.

    Input:
        celexList: list of Celex Numbers whose information needs to be extracted
        langs: language codes to extract; defaults to ['EN', 'DE'].
            Other EUR-Lex codes: 'BG', 'CS', 'DA', 'EL', 'ES', 'ET', 'FI', 'FR',
            'GA', 'HR', 'HU', 'IT', 'LT', 'LV', 'MT', 'NL', 'PL', 'PT', 'RO',
            'SK', 'SL', 'SV'. Note that elastic_search_insert reads only the
            'EN' and 'DE' entries.

    Output:
        details: one dict per Celex Number of the form
            {'_id': <celex>, <lang>: {'summaryInformation': {...},
                                      'documentInformation': {...}}, ...}
    """
    if langs is None:
        langs = ['EN', 'DE']

    details = []
    logging.info("Execution of Extraction of Metadata for respective Celex Document - Started")

    # For each CELEX number prepare the URL and extract the information from the website
    for celexId in celexList:
        celexDocumentInformation = {'_id': celexId}

        for lang in langs:
            # Summary overview page of that CELEX number
            documentUrl = f'https://eur-lex.europa.eu/legal-content/{lang}/LSU/?uri=CELEX:{celexId}'
            r = requests.get(documentUrl)

            if 'No legislative summaries' in r.text:
                # A fresh dict per language: assigning into the previous
                # `summaryData` either failed (nothing assigned yet) or
                # overwrote the summary already stored for another language
                summaryData = {'rawSummary': 'NA', 'summaryContent': 'NA'}
            else:
                documentPage = BeautifulSoup(r.text, "html.parser")
                try:
                    summaryData = get_document_summary(lang, documentPage)
                except Exception:
                    # One unreadable summary must not discard everything scraped so far
                    logging.exception("Summary extraction failed for %s (%s)", celexId, lang)
                    summaryData = {'rawSummary': 'NA', 'summaryContent': 'NA'}

            celexDocumentInformation[lang] = {
                'summaryInformation': summaryData,
                'documentInformation': get_document_content(celexId, lang),
            }

            logging.info(" Completed Extracting MetaData Information for : " + str(celexId))
            sleep(2)

        details.append(celexDocumentInformation)

    # Logged once after all documents (it used to be logged after every document)
    logging.info("Execution of Extraction of Metadata for respective Celex Document - Ended")

    return details

In [11]:
def elastic_search_mapping():
    """Build the settings and mapping for the ElasticSearch or OpenSearch index.

    For this project the mapping was created from JSON using
    https://json-to-es-mapping.netlify.app/

    Input:
        None - the function only assembles the mapping object.

    Output:
        esMapping: dict with the index settings (1 shard, 0 replicas) and, for
            "english" and "german" alike, nested "documentInformation" and
            "summaryInformation" objects whose fields are keywords with
            ``ignore_above`` 256.
    """
    def keyword_fields(*field_names):
        # One independent keyword definition per field
        return {name: {"type": "keyword", "ignore_above": 256} for name in field_names}

    def nested(properties):
        return {"type": "nested", "properties": properties}

    def language_section():
        # Both languages share this layout; a new structure is built on every call
        return nested({
            "documentInformation": nested(keyword_fields("rawDocument", "documentContent")),
            "summaryInformation": nested(keyword_fields("rawSummary", "summaryContent")),
        })

    return {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
        },
        "mappings": {
            "properties": {language: language_section() for language in ("english", "german")},
        },
    }

In [12]:
def elastic_search_create(esIndex, indexName, esMapping):
    """Create the index if it is not present in the cluster.

    Input:
        esIndex: ElasticSearch or OpenSearch connection
        indexName: name of the index that needs to be created
        esMapping: settings/mapping of the index that needs to be created

    Output:
        None. An existing index is left untouched; a missing one is created.
        A creation request answered with an error body is written to the log.
    """
    if esIndex.indices.exists(index=indexName):
        return

    # NOTE(review): with ignore=[400, 404] the client is expected to hand back
    # the error body instead of raising - confirm for the installed client version.
    response = esIndex.indices.create(index=indexName, ignore=[400, 404], body=esMapping)

    try:
        rejected = 'error' in response
    except TypeError:
        # Response object that does not support membership tests
        rejected = False

    if rejected:
        # Make an ignored 400/404 visible instead of continuing silently
        logging.warning("Creation of index '%s' was rejected: %s", indexName, response)

In [13]:
def elastic_search_insert(esIndex, indexName, celexInformation, max_retries=5, backoff_seconds=5):
    """Insert the extracted documents into the ElasticSearch or OpenSearch index.

    Input:
        esIndex: ElasticSearch or OpenSearch connection
        indexName: name of the index that receives the documents
        celexInformation: list of dicts, each with the keys '_id', 'EN' and 'DE';
            'EN'/'DE' hold 'documentInformation' (rawDocument, documentContent)
            and 'summaryInformation' (rawSummary, summaryContent)
        max_retries: number of additional attempts after a failed indexing call
        backoff_seconds: the pause before retry k is ``k * backoff_seconds``

    Output:
        None. Every document is indexed with its Celex Number as unique ID (_id).
        A document that still fails after all retries is reported and skipped.
    """
    # Index field name -> language code in the extracted information
    languageFields = (("english", "EN"), ("german", "DE"))

    for celexDocument in celexInformation:
        doc = {
            field: {
                "documentInformation": {
                    "rawDocument": celexDocument[code]['documentInformation']['rawDocument'],
                    "documentContent": celexDocument[code]['documentInformation']['documentContent'],
                },
                "summaryInformation": {
                    "rawSummary": celexDocument[code]['summaryInformation']['rawSummary'],
                    "summaryContent": celexDocument[code]['summaryInformation']['summaryContent'],
                },
            }
            for field, code in languageFields
        }
        _id = celexDocument['_id']

        retries = 0
        while True:
            try:
                esIndex.index(index=indexName, body=doc, id=_id)
                break
            except Exception as error:
                if retries >= max_retries:
                    # Report the real cause; processing continues with the next document
                    logging.error("Indexing document '%s' failed: %r", _id, error)
                    print('Indexing document \'{}\' failed after {} retries. Skipping it!'.format(_id, max_retries))
                    break
                retries += 1
                logging.warning("Indexing document '%s' failed (%r), retry %s of %s",
                                _id, error, retries, max_retries)
                # Linearly growing pause before the next attempt
                sleep(retries * backoff_seconds)

In [14]:
def elastic_search_existing_check(esIndex, indexName, celexList):
    """Filter the Celex Numbers down to those not yet stored in the index.

    Input:
        esIndex: ElasticSearch or OpenSearch connection
        indexName: name of the index that is queried
        celexList: Celex Numbers for which summary and content are to be extracted

    Output:
        List, in input order, of the Celex Numbers for which the existence
        lookup in the index answered False.
    """
    # One existence lookup per Celex Number; only an answer equal to False
    # marks the document as missing
    return [
        celexNumber
        for celexNumber in celexList
        if esIndex.exists(index=indexName, id=celexNumber) == False
    ]

In [20]:
if __name__ == '__main__':
    # Entry point: asks for the listing URL and the target index, scrapes the
    # Celex documents that are not yet indexed and stores them in the index.

    # Local imports: only this entry point needs them
    from getpass import getpass
    from time import ctime

    # Configuring the log file and the logging level
    logging.basicConfig(filename=extraction_logs, level=logging.INFO)

    # Input from the User:
    # URL of the Domain specific Legal Acts, for example: Energy, Agriculture, Taxation, and other
    # For Example:
    #   1. From Legal Acts: https://eur-lex.europa.eu/browse/directories/legislation.html
    #   2. If Legal Acts for Energy Domain is required
    #   3. Provided URL will be: https://eur-lex.europa.eu/search.html?type=named&name=browse-by:legislation-in-force&CC_1_CODED=12&displayProfile=allRelAllConsDocProfile

    providedUrl = input('Provide the URL: ')

    startTime = time()
    # Log a readable timestamp rather than the raw epoch float
    logging.info("Current date and time: " + ctime(startTime))

    # Elastic Search Index
    indexName = input('Provide the Elastic Search Index Name: ')

    while True:
        # Instance of Elastic Search
        server = input('Server of the Elastic Search Index (localhost or Uniheidelberg): ')
        if server.lower() == "localhost":
            es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
            break
        elif server.lower() == "uniheidelberg":
            userName = input('University Heidelberg Elastic Search Username: ')
            # getpass keeps the password out of the terminal / notebook output
            password = getpass('University Heidelberg Elastic Search Password: ')
            # NOTE(review): ssl_assert_hostname=False looks like it switches off the
            # hostname check although certificates are verified - confirm it is needed.
            es = OpenSearch(hosts = [{'host': 'elastic-dbs.ifi.uni-heidelberg.de', 'port': 443}],
            http_auth =(userName, password),
            use_ssl = True,
            verify_certs = True,
            ssl_assert_hostname = False,
            ssl_show_warn = False
            )
            break
        else:
            # Tell the user why the question is asked again
            print('Unknown server, please enter "localhost" or "Uniheidelberg"')

    esIndexMapping = elastic_search_mapping()
    elastic_search_create(es, indexName, esIndexMapping)

    # Extract the Celex Numbers listed under the provided URL
    listCelexNumber = celex_main(providedUrl)

    # Only documents that are not yet in the index are scraped
    nonExistingCelexNumber = elastic_search_existing_check(es, indexName, listCelexNumber)

    # Extract summary and content for the remaining Celex Numbers
    celexInformation = get_document_information(nonExistingCelexNumber)

    elastic_search_insert(es, indexName, celexInformation)

    endTime = time()
    logging.info("Current date and time: " + ctime(endTime))
    # Elapsed seconds are end minus start (the operands used to be swapped)
    logging.info("Time for Execution of Script: " + str(endTime - startTime))

AuthenticationException: AuthenticationException(401, '')