# Extraction of EU corporate bodies classification from the EU vocabularies

This notebook extracts the EU categories of the EU organisations. The data source is the XML file available on the [corporate body classification page](https://op.europa.eu/en/web/eu-vocabularies/dataset/-/resource?uri=http://publications.europa.eu/resource/dataset/corporate-body).

ASSUMPTIONS:
- The XML file **corporatebodies.xml** must be available in the same directory as this notebook


In [1]:
import xml.etree.ElementTree as ET
import csv
import pandas as pd

## 1. Open the XML file: corporatebodies.xml

In [2]:
def open_file(file_name):
    """
    Parse the XML document designated by file_name.

    Returns a (tree, root) pair: the parsed ElementTree and its root element.
    """
    parsed = ET.parse(file_name)
    return parsed, parsed.getroot()

## 2. Extract records

In [3]:
def check_deprecated(deprecated_tag):
    """
    Tell whether a record is deprecated from the value of its "deprecated" attribute.

    Returns 0 only when the value is exactly the string "false"; any other value
    yields 1 (deprecated).
    """
    return 0 if deprecated_tag == "false" else 1
        

In [4]:
def EU_body_extractor(child, target_tag):
    """
    Return the text of child when its tag equals target_tag.

    Returns -1 if the tag of child does not match target_tag.
    """
    # guard clause: anything other than the requested tag yields the sentinel
    if child.tag != target_tag:
        return -1
    return child.text
        

In [5]:
def EU_classification_name_extractor(child, target_tag):
    """
    Return the text of child when its "lg" (language) attribute equals target_tag.

    child is an XML element; target_tag is the language code to look for
    (the caller in this notebook passes "eng").

    Returns -1 if the "lg" attribute is absent or does not match target_tag.
    """
    attributes = child.attrib

    # an element without an "lg" attribute simply does not match; it must not
    # abort the whole extraction with a KeyError
    if "lg" in attributes and attributes["lg"] == target_tag:
        return child.text
    return -1
        

In [6]:
def corporate_classification_extractor(child, target_tag):
    """
    Return the text of child (e.g. the <corporate.classification> element, used to
    tell whether a corporate body is an EU organisation) when its tag equals target_tag.

    Returns -1 if the tag of child does not match target_tag.
    """
    return child.text if child.tag == target_tag else -1
        

In [7]:
def extract_info(root):
    """
    Extract the authority code, the classification fields and the English name of
    each non-deprecated record found under root.

    root is iterated as a sequence of record elements, each carrying a
    "deprecated" attribute.

    Returns a list of dicts, one dict per non-deprecated record that has an English
    label. Possible keys: "authority-code", "corporate_classification",
    "corporate_body_classification" and "name"; a key is absent when the record
    does not contain the corresponding element.
    """
    EU_organisations = []

    for record in root:
        if check_deprecated(record.attrib["deprecated"]):
            continue  # deprecated records are ignored

        # One fresh dict per record: values can never leak from one record into
        # the next, whatever the order of the child elements.
        record_info = {}

        for child in record:  # records sub-levels
            info_extracted = EU_body_extractor(child, "authority-code")  # the body code
            if info_extracted != -1:
                record_info["authority-code"] = info_extracted

            # tells whether the corporate body is an EU organisation
            info_extracted = corporate_classification_extractor(child, "corporate.classification")
            if info_extracted != -1:
                record_info["corporate_classification"] = info_extracted

            info_extracted = EU_body_extractor(child, "corporate.body.classification")  # the EU body classification
            if info_extracted != -1:
                record_info["corporate_body_classification"] = info_extracted

            if child.tag == "label":  # the name of the body in english
                for child2 in child:
                    info_extracted = EU_classification_name_extractor(child2, "eng")
                    # keep the first English label only, so that a record never
                    # produces more than one output row
                    if info_extracted != -1 and "name" not in record_info:
                        record_info["name"] = info_extracted

        # Append once, after every child has been read; as before, a record
        # without an English label produces no row.
        if "name" in record_info:
            EU_organisations.append(record_info)

    return EU_organisations
    
    

## 3. Keep only the EU organisations

In [8]:
def filter_EU_organisations(corporate_bodies):
    """
    Turn the list of corporate bodies into a dataframe and keep the EU organisations.

    Returns a dataframe holding only the rows whose corporate_classification is "EU",
    with the index renumbered from 0.
    """
    column_order = ["authority-code", "name", "corporate_classification", "corporate_body_classification"]
    all_bodies = pd.DataFrame(corporate_bodies, columns=column_order)

    is_eu = all_bodies["corporate_classification"] == "EU"
    return all_bodies.loc[is_eu].reset_index(drop=True)

## 3. Save the extracted information into a csv file

In [9]:
def save_to_csv(EU_organisations, filename):
    """
    Write the extracted information to a csv file.

    The rows are ordered by authority-code and only the authority-code, name and
    corporate_body_classification columns are written, without the index.
    The dataframe passed in is left untouched. Returns None.
    """
    # columns written to the csv file, in output order
    fields = ["authority-code", "name", "corporate_body_classification"]

    # sort by authority-code, then write the selected columns
    EU_organisations.sort_values("authority-code").to_csv(filename, columns=fields, index=False)

## 4. Main

In [10]:
def main():
    """
    Run the whole pipeline: parse corporatebodies.xml, extract the records,
    keep the EU organisations and save them to EU_organisations.csv.
    """
    xml_file = "corporatebodies.xml"
    output_filename = "EU_organisations.csv"

    # only the root element is needed downstream
    _, root = open_file(xml_file)
    EU_organisations = filter_EU_organisations(extract_info(root))
    save_to_csv(EU_organisations, output_filename)

## 5. Execution

In [11]:
# Entry point: the pipeline only runs when __name__ is "__main__".
if __name__ == "__main__": 
  
    # run the extraction pipeline defined in main()
    main() 

## LAB

In [10]:
# Scratch cell: all extracted corporate bodies as a dataframe, before any filtering.
# NOTE(review): in the visible cells `corporate_bodies` is only assigned inside main(),
# so this cell presumably relies on a value left in the kernel namespace — confirm it
# survives Restart & Run All.
corporate_bodies_df = pd.DataFrame(corporate_bodies, columns=["authority-code", "name", "corporate_classification", "corporate_body_classification"])
corporate_bodies_df

Unnamed: 0,authority-code,name,corporate_classification,corporate_body_classification
0,EAEC,European Atomic Energy Community,EU,CB_EU
1,EURUN,European Union,EU,CB_EU
2,COM,European Commission,EU,EU_INST
3,EP,European Parliament,EU,EU_INST
4,EURCOU,European Council,EU,EU_INST
...,...,...,...,...
883,EP_INGE,Special Committee on Foreign Interference in a...,EU,EP_CMT
884,EP_AIDA,Special Committee on Artificial Intelligence i...,EU,EP_CMT
885,EP_BECA,Special Committee on Beating Cancer,EU,EP_CMT
886,EP_ANIT,Committee of Inquiry on the Protection of Anim...,EU,EP_CMT


In [11]:
# Rows for which corporate_classification is missing (NaN).
corporate_bodies_df[corporate_bodies_df["corporate_classification"].isna()]

Unnamed: 0,authority-code,name,corporate_classification,corporate_body_classification
415,DATPRO,Provisional data,,CB_OTHER


In [17]:
# Keep only the rows whose corporate_classification is "EU" and renumber the index from 0
# (same expression as the return value of filter_EU_organisations).
EU_organisations_df = corporate_bodies_df[corporate_bodies_df["corporate_classification"] == "EU"].reset_index(drop=True)
EU_organisations_df

Unnamed: 0,authority-code,name,corporate_classification,corporate_body_classification
0,EAEC,European Atomic Energy Community,EU,CB_EU
1,EURUN,European Union,EU,CB_EU
2,COM,European Commission,EU,EU_INST
3,EP,European Parliament,EU,EU_INST
4,EURCOU,European Council,EU,EU_INST
...,...,...,...,...
587,EP_INGE,Special Committee on Foreign Interference in a...,EU,EP_CMT
588,EP_AIDA,Special Committee on Artificial Intelligence i...,EU,EP_CMT
589,EP_BECA,Special Committee on Beating Cancer,EU,EP_CMT
590,EP_ANIT,Committee of Inquiry on the Protection of Anim...,EU,EP_CMT


In [22]:
# Display the EU organisations dataframe.
# NOTE(review): in the visible cells `EU_organisations` is only assigned inside main();
# presumably it was created interactively in the kernel — confirm.
EU_organisations

Unnamed: 0,authority-code,name,corporate_classification,corporate_body_classification
0,EAEC,European Atomic Energy Community,EU,CB_EU
1,EURUN,European Union,EU,CB_EU
2,COM,European Commission,EU,EU_INST
3,EP,European Parliament,EU,EU_INST
4,EURCOU,European Council,EU,EU_INST
...,...,...,...,...
587,EP_INGE,Special Committee on Foreign Interference in a...,EU,EP_CMT
588,EP_AIDA,Special Committee on Artificial Intelligence i...,EU,EP_CMT
589,EP_BECA,Special Committee on Beating Cancer,EU,EP_CMT
590,EP_ANIT,Committee of Inquiry on the Protection of Anim...,EU,EP_CMT


In [33]:
# Scratch version of the body of save_to_csv, run at top level.
# NOTE(review): `output_filename` and `EU_organisations` are only assigned inside main()
# in the visible cells; this cell presumably relies on kernel state — confirm.

# specifies the fields for csv file 
fields = ["authority-code", "name", "corporate_body_classification"]

# sorts by the authority-code column (returns a new dataframe, the input is not modified)
EU_organisations_sorted = EU_organisations.sort_values("authority-code")

# save as a csv file, writing only the columns listed in `fields` and no index column
EU_organisations_sorted.to_csv(output_filename, columns=fields, index=False)

In [30]:
# Display the EU organisations ordered by authority-code.
# sort_values returns a new dataframe, so `EU_organisations` keeps its original row
# order: the cell is idempotent and earlier displays of the frame do not become stale.
EU_organisations.sort_values("authority-code")

Unnamed: 0,authority-code,name,corporate_classification,corporate_body_classification
559,ACER,European Union Agency for the Cooperation of E...,EU,AGENCY_DEC
401,ACSH,Advisory Committee on Safety and Health at Work,EU,CB_EU
403,AEPPEPF,Authority for European Political Parties and E...,EU,CB_EU
60,AGRI,Directorate-General for Agriculture and Rural ...,EU,DIR_GEN
360,BANK_EURO19,National banks within the Eurosystem: National...,EU,CB_EU
...,...,...,...,...
541,SRSS,Structural Reform Support Service,EU,SERV_DEP
496,STECF,"Scientific, Technical and Economic Committee f...",EU,CB_EU
65,TAXUD,Directorate-General for Taxation and Customs U...,EU,DIR_GEN
71,TRADE,Directorate-General for Trade,EU,DIR_GEN
