# Extraction of data themes from the EU vocabularies

This notebook extracts the data themes. The data source is the XML file available on the [data themes classification page](https://op.europa.eu/en/web/eu-vocabularies/dataset/-/resource?uri=http://publications.europa.eu/resource/dataset/data-theme).

ASSUMPTIONS:
- The XML file **data-theme.xml** must be available in the same directory as this notebook


In [11]:
import xml.etree.ElementTree as ET
import csv
import pandas as pd

## 1. Open the XML file: data-theme.xml

In [12]:
def open_file(file_name):
    """
    Parse the XML document given by file_name.

    Returns a (tree, root) pair: the parsed ElementTree and its root element.
    """
    parsed = ET.parse(file_name)
    return parsed, parsed.getroot()

## 2. Extract records

In [13]:
def check_deprecated(deprecated_tag):
    """
    Turn the value of a record's "deprecated" attribute into a 0/1 flag.

    Only the exact string "false" gives 0 (not deprecated); every other value gives 1 (deprecated).
    """
    return 0 if deprecated_tag == "false" else 1
        

In [14]:
def data_theme_extractor(child, target_tag):
    """
    Return the text of child when its tag equals target_tag; otherwise return the sentinel -1.
    """
    # guard clause: anything but the requested tag is reported as "no match"
    if child.tag != target_tag:
        return -1
    return child.text
        

In [15]:
def data_theme_name_extractor(child, target_tag):
    """
    Extract the data theme name written in one language.

    child: element whose "lg" attribute holds a language code.
    target_tag: language code to match against the "lg" attribute.

    Returns child.text when the "lg" attribute equals target_tag. Returns -1 when the
    attribute differs or when the element has no "lg" attribute at all.
    """
    attributes = child.attrib

    # An element without "lg" is simply "no match"; indexing attrib["lg"] directly
    # would raise KeyError instead of returning the documented -1.
    if "lg" not in attributes or attributes["lg"] != target_tag:
        return -1
    return child.text
        

In [16]:
def extract_info(root):
    """
    Extract the authority code and the English name of each non-deprecated record.

    root: parsed XML root element whose children are the records. Every record must
    carry a "deprecated" attribute (a KeyError is raised otherwise).

    Returns a list of dicts, one per non-deprecated record that has an English name.
    Each dict has the key "name" and, when the record has an "authority-code" child,
    the key "authority-code". If a record holds several English names, the first one
    is kept.
    """
    data_themes = []

    for record in root:
        if check_deprecated(record.attrib["deprecated"]):
            continue  # deprecated records are skipped entirely

        # Built fresh for every record so that a value found in one record can never
        # leak into another one, whatever order the children appear in.
        data_themes_info = {}

        for child in record:  # record sub-levels
            code = data_theme_extractor(child, "authority-code")  # the data theme code
            if code != -1:
                data_themes_info["authority-code"] = code

            if child.tag == "label":  # holds the theme name in the different languages
                for child2 in child:
                    name = data_theme_name_extractor(child2, "eng")  # English name only
                    if name != -1 and "name" not in data_themes_info:
                        data_themes_info["name"] = name

        # Append once, after all children were scanned, so the code is captured even
        # when the label comes before the authority-code element.
        if "name" in data_themes_info:
            data_themes.append(data_themes_info)

    return data_themes
    
    

## 3. Save the extracted information into a csv file

In [17]:
def save_to_csv(data_themes, filename):
    """
    Write the extracted data themes to a csv file, ordered by authority code.

    data_themes: table with the columns "authority-code" and "name"; it must provide
    sort_values() (main() passes a pandas DataFrame).
    filename: destination handed straight to to_csv().
    """
    output_columns = ["authority-code", "name"]

    # sort_values() returns a new, ordered table; only the two listed columns are
    # written and the index is left out.
    data_themes.sort_values("authority-code").to_csv(
        filename, columns=output_columns, index=False
    )

## 4. Main

In [18]:
def main(xml_file="data-theme.xml", output_filename="data_themes.csv"):
    """
    Run the whole extraction: parse the XML file, extract the data themes, save them as csv.

    xml_file: path of the XML file to read (defaults to "data-theme.xml").
    output_filename: path of the csv file to write (defaults to "data_themes.csv").

    Calling main() without arguments uses the default file names.
    """
    _, root = open_file(xml_file)  # only the root is needed; the tree itself is not used
    data_themes = pd.DataFrame(extract_info(root), columns=["authority-code", "name"])
    save_to_csv(data_themes, output_filename)

## 5. Execution

In [19]:
if __name__ == "__main__":

    # Run the full extraction (parse, extract, save) when this code is executed as the main module.
    main()

## LAB