In [1]:
import json
from os import makedirs, path
from copy import deepcopy

In [4]:
def clean_string(input_string, mode = None):
    """
    Strip the input string and apply the hand-picked cleaning filters to it.

    The filters were collected by hand while scraping the values.
    NOTE: Only add filters, don't remove them

    :param input_string: String to be cleaned
    :param mode: When equal to 'prof', non-breaking spaces are turned into
                 regular spaces; with any other value they are removed
    :return: Cleaned string
    """
    # Replacement used for a non-breaking space, the only mode-dependent filter.
    nbsp_substitute = " " if mode == 'prof' else ""

    # The filters run strictly in this order: longer sequences come before
    # the shorter sequences they contain.
    filters = (
        (u"\xa0â€‹", ""),
        (u"â€‹", ""),
        (u"\n", ""),
        (u"\t", ""),
        (u"\xa0", nbsp_substitute),
        (u"\u200b", ""),
        (u"\u00e0", "à"),
    )

    cleaned = input_string.strip()
    for unwanted, substitute in filters:
        cleaned = cleaned.replace(unwanted, substitute)
    return cleaned


def initialize_dataset():
    """
    Build the empty skeleton used to initialize a new dataset.

    A new dictionary (with its own empty 'data' list) is created on every
    call, so datasets never share state.

    :return: Initial dictionary for any dataset
    """
    content = {}
    content["total"] = 0
    content["size"] = 0
    content["language"] = "en"
    content["data"] = []

    skeleton = {}
    skeleton["value"] = content
    return skeleton


def append_data(dataset, to_append):
    """
    Add one entry at the end of the dataset's 'data' list (in place).

    Exists only to hide the nested lookup and keep the calling code readable.

    :param dataset: Dataset to append data to
    :param to_append: Data to append
    """
    records = dataset['value']['data']
    records.append(to_append)


def save_dataset(dataset, name, file_format):
    """
    Save the dataset given in input

    The file is written to ``../datasets/<name>.<file_format>`` (relative to
    the current working directory); missing parent directories are created.
    Only the 'json' format is currently supported.

    :param dataset: Dataset to save
    :param name: Name of the dataset
    :param file_format: Format the dataset should be saved in
    :raises ValueError: If file_format is not a supported format
    """
    # Validate before touching the filesystem: opening the target in 'w' mode
    # for an unknown format would create (or truncate) it and leave it empty.
    if file_format != 'json':
        raise ValueError(f"Unsupported file format: {file_format!r}")

    filename = f'../datasets/{name}.{file_format}'
    makedirs(path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2)


def set_total_size(dataset):
    """
    Update the 'total' and 'size' fields with the number of entries in 'data'.

    Both fields receive the same value; the dataset is modified in place.

    :param dataset: Dataset to manipulate
    """
    payload = dataset['value']
    entry_count = len(payload['data'])
    payload['total'] = entry_count
    payload['size'] = entry_count


In [2]:
# Load the original English persons dataset.
# The encoding is explicit so that decoding does not depend on the platform's
# locale default; UTF-8 matches what save_dataset uses when writing datasets.
with open('../datasets/original/person_en.json', 'r', encoding='utf-8') as f:
    persons = json.load(f)

In [3]:
# Collect every distinct role found in the 'position' entries of all persons.
positions = set()
for person in persons['value']['data']:
    positions.update(entry['role'] for entry in person['position'])

# Bare expression: displays the resulting set as the notebook cell output.
positions

{'Adjunct professor',
 'Alternate member',
 'Anti-corruption Officer',
 'Assistant professor',
 'Associate professor',
 'Chief digital transformation officer',
 'Collaborator',
 'Coordinator',
 'Data Protection Officer',
 'Deputy coordinator',
 'Deputy dean',
 'Director',
 'Director General',
 'Emeritus professor',
 "Employees' representative for safety issues",
 'Executive assistant to the Head of the Direction',
 'External staff',
 'Full professor',
 'Guest',
 'Head',
 'Implementation of the Development Plan',
 'Linguistic expert',
 'Member',
 'PhD student',
 'President',
 'Rector',
 "Rector's Delegate for musical activities",
 "Rector's delegate for Academic and International Solidarity",
 "Rector's delegate for Alumni",
 "Rector's delegate for Development Cooperation",
 "Rector's delegate for Disability Support",
 "Rector's delegate for Doctoral Programmes",
 "Rector's delegate for Environmental Sustainability",
 "Rector's delegate for IT Services and Technologies",
 "Rector's dele