In [1]:
from requests import request
import json
from os import makedirs, path
from geopy.geocoders import Nominatim
from copy import deepcopy
import time

In [8]:
def get_initial_data(urls, filenames):
    """
    Download the json files about courses, organizational structure
    and staff from dati.trentino.it

    Every successful download is parsed as JSON, saved through
    ``save_dataset`` and collected in the returned list. Failed downloads
    are reported on stdout and skipped.

    :param urls: Iterable of URLs to download
    :param filenames: Sequence of dataset names, parallel to ``urls``; the
        name at the same position as the URL is used when saving the file
    :return: List with the parsed JSON of every successful download, in
        request order
    """
    downloaded_data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:106.0) Gecko/20100101 Firefox/106.0'}

    for index, url in enumerate(urls):
        print(f"Downloading {url}...")
        res = request('get', url, headers=headers)
        if not res.ok:
            # TODO: add better error control with an exception
            print(f"Cannot download data from {url}.")
            continue

        file_to_json = json.loads(res.text)  # Convert string to JSON

        # Save file
        save_dataset(file_to_json, filenames[index], 'json')

        downloaded_data.append(file_to_json)

    return downloaded_data
    
def clean_string(input_string, mode=None):
    """
    Utility function to clean the input string.
    The filters are chosen by hand when scraping the values.
    NOTE: Only add filters, don't remove them

    The string is stripped first, then every filter is applied in order.

    :param input_string: String to be cleaned
    :param mode: When equal to 'prof', non-breaking spaces are turned into
        regular spaces; for any other value they are removed
    :return: Cleaned string
    """
    nbsp_replacement = " " if mode == 'prof' else ""

    # Order matters: the combined "\xa0 + mojibake" sequence has to be
    # removed before the single non-breaking space filter runs.
    filters = (
        ("\xa0â€‹", ""),
        ("â€‹", ""),
        ("\n", ""),
        ("\t", ""),
        ("\xa0", nbsp_replacement),
        ("\u200b", ""),
        ("\u00e0", "à"),
    )

    cleaned = input_string.strip()
    for old, new in filters:
        cleaned = cleaned.replace(old, new)
    return cleaned


def initialize_dataset():
    """
    Return an object to use for initialization of a new dataset.

    A fresh dictionary (with its own empty 'data' list) is built on
    every call.

    :return: Initial dictionary for any dataset
    """
    envelope = dict(total=0, size=0, language="en", data=[])
    return {"value": envelope}


def append_data(dataset, to_append):
    """
    Append one entry to the 'data' list of a dataset, in place.

    :param dataset: Dataset to append data to
    :param to_append: Data to append
    """
    records = dataset['value']['data']
    records.append(to_append)


def save_dataset(dataset, name, file_format):
    """
    Write the dataset to ``../datasets/<name>.<file_format>``.

    Missing parent directories are created. Only the 'json' format is
    serialized; for any other format the file is still created (or
    truncated) but nothing is written to it.

    :param dataset: Dataset to save
    :param name: Name of the dataset (may contain sub-directories)
    :param file_format: Format the dataset should be saved in
    """
    target = f'../datasets/{name}.{file_format}'
    target_dir = path.dirname(target)
    makedirs(target_dir, exist_ok=True)

    is_json = file_format == 'json'
    with open(target, 'w', encoding='utf-8') as out:
        if is_json:
            json.dump(dataset, out, indent=2)


def set_total_size(dataset):
    """
    Store the number of entries of the 'data' list in both the 'total'
    and the 'size' fields of the dataset, in place.

    :param dataset: Dataset to manipulate
    """
    payload = dataset['value']
    count = len(payload['data'])
    payload['total'] = count
    payload['size'] = count

def get_address_information(address):
    """
    Fetch and return the address information from OpenStreetMap

    The address is geocoded through Nominatim, then the OSM element the
    geocoder points to is downloaded from the OSM API v0.6.

    :param address: Address to look up in OSM
    :return: Dictionary with the information for the input address, or an
        empty dictionary when the address cannot be geocoded or the
        element cannot be downloaded
    """
    # Check the Nominatim TOS before using this, it allows maximum 1 request per second
    # https://operations.osmfoundation.org/policies/nominatim/
    # Also check the OSM wiki regarding the API
    # https://wiki.openstreetmap.org/wiki/API_v0.6
    geolocator = Nominatim(
        user_agent="Mozilla/5.0 (Windows NT 10.0; rv:105.0) Gecko/20100101 Firefox/105.0")
    location = geolocator.geocode(query=address)
    if not location:
        return {}

    osm_id = location.raw.get('osm_id')
    if osm_id is None:
        # Nothing to look up: avoid a request that can only fail
        return {}

    # A Nominatim result can be a node, a way or a relation: query the
    # endpoint of the reported element type instead of always assuming a
    # way (ids are only unique per type, so the wrong endpoint may return
    # an unrelated element). 'way' is kept as fallback.
    osm_type = location.raw.get('osm_type') or 'way'

    r = request('get', f"https://www.openstreetmap.org/api/0.6/{osm_type}/{osm_id}.json", headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:106.0) Gecko/20100101 Firefox/106.0'})
    if not r.ok:
        return {}
    return json.loads(r.text)


def get_geospatial_data(dep_data, addresses):
    """
    Fetch and save the geospatial data about Uni departments from OpenStreetMap

    Every address is looked up with ``get_address_information``; the
    selected OSM tags (empty string when a tag is missing) plus the element
    timestamp are stored per address. Addresses for which no usable OSM
    element is found are left out. The result is saved as
    'generated/buildings'.

    :param dep_data: departments_en file (not used by this function, kept
        for backward compatibility)
    :param addresses: Iterable of addresses to use
    """
    print("Getting addresses from OpenStreetMap...")
    osm_data = initialize_dataset()
    # A tuple (not a set) so that the tag order in the saved file is the
    # same on every run.
    tags_to_use = ('addr:city',
                   'addr:country',
                   'addr:housenumber',
                   'addr:postcode',
                   'addr:street',
                   'alt_name',
                   'amenity',
                   'email',
                   'long_name',
                   'name',
                   'name:en',
                   'name:it',
                   'old_name',
                   'opening_hours',
                   'phone',
                   'short_name',
                   'website',
                   'wheelchair')
    problematic_addresses = set()
    for address in addresses:
        info = get_address_information(address)
        elements = info.get('elements') if info else None
        if elements:
            element = elements[0]
            # Untagged elements may come without a 'tags' key
            element_tags = element.get('tags', {})
            tags = {tag: element_tags.get(tag, '') for tag in tags_to_use}
            tags['timestamp'] = element['timestamp']
            append_data(osm_data, {'address': address, 'osm_tags': tags})
        else:
            problematic_addresses.add(address)
        # Stay below the Nominatim limit of one request per second
        time.sleep(1.5)

    # Drop every entry whose address failed at least once
    osm_data['value']['data'] = [
        building for building in osm_data['value']['data']
        if building['address'] not in problematic_addresses]

    set_total_size(osm_data)
    save_dataset(osm_data, 'generated/buildings', 'json')


In [27]:
# Build the final organization dataset: one record per organization and
# phone number, with single-valued email/website fields.
with open('../datasets/original/organizational_unit_en.json', encoding='utf-8') as fp:
    organizations = json.load(fp)
organization_dataset = initialize_dataset()
addresses = set()
for organization in organizations['value']['data']:
    # Keep only the first email / website, or '' when there is none
    # (presumably both fields are lists in the source file - verify)
    organization['email'] = organization['email'][0] if organization['email'] else ''
    organization['website'] = organization['website'][0] if organization['website'] else ''
    organization['id'] = organization['identifier']

    del organization['unitPath'], organization['identifier']
    addresses.add(organization['address'])

    if organization['phone']:
        # One copy of the organization per phone number
        for phone in organization['phone']:
            to_append = deepcopy(organization)
            to_append['phoneNumber'] = phone
            del to_append['phone']

            append_data(organization_dataset, to_append)
    else:
        organization['phoneNumber'] = ''
        del organization['phone']
        append_data(organization_dataset, organization)

set_total_size(organization_dataset)
# discard() instead of remove(): no KeyError when every organization has
# a non-empty address
addresses.discard('')

save_dataset(organization_dataset,
             'generated/organization_en_final', 'json')

#get_geospatial_data(organizations, addresses)