In [1]:
from requests import request
import json
from os import makedirs, path
from geopy.geocoders import Nominatim

In [2]:
# Load the generated buildings dataset. The encoding is pinned to UTF-8 (the
# encoding save_dataset writes with) instead of relying on the platform default.
with open("../datasets/generated/buildings.json", 'r', encoding='utf-8') as fp:
    buildings = json.load(fp)

In [3]:
def save_dataset(dataset, name, file_format):
    """
    Save the dataset given in input to ``../datasets/{name}.{file_format}``

    Missing parent directories are created. Only the ``json`` format is
    supported: the format is validated before the file is opened, so an
    unsupported format can no longer leave behind an empty (or truncated) file.

    :param dataset: Dataset to save (must be JSON-serialisable)
    :param name: Name of the dataset, may include sub-directories
    :param file_format: Format the dataset should be saved in (only 'json')
    :raises ValueError: If ``file_format`` is not a supported format
    """
    # Fail before touching the filesystem: opening with 'w' truncates the target.
    if file_format != 'json':
        raise ValueError(f"Unsupported file format: {file_format!r}")
    filename = f'../datasets/{name}.{file_format}'
    makedirs(path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2)

def set_total_size(dataset):
    """
    Update the 'total' and 'size' counters of a dataset dictionary

    Both fields, stored under ``dataset['value']``, are set to the number of
    entries currently in ``dataset['value']['data']``.

    :param dataset: Dataset to manipulate (modified in place)
    """
    payload = dataset['value']
    entry_count = len(payload['data'])
    for field in ('total', 'size'):
        payload[field] = entry_count

def get_address_information(address):
    """
    Fetch and return the address information from OpenStreetMap

    The address is geocoded with Nominatim, then the matching OSM element is
    downloaded from the OSM API v0.6.

    :param address: Address to look up in OSM
    :return: Dictionary with the information for the input address, or an
        empty dictionary when the address cannot be geocoded, the geocoding
        result carries no OSM id, or the OSM API request is not successful
    """
    # Check the Nominatim TOS before using this, it allows maximum 1 request per second
    # https://operations.osmfoundation.org/policies/nominatim/
    # Also check the OSM wiki regarding the API
    # https://wiki.openstreetmap.org/wiki/API_v0.6
    # NOTE(review): the usage policy linked above asks for a User-Agent that
    # identifies the application; the browser-like strings below should be
    # replaced with a project-specific one.
    geolocator = Nominatim(
        user_agent="Mozilla/5.0 (Windows NT 10.0; rv:105.0) Gecko/20100101 Firefox/105.0")
    query = geolocator.geocode(query=address)
    # geocode() yields None when nothing matches, so the result must be checked
    # before `.raw` is touched (no reverse lookup is made: it was only debug
    # output and cost one extra request per address).
    if not query:
        return {}

    osm_id = query.raw.get('osm_id')
    if osm_id is None:
        return {}
    # An address may resolve to a node or a relation as well as a way; asking
    # for a way with another element's id would fetch an unrelated object.
    osm_type = query.raw.get('osm_type') or 'way'
    r = request('get', f"https://www.openstreetmap.org/api/0.6/{osm_type}/{osm_id}.json", headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:106.0) Gecko/20100101 Firefox/106.0'})
    if not r.ok:
        return {}
    return json.loads(r.text)

def append_data(dataset, to_append):
    """
    Add one entry to the end of the dataset's ``['value']['data']`` list

    Small readability helper that hides the nested lookup from the caller.

    :param dataset: Dataset to append data to (modified in place)
    :param to_append: Data to append
    """
    records = dataset['value']['data']
    records.append(to_append)

In [4]:
# Addresses of the buildings that have no OSM tags
problematic_addresses = [building['address'] for building in buildings['value']['data'] if not building['osm_tags']]

# Drop every building that shares an address with a problematic one. The list
# is filtered in a single pass with set lookups and written back through a
# slice assignment, so the same list object is updated in place and no loop
# counter is left behind in the kernel namespace.
addresses_to_drop = set(problematic_addresses)
buildings['value']['data'][:] = [
    building for building in buildings['value']['data']
    if building['address'] not in addresses_to_drop
]
problematic_addresses

['Via Calepina, 14, Trento, TN, Italia',
 'Via Giuseppe Verdi, 7 38122 Trento TN',
 'Via Giuseppe Verdi, 8, Trento, 38122 TN',
 'Corso Bettini, 43 - 38068 Rovereto',
 'Via Giuseppe Verdi, 6, Trento, TN',
 'Via Calepina, 14, Trento, TN',
 'Via Tomaso Gar, 14, Trento, TN',
 'Via Calepina, 14, Trento, 38122 TN',
 "Via Edmund Mach, 1, 38010 San Michele All'adige TN",
 'Via Calepina, 14 38122 Trento TN',
 'Via M. del Ben, 5b, Rovereto, 38068 TN',
 'Via Tomaso Gar, 14, 38122 Trento',
 'Via Giuseppe Verdi, 6, Trento, 38122 TN',
 'via Calepina, 14 - 38122 Trento',
 'via S. Maria Maddalena, 1, 38122 Trento',
 'Via Sommarive, 9, Povo, 38123 TN',
 'Via Tomaso Gar, 14, Trento, 38122 TN',
 'Corso Bettini, 31, Rovereto, 38068 TN']

In [6]:
# Refresh the 'total'/'size' counters, then write the dataset back to disk
set_total_size(dataset=buildings)
save_dataset(dataset=buildings, name='generated/buildings', file_format='json')