### Based on https://github.com/pwr-inf/hex2vec

In [1]:
import pandas as pd

from tqdm import tqdm
from IPython.display import clear_output

from _lib.osm_cities_data import download_whole_city, add_h3_indices_to_city, load_filter, group_city_tags, group_cities, load_processed_dataset
from _lib.settings import DATA_RAW_DIR, DATA_OSM_CITIES_DIR
from _lib.settings import SELECTED_CITIES, SELECTED_RESOLUTIONS, SELECTED_TAGS

# Download OSM Data

In [3]:
# Call download_whole_city once per entry of SELECTED_CITIES, passing DATA_RAW_DIR
# (presumably the directory the raw OSM data is written to — confirm in
# _lib.osm_cities_data).
# NOTE: the run recorded in this cell's output took more than nine hours for 40 entries.
for city in tqdm(SELECTED_CITIES):
    download_whole_city(city, DATA_RAW_DIR)
    # Clear the cell output after each city; wait=True defers the clear until
    # new output is available to replace it.
    clear_output(wait=True)

100%|██████████| 40/40 [9:16:05<00:00, 834.14s/it]


### Process dataset: select tags and add H3 indices for the selected resolutions

In [2]:
# Run add_h3_indices_to_city for every (city, resolution) combination of
# SELECTED_CITIES x SELECTED_RESOLUTIONS.
# NOTE(review): going by the section heading, this step also selects tags and
# stores the processed result — confirm against _lib.osm_cities_data.
for city in tqdm(SELECTED_CITIES):
    for resolution in SELECTED_RESOLUTIONS:
        # Status line identifying the pair currently being processed.
        print('City:', city,'Resolution:', resolution)
        add_h3_indices_to_city(city, resolution)
        # Clear the cell output after each pair so status lines do not accumulate.
        clear_output(wait=True)

100%|██████████| 40/40 [11:54:02<00:00, 1071.06s/it]


### Group selected tags in cities

In [2]:
# An entry of SELECTED_CITIES is either a single value or a list; for a list,
# its first element is used to represent the city in the grouping steps.
# isinstance() replaces the original type-equality check (PEP 8 idiom).
cities = [entry[0] if isinstance(entry, list) else entry for entry in SELECTED_CITIES]
# Filter definition loaded from "from_wiki.json"; handed to group_city_tags
# as filter_values.
TAG_FILTER = load_filter("from_wiki.json")

In [3]:
# Call group_city_tags for every city at every selected resolution, passing
# TAG_FILTER as filter_values and fill_missing=True.
# NOTE(review): the exact effect of fill_missing is defined in
# _lib.osm_cities_data — presumably it adds absent tag columns; confirm.
for city in tqdm(cities):
    for resolution in SELECTED_RESOLUTIONS:
        # Status line identifying the pair currently being processed.
        print('City:', city,'Resolution:', resolution)
        group_city_tags(city, resolution, filter_values=TAG_FILTER, fill_missing=True)
    # Output is cleared once per city (after all of its resolutions), not once per pair.
    clear_output(wait=True)

100%|██████████| 40/40 [17:17<00:00, 25.93s/it]


### Group all cities

In [4]:
# Call group_cities once per selected resolution with the full list of cities.
# NOTE(review): presumably this merges the per-city grouped results into one
# dataset per resolution — confirm in _lib.osm_cities_data.
for resolution in tqdm(SELECTED_RESOLUTIONS):
    group_cities(cities, resolution)
    # Clear the cell output after each resolution; wait=True defers the clear
    # until new output is available to replace it.
    clear_output(wait=True)

100%|██████████| 3/3 [05:11<00:00, 103.86s/it]


# Create Final Dataset for Word2Vec Training

In [2]:
# Feature columns that are dropped from every resolution's dataset before it is
# exported (see the df.drop(columns=problem_columns) call in the export loop).
# NOTE(review): the reason each column is excluded is not recorded here, apart
# from the hint on natural_valley — presumably over-mapped or noisy tags; confirm.
problem_columns = [
    'amenity_waste_basket',
    'landuse_grass',
    'historic_tomb',
    'natural_tree',
    'natural_tree_row',
    'natural_valley', # northern Warsaw
]

In [3]:
# Build and export the final dataset for each selected resolution, and write a
# LaTeX table listing every tag from "from_wiki.json" with a check mark next to
# the tags that are actually used.

# Loop-invariant inputs: the tag filter and the LaTeX row template
# (key, value, marker).
from_wiki = load_filter("from_wiki.json")
row = "\\TopicLine \\Topic[\\texttt{%s}] & \\texttt{%s} & %s \\\\\n"

for resolution in tqdm(SELECTED_RESOLUTIONS):
    df = load_processed_dataset(resolution, select_tags=SELECTED_TAGS)

    df = df.drop(columns=problem_columns)

    # For every feature column, count the cities in which it occurs at least
    # once, then drop the columns that occur in no city at all. Selecting by
    # label from the count's own index avoids relying on positional alignment
    # with df.columns.
    cities_with_feature = (df.groupby('city').sum() > 0).sum()
    zero_cities_columns = cities_with_feature.index[cities_with_feature == 0]
    df = df.drop(columns=zero_cities_columns)

    # Drop rows whose feature columns are all zero.
    df = df[~(df.drop(columns='city') == 0).all(axis=1)]

    # NOTE(review): nothing from here up to the tags.txt write visibly depends
    # on `resolution` or `df`, so tags.txt is rewritten with the same inputs on
    # every iteration. It is kept inside the loop in case
    # load_processed_dataset refreshes w2v_columns.csv — confirm, then hoist.
    df_cols = pd.read_csv(f"{DATA_OSM_CITIES_DIR}/w2v_columns.csv")
    df_cols = df_cols.drop(df_cols[df_cols['column'] == 'city'].index)

    # Split "<key>_<value>" on the first underscore only. `n` must be passed by
    # keyword: pandas 2.x no longer accepts it positionally.
    split = df_cols['column'].str.split("_", n=1)
    df_cols['key'] = split.str[0]
    df_cols['value'] = split.str[1]

    # Mapping: tag key -> set of tag values present among the dataset columns.
    used_vals = df_cols.groupby('key')['value'].apply(set).to_dict()

    rows = []
    for key in sorted(from_wiki.keys()):
        key_str = key.replace("_", "\\_")
        # .get() guards against a selected key that has no remaining column,
        # which previously raised KeyError.
        key_used_vals = used_vals.get(key, set()) if key in SELECTED_TAGS else set()
        for val in sorted(from_wiki[key]):
            val_str = val.replace("_", "\\_")
            mark = "\\checkmark" if val in key_used_vals else ""
            rows.append(row % (key_str, val_str, mark))

    # Explicit encoding keeps the output independent of the platform default.
    with open(f"{DATA_OSM_CITIES_DIR}/tags.txt", 'wt', encoding='utf-8') as f:
        f.writelines(rows)

    df.to_csv(f'{DATA_OSM_CITIES_DIR}/{resolution}.csv', sep=';')

100%|██████████| 3/3 [10:26<00:00, 208.76s/it]
