# Summary of extrinsic analysis

Input files for this notebook:

- file...

Output files from this notebook:

- file...

This notebook produces a quick overview of core results from the extrinsic results.

Only run this notebook once you have used a reference data set and successfully run the notebook for the [extrinsic analysis](./extrinsinc_analysis.ipynb).

For further information on how to use and interpret the different metrics, we refer to the original analysis notebook.

In [1]:
import geopandas as gpd
import osmnx as ox
import networkx as nx
import yaml
import matplotlib.pyplot as plt
import contextily as cx
import json
import pickle
import pandas as pd 
import numpy as np
import os.path
from src import evaluation_functions as ef

### Load settings

In [3]:
with open(r'../config.yml') as file:

    parsed_yaml_file = yaml.load(file, Loader=yaml.FullLoader)

    study_area = parsed_yaml_file['study_area']
    
print('Settings loaded!')

Settings loaded!


### Load data

In [4]:
#osm_graph = ox.load_graphml(f'../data/osm_{study_area}.graphml')
osm_simplified_graph = ox.load_graphml(f'../data/osm_{study_area}_simple.graphml')

# Convert to nodes and edges
#osm_nodes, osm_edges = ox.graph_to_gdfs(osm_graph)
osm_simplified_nodes, osm_simplified_edges = ox.graph_to_gdfs(osm_simplified_graph)

# Load simplified and non-simplified graphs
#ref_graph = ox.load_graphml(f'../data/ref_{study_area}.graphml')
ref_simplified_graph = ox.load_graphml(f'../data/ref_{study_area}_simple.graphml')

# Convert to nodes and ref_edges
#ref_nodes, ref_edges = ox.graph_to_gdfs(ref_graph)
ref_simplified_nodes, ref_simplified_edges = ox.graph_to_gdfs(ref_simplified_graph)

print('Data loaded!')

Data loaded!


### Load results

In [5]:
with open(f'../results/extrinsic_analysis_{study_area}.json') as input_file:
    all_results = json.load(input_file)

with open(f'../results/grid_results_extrinsic_{study_area}.pickle', 'rb') as fp:
    grid = pickle.load(fp)

with open(f'../results/feature_matches_{study_area}.json') as input_file:
    fm_results = json.load(input_file)

with open(f'../results/grid_results_feature_matching_{study_area}.pickle', 'rb') as fp:
    grid_fm = pickle.load(fp)

### Styling settings

In [10]:
cell_hover = {
    'selector': 'td:hover',
    'props': [('background-color', '#ffffb3')]
}

row_hover = {
    'selector': 'tr:hover',
    'props': [('background-color', '#eff7fa')]
}

caption = {
    'selector': 'caption',
    'props': 'caption-side: top; font-size:2em;'
}

cell_style =  {
    'selector': 'td', 
    'props': 'text-align: center; font-weight: bold;'
}

# Styling setting for completeness results
index_name_completeness = {
    'selector': '.index_name',
    'props': 'color:white; font-weight:bold; background-color: orange; font-size:1.3em;'
}

columns_completeness = {
    'selector': 'th',
    'props': 'background-color: orange; color: white; font-weight:bold; font-size:1.3em;'
}

def format_completeness_style(styler):
    styler.set_caption('Network Completeness Quality Metrics')
    styler.format(precision=2, na_rep=' - ', thousands=',', formatter={'pct_difference': lambda x: f'{str(x)} %', 'normalised_values_pct_difference' : lambda x: f'{str(x)} %'})
    styler.set_table_styles([cell_hover, row_hover, columns_completeness, caption, index_name_completeness, cell_style], overwrite=False)
    styler.applymap_index(lambda v: 'color:white; font-style: italic; font-weight:bold; background-color: orange; font-size:1em;', axis=0)
    styler.applymap(ef.style_pct_value_completeness, osm_bigger='color:blue;',osm_smaller='color:green;')
    
    return styler


# Styling settings for topology results
index_name_topo = {
    'selector': '.index_name',
    'props': 'color:white; font-weight:bold; background-color: purple; font-size:1.3em;'
}

columns_topo = {
    'selector': 'th',
    'props': 'background-color: purple; color: white; font-weight:bold; font-size:1.3em;'
}

high_bad_topo = ['dangling_node_count','dangling_node_density_sqkm','component_count','count_adjacent_issues','count_overshoots','count_undershoots']
# TODO: Change adjacent issues to 'component_gaps'
high_good_topo = ['largest_cc_pct_size','largest_cc_length_km','edge_component_ratio']

topo_slice_inverse = high_bad_topo,['pct_difference','normalised_values_pct_difference']
topo_slice = high_good_topo,['pct_difference','normalised_values_pct_difference']

def format_topology_style(styler):
    styler.set_caption('Network Topology Quality Metrics')
    styler.format(precision=2, na_rep=' - ', thousands=',', formatter={'pct_difference': lambda x: f'{str(x)} %', 'normalised_values_pct_difference' : lambda x: f'{str(x)} %'})
    styler.set_table_styles([cell_hover, row_hover, columns_topo, caption, index_name_topo, cell_style], overwrite=False)
    styler.applymap_index(lambda v: 'color:white; font-style: italic; font-weight:bold; background-color: purple; font-size:1em;', axis=0)
    styler.applymap(ef.style_pct_value, osm_better='color:blue;',osm_worse='color:green;',subset=topo_slice)
    styler.applymap(ef.style_pct_value_inversed, osm_better='color:blue;',osm_worse='color:green;',subset=topo_slice_inverse)
    
    return styler



## Global differences

In [7]:
osm_df = pd.DataFrame.from_dict(all_results['osm_results'], orient='index')
ref_df = pd.DataFrame.from_dict(all_results['ref_results'], orient='index')

osm_df.rename(columns={0:'OSM'},inplace=True)
ref_df.rename(columns={0:'reference'}, inplace=True)

combined_results = pd.concat([osm_df, ref_df], axis=1)

combined_results = combined_results.round(2)

combined_results['pct_difference'] = combined_results.apply( lambda x: ef.find_pct_diff(x, 'OSM', 'reference'), axis=1)

select_completeness_results = ['node_count', 'edge_count', 'edge_density_m_sqkm', 'node_density_sqkm',
       'protected_density_m_sqkm','unprotected_density_m_sqkm', 'mixed_density_m_sqkm']
       
completeness_results = combined_results.loc[select_completeness_results]

completeness_results.index.name = 'Quality Metrics'

select_topology_results = ['dangling_node_count','dangling_node_density_sqkm', 
       'simplified_edge_pct_diff', 'simplified_node_pct_diff', 'edges_pr_km',
       'nodes_pr_km', 'alpha', 'beta', 'gamma', 'component_count',
       'largest_cc_pct_size', 'largest_cc_length_km', 'count_adjacent_issues',
        'count_overshoots', 'count_undershoots',
       'edge_component_ratio']
       
topology_results = combined_results.loc[select_topology_results]

topology_results['OSM_normalised'] = topology_results.OSM / (osm_simplified_edges.length.sum()/1000)

topology_results['reference_normalised'] = topology_results.reference / (ref_simplified_edges.length.sum()/1000)

topology_results['normalised_values_pct_difference'] = topology_results.apply( lambda x: ef.find_pct_diff(x, 'OSM_normalised','reference_normalised'), axis=1)
#topology_results['normalised_values_pct_diff'] = topology_results['normalised_values_pct_diff'].astype(str) + ' %'

# Some values cannot meaningfully be normalised per network length
topology_results.loc[['largest_cc_pct_size','alpha', 'beta','gamma'],['OSM_normalised','reference_normalised']] = None

topology_results.index.name = 'Quality Metrics'

### Interpretation of network completeness results

In the table below, values are colored based on whether the results indicate a more complete dataset in the OSM data (<span style='color:blue'>blue</span>) or in the reference data (<span style='color:green'>green</span>).

Whether differences in the coverage or completeness of the data are due to errors of omission or commission requires further analysis - and ideally familiarity with the study area. In the interpretation we assume differences in network density etc. to be due to errors of comisssion - but if you know that this assumption is false for your study area, the interpretation of differences in largest component etc. should of course be adjusted.

In [11]:
completeness_results.style.pipe(format_completeness_style)

Unnamed: 0_level_0,OSM,reference,pct_difference
Quality Metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
node_count,4725.0,3655.00,25.54 %
edge_count,5479.0,4208.00,26.24 %
edge_density_m_sqkm,5869.44,3437.55,52.26 %
node_density_sqkm,26.05,20.15,25.54 %
protected_density_m_sqkm,5303.09,2996.63,55.58 %
unprotected_density_m_sqkm,514.78,440.92,15.46 %
mixed_density_m_sqkm,51.4,-,-


### Interpretation of network topology results

When comparing the values for respectively the OSM and the reference data, the value for the OSM data is taken as the basis, to which the value for the reference data is compared. In the columns computing the percentual difference, positive values thus means that the original metric value is highest for the OSM data, while negative value for pct difference means that the metric value is highest for the reference data.

In the table below, values are colored based on whether the results indicate a higher quality for the OSM data (<span style='color:blue'>blue</span>) or the reference data (<span style='color:green'>green</span>). For some metrics, a high value indicates high quality - e.g. how many percent of the network is included in the largest component or the edge/component ratio, while it for other metrics indicates lower quality (e.g. the number of overshoots).

To account for differences in the extent of the network, the column contains two columns comparing the OSM and the reference data: One comparing the original values ('pct_diff') and one comparing the metric value per kilometer of edges. In this way, we account for e.g. situations where a higher number of potential topological differences in a dataset might be explained by that dataset covering much more infrastructure.

For some metrics - such as the number of edges per kilometer or the difference in node count between the simplified and non-simplified graphs - we cannot judge whether high or low values are an indication of data quality, without further inspection.

In the interpretation of some metrics - e.g. the length of the largest component - we assume that the dataset with more kilometers mapped is the correct one (i.e. we assume that in case of differences, it is a case of ommission by the smaller dataset, not a case of commission by the larger one). If you know that this assumption is false for your study area, the interpretation of differences in largest component etc. should of course be adjusted.

In [12]:
topology_results.style.pipe(format_topology_style)

Unnamed: 0_level_0,OSM,reference,pct_difference,OSM_normalised,reference_normalised,normalised_values_pct_difference
Quality Metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dangling_node_count,1785.0,872.0,68.72 %,2.21,1.40,44.99 %
dangling_node_density_sqkm,9.84,4.81,68.67 %,0.01,0.01,44.93 %
simplified_edge_pct_diff,82.48,60.18,31.26 %,0.10,0.10,5.65 %
simplified_node_pct_diff,84.52,63.5,28.4 %,0.10,0.10,2.73 %
edges_pr_km,5.15,6.75,-26.89 %,0.01,0.01,-51.72 %
nodes_pr_km,4.44,5.86,-27.57 %,0.01,0.01,-52.37 %
alpha,0.08,0.08,0.0 %,-,-,-25.72 %
beta,1.16,1.15,0.87 %,-,-,-24.87 %
gamma,0.39,0.38,2.6 %,-,-,-23.17 %
component_count,340.0,195.0,54.21 %,0.42,0.31,29.51 %


## Local differences

Does it make sense to create several small multiples with each their theme? Or is the point exactly to show different metrics together to aid detecing correlations etc?

Fix color scheme when plotting the same value for OSM and ref to aid interpretation

Choose between plotting values for both or plotting differences?

3 rows/columns for 1 and 2: One with total values for each, one with differences


1. Completeness:
    - Density - differences in edge and node density, 
    - Differences in protected, unprotected, and mixed if available



2. Structure + topology:
    - Local Edge node ratio for both - or just the differences?
    - Dangling nodes
    - Dn nodes to nodes ratio
    - over/undershoots



3. Component analysis:
    - all components
    - cells with more than one component
    - cells reached
    - Differences in cells reached

In [None]:


# List of values to be plotted and labels

# Create subplots based on lengths

# Flatten axes?

# Delete unneeded plot

# Plot values

# Set axis off

# How to control colors?

# All this is saved to grid - no need for recomputing anything

### Feature Matching

In [None]:
# Make subplot (either two rows or two columns with OSM in one and reference in one)

# Plot all these columns with titles, nice colors, legends, axis off etc.

fm_cols = ['count_osm_matched', 'length_osm_matched',
       'count_osm_unmatched', 'length_osm_unmatched', 'count_ref_matched',
       'length_ref_matched', 'count_ref_unmatched', 'length_ref_unmatched']


In [70]:
grid.columns

Index(['grid_id', 'geometry', 'count_osm_edges', 'count_osm_nodes',
       'count_osm_simplified_edges', 'count_osm_simplified_nodes',
       'count_ref_edges', 'count_ref_nodes', 'count_ref_simplified_edges',
       'count_ref_simplified_nodes', 'osm_edge_density', 'osm_node_density',
       'ref_edge_density', 'ref_node_density', 'edge_density_diff',
       'node_density_diff', 'osm_protected_density', 'osm_unprotected_density',
       'osm_mixed_density', 'ref_protected_density', 'ref_unprotected_density',
       'protected_density_diff', 'unprotected_density_diff',
       'osm_edge_node_ratio', 'ref_edge_node_ratio', 'edge_node_ratio_diff',
       'component_ids_osm', 'component_ids_ref', 'cells_reached_osm',
       'cells_reached_ref', 'cell_reach_diff', 'count_osm_dangling_nodes',
       'count_ref_dangling_nodes', 'osm_dangling_nodes_per_node',
       'ref_dangling_nodes_per_node'],
      dtype='object')

In [69]:
grid_fm.columns

Index(['grid_id', 'geometry', 'count_osm_edges', 'count_osm_nodes',
       'count_osm_simplified_edges', 'count_osm_simplified_nodes',
       'count_ref_edges', 'count_ref_nodes', 'count_ref_simplified_edges',
       'count_ref_simplified_nodes', 'count_osm_matched', 'length_osm_matched',
       'count_osm_unmatched', 'length_osm_unmatched', 'count_ref_matched',
       'length_ref_matched', 'count_ref_unmatched', 'length_ref_unmatched'],
      dtype='object')

## Export plots

In [None]:
# Export results