# Summary of extrinsic analysis

This notebook produces a quick overview of core results from the extrinsic results.

Only run this notebook once you have used a reference data set and successfully run the notebook for the [extrinsic analysis](./extrinsinc_analysis.ipynb).

For further information on how to use and interpret the different metrics, we refer to the original analysis notebook.

In [4]:
import geopandas as gpd
import osmnx as ox
import networkx as nx
import yaml
import matplotlib.pyplot as plt
import contextily as cx
import json
import pickle
import pandas as pd 
import numpy as np
import os.path
from src import evaluation_functions as ef
from src import matching_functions as mf

In [5]:
with open(r'../config.yml') as file:

    parsed_yaml_file = yaml.load(file, Loader=yaml.FullLoader)

    study_area = parsed_yaml_file['study_area']
    
print('Settings loaded!')

Settings loaded!


**Load data:**

In [6]:
#osm_graph = ox.load_graphml(f'../data/osm_{study_area}.graphml')
osm_simplified_graph = ox.load_graphml(f'../data/osm_{study_area}_simple.graphml')

# Convert to nodes and edges
#osm_nodes, osm_edges = ox.graph_to_gdfs(osm_graph)
osm_simplified_nodes, osm_simplified_edges = ox.graph_to_gdfs(osm_simplified_graph)

# Load simplified and non-simplified graphs
#ref_graph = ox.load_graphml(f'../data/ref_{study_area}.graphml')
ref_simplified_graph = ox.load_graphml(f'../data/ref_{study_area}_simple.graphml')

# Convert to nodes and ref_edges
#ref_nodes, ref_edges = ox.graph_to_gdfs(ref_graph)
ref_simplified_nodes, ref_simplified_edges = ox.graph_to_gdfs(ref_simplified_graph)


print('Data loaded!')

Data loaded!


**Load results:**

In [7]:
with open(f'../results/extrinsic_analysis_{study_area}.json') as input_file:
    all_results = json.load(input_file)

with open(f'../results/grid_results_extrinsic_{study_area}.pickle', 'rb') as fp:
    grid = pickle.load(fp)

with open(f'../results/feature_matches_{study_area}.json') as input_file:
    fm_results = json.load(input_file)

with open(f'../results/grid_results_feature_matching_{study_area}.pickle', 'rb') as fp:
    grid_fm = pickle.load(fp)

In [40]:
osm_df = pd.DataFrame.from_dict(all_results['osm_results'], orient='index')
ref_df = pd.DataFrame.from_dict(all_results['ref_results'], orient='index')

osm_df.rename(columns={0:'OSM'},inplace=True)
ref_df.rename(columns={0:'reference'}, inplace=True)

combined_results = pd.concat([osm_df, ref_df], axis=1)

combined_results = combined_results.round(2)

combined_results['pct_diff'] = combined_results.apply( lambda x: ef.find_pct_diff(x, 'OSM', 'reference'), axis=1)

select_coverage_results = ['node_count', 'edge_count', 'edge_density_m_sqkm', 'node_density_sqkm',
       'protected_density_m_sqkm','unprotected_density_m_sqkm', 'mixed_density_m_sqkm']
       
coverage_results = combined_results.loc[select_coverage_results]

coverage_results.index.name = 'Quality Metrics'

select_topology_results = ['dangling_node_count','dangling_node_density_sqkm', 
       'simplified_edge_pct_diff', 'simplified_node_pct_diff', 'edges_pr_km',
       'nodes_pr_km', 'alpha', 'beta', 'gamma', 'component_count',
       'largest_cc_pct_size', 'largest_cc_length_km', 'count_adjacent_issues',
        'count_overshoots', 'count_undershoots',
       'edge_component_ratio']
       
topology_results = combined_results.loc[select_topology_results]

topology_results['OSM_normalised'] = topology_results.OSM / (osm_simplified_edges.length.sum()/1000)

topology_results['reference_normalised'] = topology_results.reference / (ref_simplified_edges.length.sum()/1000)

topology_results['normalised_values_pct_diff'] = topology_results.apply( lambda x: ef.find_pct_diff(x, 'OSM_normalised','reference_normalised'), axis=1)

# Some values cannot meaningfully be normalised per network length
topology_results.loc[['largest_cc_pct_size','alpha', 'beta','gamma'],['OSM_normalised','reference_normalised']] = None

topology_results.index.name = 'Quality Metrics'

Unnamed: 0,OSM,reference,pct_diff
node_count,4725.0,3655.0,25.54
edge_count,5479.0,4208.0,26.24
edge_density_m_sqkm,5869.44,3437.55,52.26
node_density_sqkm,26.05,20.15,25.54
dangling_node_density_sqkm,9.84,4.81,68.67
protected_density_m_sqkm,5303.09,2996.63,55.58
unprotected_density_m_sqkm,514.78,440.92,15.46
mixed_density_m_sqkm,51.4,,
simplified_edge_pct_diff,82.48,60.18,31.26
simplified_node_pct_diff,84.52,63.5,28.4


In [129]:
# TODO: Style topology results

# Style all columns based on high values good/bad classification 
# Write function that changes numerical values to string and appends pct sign at the end

cell_hover = {
    'selector': 'td:hover',
    'props': [('background-color', '#ffffb3')]
}

row_hover = {
    'selector': 'tr:hover',
    'props': [('background-color', '#eff7fa')]
}

index_name = {
    'selector': '.index_name',
    'props': 'color:white; font-weight:bold; background-color: purple; font-size:1.3em;'
}

columns = {
    'selector': 'th', #'th.col_heading', #'th:not(.index_name)',
    'props': 'background-color: purple; color: white; font-weight:bold; font-size:1.3em;'
}

caption = {
    'selector': 'caption',
    'props': 'caption-side: top; font-size:2em;'
}

cell_style =  {
    'selector': 'td', 
    'props': 'text-align: center; font-weight: bold;'
}

def format_topology_style(styler):
    styler.set_caption('Network Topology Quality Metrics')
    styler.format(precision=2, na_rep=' - ', thousands=',')
    #styler.format(add_pct_sign())
    #styler.background_gradient(axis=None, vmin=1, vmax=5, cmap="YlGnBu")
    styler.set_table_styles([cell_hover, row_hover, columns, caption, index_name, cell_style], overwrite=False)
    #styler.applymap_
    styler.applymap_index(lambda v: 'color:white; font-style: italic; font-weight:bold; background-color: purple; font-size:1em;', axis=0)
    return styler

topology_results.style.pipe(format_topology_style)

Unnamed: 0_level_0,OSM,reference,pct_diff,OSM_normalised,reference_normalised,normalised_values_pct_diff
Quality Metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dangling_node_count,1785.0,872.0,68.72,2.21,1.40,44.99
dangling_node_density_sqkm,9.84,4.81,68.67,0.01,0.01,44.93
simplified_edge_pct_diff,82.48,60.18,31.26,0.10,0.10,5.65
simplified_node_pct_diff,84.52,63.5,28.4,0.10,0.10,2.73
edges_pr_km,5.15,6.75,-26.89,0.01,0.01,-51.72
nodes_pr_km,4.44,5.86,-27.57,0.01,0.01,-52.37
alpha,0.08,0.08,0.0,-,-,-25.72
beta,1.16,1.15,0.87,-,-,-24.87
gamma,0.39,0.38,2.6,-,-,-23.17
component_count,340.0,195.0,54.21,0.42,0.31,29.51


dangling node count and dangling node density:
- high values bad - indicate snapping issues?

Simplified pct diff:
- high values bad (BUT! important to note that it might reflect that OSM data are broken down into way more segments due to many tags)

edges and nodes pr km:
- High values bad (again, indicates too complex mapping?) But it is based on infrastructure length? So might just be an indication of the mapping style (e.g. true geoms vs. centerline) So maybe high values good?

alpha, beta, gamma:
- neutral

component_count:
- high values bad

largest_cc_pct size:
- high values good - but normalised does not make sense

count_adjacent_issues, over and undershoots:

- high values bad

edge_component_ratio:

- high values good 


OBS - a lot of the interpretation assumes that the 'best' dataset is the correct one? And that they are mapping the same feautures

In [8]:
grid

Unnamed: 0,grid_id,geometry,count_osm_edges,count_osm_nodes,count_osm_simplified_edges,count_osm_simplified_nodes,count_ref_edges,count_ref_nodes,count_ref_simplified_edges,count_ref_simplified_nodes,...,ref_node_edge_ratio,component_ids_osm,component_ids_ref,cells_reached_osm,cells_reached_ref,cell_reach_diff,count_osm_dangling_nodes,count_ref_dangling_nodes,osm_dangling_nodes_per_node,ref_dangling_nodes_per_node
0,0,"POLYGON ((710193.940 6181853.300, 710682.960 6...",46.0,43.0,14.0,11.0,,,,,...,,[0],,653,0,653,5.0,,0.454545,
1,1,"POLYGON ((710682.960 6181371.592, 710682.960 6...",38.0,35.0,12.0,9.0,,,,,...,,[0],,653,0,653,5.0,,0.555556,
2,2,"POLYGON ((710682.960 6180889.885, 710682.960 6...",21.0,20.0,4.0,3.0,2.0,2.0,2.0,2.0,...,1.0,[0],[139],653,7,646,2.0,2.0,0.666667,1.0
3,3,"POLYGON ((710682.960 6180408.177, 710682.960 6...",46.0,45.0,20.0,19.0,1.0,1.0,1.0,1.0,...,1.0,"[0, 243]",[139],655,7,648,11.0,1.0,0.578947,1.0
4,4,"POLYGON ((710682.960 6179926.469, 710682.960 6...",31.0,31.0,15.0,15.0,1.0,1.0,1.0,1.0,...,1.0,"[0, 243]",[137],655,9,646,10.0,1.0,0.666667,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,765,"POLYGON ((727309.640 6173664.269, 727309.640 6...",18.0,17.0,1.0,,,,,,...,,[0],,653,0,653,,,,
766,766,"POLYGON ((727309.640 6173182.561, 727309.640 6...",10.0,10.0,3.0,3.0,,,,,...,,"[0, 293]",,654,0,654,3.0,,1.000000,
767,767,"POLYGON ((727309.640 6172700.853, 727309.640 6...",16.0,17.0,3.0,4.0,,,,,...,,"[0, 292]",,654,0,654,3.0,,0.750000,
768,768,"POLYGON ((727309.640 6172219.145, 727309.640 6...",,,,,,,,,...,,,,0,0,0,,,,


In [None]:
# Read dictionary into dataframe with result type as index and osm/ref as columns

# Compute new col with difference

# Plot difference based on color? Or plot all values based on color?

## Local differences

In [None]:
# List of values to be plotted and labels

# Create subplots based on lengths

# Flatten axes?

# Delete unneeded plot

# Plot values

# Set axis off

# How to control colors?


# Plot of differences in:
#  network density length
# Node density
# Protected density
# Unprotected density
# Local node/edge ratio



# Dangling node density 


# All this is saved to grid - no need for recomputing anything

In [None]:
# Also load feature matching results and add to things to plot!

## Component comparison

In [None]:
# Plots of dangling nodes per grid cell for both
# Plots of cc and cc_i for both

# Plot of connected component connectivity

In [None]:
# Export results

**How to summarize feature matching?**