# Analysis of OSM-GeoDanmark differences between municipalities

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pickle
import json
import pandas as pd
import seaborn as sns
import plotly.express as px
import numpy as np
from scipy import stats

%run ../settings/yaml_variables.py
%run ../settings/paths.py

In [None]:
# Switch for the plotting cells: the plotting loops further down only run when this is True.
make_plots = True

## Differences per municipality

In [None]:
# Read the per-municipality network summaries written for each data source.
osm_muni = pd.read_csv(osm_analysis_data_fp + "muni_network_counts.csv", index_col=False)
ref_muni = pd.read_csv(ref_analysis_data_fp + "muni_network_counts.csv", index_col=False)

# Tag every row with its origin so both sources can share one long table.
osm_muni['datasource'] = 'osm'
ref_muni['datasource'] = 'ref'

# Long-format table (OSM rows first, then reference rows) used by the comparison plots.
# It is assembled before the NaN fill below, so missing values stay NaN in `combined`.
combined = pd.concat(
    [osm_muni.reset_index(), ref_muni.reset_index()]
).drop(columns='index')

# Per-source tables keyed by the 'navn' column, with missing values replaced by 0.
osm_muni = osm_muni.set_index('navn').fillna(0)
ref_muni = ref_muni.set_index('navn').fillna(0)

### Compute total differences between OSM & GeoDanmark data

In [None]:
# Metrics for which the reference-minus-OSM difference is computed.
diff_cols = [
    'node_count',
    'dangling_node_count',
    'infra_km',
    'infra_dens',
    'infra_pop',
    'overshoots',
    'undershoots',
    'component_gaps',
    'component_count',
    'comp_per_km',
]

# One Series per metric (reference minus OSM, aligned on the shared index).
diffs = [ref_muni[col] - osm_muni[col] for col in diff_cols]

# The Series are stacked as rows, so transpose to get one row per municipality.
diff_df = pd.DataFrame(diffs).T
diff_df

In [None]:
# Summary statistics of the total (reference minus OSM) differences, one column per metric.
diff_df.describe()

### Compute percent differences between OSM & GeoDanmark data

In [None]:
# Topology error counts are left out of the percent comparison.
topology_cols = ['overshoots','undershoots','over_under','component_gaps']
pct_diff_cols = [d for d in diff_cols if d not in topology_cols]

# Percent difference of the reference value relative to the OSM value, rounded to
# two decimals. The per-source tables had their missing values filled with 0, so the
# OSM denominator can be 0; the resulting +/-inf is mapped to NaN so that min/max/mean
# and the plots further down skip those municipalities instead of reporting infinity.
pct_diffs = [
    np.round(
        100 * (ref_muni[d] - osm_muni[d]) / osm_muni[d],
        2,
    ).replace([np.inf, -np.inf], np.nan)
    for d in pct_diff_cols
]

# The Series are stacked as rows, so transpose to get one row per municipality.
pct_diff_df = pd.DataFrame(pct_diffs).transpose()
pct_diff_df

In [None]:
# Summary statistics of the percent differences, one column per metric.
pct_diff_df.describe()

#### Completeness

In [None]:
# diff_df holds reference minus OSM, so a negative length difference means OSM is longer.
km_diff = diff_df.infra_km

osm_bigger = list(diff_df.index[km_diff < 0].values)
ref_bigger = list(diff_df.index[km_diff > 0].values)
count_osm_bigger = len(osm_bigger)
count_ref_bigger = len(ref_bigger)

print(f"Out of {len(diff_df)} municipalities, the length of the OSM network is smaller in {count_ref_bigger} and larger in {count_osm_bigger} municipalities.")

# Magnitudes of the differences, regardless of which source is larger.
abs_km_diff = km_diff.abs()
abs_km_pct_diff = pct_diff_df.infra_km.abs()

min_diff, max_diff = abs_km_diff.min(), abs_km_diff.max()
min_pct_diff, max_pct_diff = abs_km_pct_diff.min(), abs_km_pct_diff.max()

ave_abs_diff = abs_km_diff.mean()
ave_pct_diff = abs_km_pct_diff.mean()

print(f"The difference ranges between {min_diff:.2f} and {max_diff:.2f} km or {min_pct_diff:.2f} - {max_pct_diff:.2f}%")
print(f"The average difference is {ave_abs_diff:.2f} km or {ave_pct_diff:.2f}%.")

In [None]:
# Range and mean of the infrastructure density in each data source.
osm_dens = osm_muni.infra_dens
osm_dens_min, osm_dens_max, osm_dens_ave = osm_dens.min(), osm_dens.max(), osm_dens.mean()

ref_dens = ref_muni.infra_dens
ref_dens_min, ref_dens_max, ref_dens_ave = ref_dens.min(), ref_dens.max(), ref_dens.mean()

# Magnitude of the total density differences between the two sources.
abs_dens_diff = diff_df.infra_dens.abs()
ave_dens_diff = abs_dens_diff.mean()
min_dens_diff = abs_dens_diff.min()
max_dens_diff = abs_dens_diff.max()

# Magnitude of the percent density differences between the two sources.
abs_dens_pct_diff = pct_diff_df.infra_dens.abs()
ave_dens_pct_diff = abs_dens_pct_diff.mean()
min_dens_pct_diff = abs_dens_pct_diff.min()
max_dens_pct_diff = abs_dens_pct_diff.max()

In [None]:
# Report the density range and mean per data source, then the size of the differences.
print(f"The OSM infrastructure density ranges from {osm_dens_min} - {osm_dens_max} km/sqkm, with an average density of {osm_dens_ave} km/sqkm.")
print(f"The {reference_name} infrastructure density ranges from {ref_dens_min} - {ref_dens_max} km/sqkm, with an average density of {ref_dens_ave} km/sqkm.")
# The unit is km/sqkm here as well, matching the two lines above.
print(f"The difference ranges between {min_dens_diff:.3f} and {max_dens_diff:.2f} km/sqkm or {min_dens_pct_diff:.2f} - {max_dens_pct_diff:.2f}%")
print(f"The average difference between municipalities is {ave_dens_diff:.2f} km/sqkm or {ave_dens_pct_diff:.2f} percent.")

#### Topology & network structure

In [None]:
# diff_df holds reference minus OSM, so negative values mean the OSM value is larger.
comp_diff = diff_df.component_count
osm_comp = list(diff_df.index[comp_diff < 0].values)
ref_comp = list(diff_df.index[comp_diff > 0].values)
osm_comp_count = len(osm_comp)
ref_comp_count = len(ref_comp)

# Same split for the number of components per kilometer.
comp_km_diff = diff_df.comp_per_km
osm_comp_km = list(diff_df.index[comp_km_diff < 0].values)
ref_comp_km = list(diff_df.index[comp_km_diff > 0].values)
osm_comp_km_count = len(osm_comp_km)
ref_comp_km_count = len(ref_comp_km)

print(f"Out of {len(diff_df)} municipalities, {osm_comp_count} have more components in the OSM data than {reference_name}, {ref_comp_count} have fewer.")
print(f"Out of {len(diff_df)} municipalities, {osm_comp_km_count} have more components per kilometer in the OSM data than {reference_name}, {ref_comp_km_count} have fewer.")
print("\n")
print(f"In the OSM data, the municipalities have between {osm_muni.comp_per_km.min()} and {osm_muni.comp_per_km.max()} components per km.")
print(f"In the {reference_name} data, the municipalities have between {ref_muni.comp_per_km.min()} and {ref_muni.comp_per_km.max()} components per km.")


#### Topology errors

In [None]:
# Total overshoots/undershoots and the number of municipalities that have any, per source.
osm_over = osm_muni.overshoots.sum()
osm_under = osm_muni.undershoots.sum()
osm_muni_over = len(osm_muni[osm_muni.overshoots>0])
osm_muni_under = len(osm_muni[osm_muni.undershoots>0])
ref_over = ref_muni.overshoots.sum()
ref_under = ref_muni.undershoots.sum()
# Count reference municipalities from the reference table itself, so the result does
# not depend on the OSM table sharing the same index.
ref_muni_over = len(ref_muni[ref_muni.overshoots>0])
ref_muni_under = len(ref_muni[ref_muni.undershoots>0])

print(f"The OSM data has {osm_over:.0f} overshoots in {osm_muni_over} municipalities and {osm_under:.0f} undershoots in {osm_muni_under} municipalities.")

print(f"The {reference_name} data has {ref_over:.0f} overshoots in {ref_muni_over} municipalities and {ref_under:.0f} undershoots in {ref_muni_under} municipalities.")

# Network length expressed in units of 10 km, so the ratios below are errors per 10 km.
osm_length = osm_muni.infra_km.sum() / 10
ref_length = ref_muni.infra_km.sum() / 10

osm_over_km = osm_over / osm_length
osm_under_km = osm_under / osm_length
ref_over_km = ref_over / ref_length
ref_under_km = ref_under / ref_length

print(f"The OSM data has {osm_over_km:.2f} overshoots and {osm_under_km:.2f} undershoots per 10 km.")
print(f"The {reference_name} data has {ref_over_km:.2f} overshoots and {ref_under_km:.2f} undershoots per 10 km.")

In [None]:
# diff_df holds reference minus OSM, so negative values mean OSM has more of the error type.
over_diff = diff_df.overshoots
osm_more_over = list(diff_df.index[over_diff < 0].values)
ref_more_over = list(diff_df.index[over_diff > 0].values)
osm_more_over_count = len(osm_more_over)
ref_more_over_count = len(ref_more_over)

under_diff = diff_df.undershoots
osm_more_under = list(diff_df.index[under_diff < 0].values)
ref_more_under = list(diff_df.index[under_diff > 0].values)
osm_more_under_count = len(osm_more_under)
ref_more_under_count = len(ref_more_under)

print(f"There are {osm_more_over_count} municipalities with more overshoots in OSM than in {reference_name}.")
print(f"There are {ref_more_over_count} municipalities with more overshoots {reference_name} than in OSM.")

print(f"There are {osm_more_under_count} municipalities with more undershoots in OSM than in {reference_name}.")
print(f"There are {ref_more_under_count} municipalities with more undershoots {reference_name} than in OSM.")

#### Plot absolute values for OSM & GeoDanmark

In [None]:
if make_plots:
    # One grouped bar chart per metric: a bar pair (OSM / reference) for every 'navn' value.
    for metric in diff_cols:
        fig, ax = plt.subplots(figsize=(20, 20))

        sns.barplot(combined, x="navn", y=metric, hue='datasource', palette='Set2', ax=ax)

        # Slanted, right-aligned tick labels keep the long x axis readable.
        plt.xticks(rotation=45, ha='right')
        ax.set_xlabel('')
        ax.set_ylabel(metric)
        ax.set_title('Comparison')

#### Plot total differences

In [None]:
# Human-readable axis/legend labels passed to plotly via `labels=`.
# Every column in the plotted metric list has an entry, so no plot falls back to a
# raw column name.
plot_labels = {
    "navn": "",
    "node_count": "nodes",
    "dangling_node_count": "dangling nodes",
    "infra_km": "bicycle infrastructure (km)",
    "infra_dens": "infrastructure density (km/sqkm)",
    "infra_pop": "infrastructure per population (km/1000)",
    "overshoots": "overshoots",
    "undershoots": "undershoots",
    "component_gaps": "component gaps",
    "component_count": "components",
    "comp_per_km": "components per km",
    "datasource": "data source",
}

In [None]:
if make_plots:
    # One bar chart per metric showing the reference-minus-OSM difference per index entry.
    for metric in diff_cols:
        fig = px.bar(diff_df, x=diff_df.index, y=metric, labels=plot_labels)
        fig.update_traces(marker_color='green')
        fig.show()

**Plot percent differences between GeoDanmark and OSM**

In [None]:
if make_plots:
    # One bar chart per metric showing the percent difference per index entry.
    for d in pct_diff_cols:

        # Take the x axis from the frame being plotted, so bars and labels cannot get
        # out of step if the two difference tables ever diverge.
        fig = px.bar(pct_diff_df, y=d, x=pct_diff_df.index, labels=plot_labels)

        fig.update_traces(marker_color='purple')

        fig.show()

#### Plots of value distributions

In [None]:
if make_plots:
    # Distribution of the total differences, one violin plot per metric.
    for metric in diff_cols:
        fig = px.violin(
            diff_df,
            y=metric,
            title='Value distribution: Total differences',
            labels=plot_labels,
        )
        fig.update_traces(marker_color='green')
        fig.show()

In [None]:
if make_plots:
    # Distribution of the percent differences, one violin plot per metric.
    for metric in pct_diff_cols:
        fig = px.violin(
            pct_diff_df,
            y=metric,
            title="Value distribution: Percent differences",
            labels=plot_labels,
        )
        fig.update_traces(marker_color='purple')
        fig.show()

In [None]:
if make_plots:
    # Side-by-side distributions of the raw values, split by data source, per metric.
    for metric in diff_cols:
        fig = px.violin(
            combined,
            x='datasource',
            y=metric,
            labels=plot_labels,
            title="Comparison of value distributions",
        )
        fig.update_traces(marker_color='red')
        fig.show()

#### Correlation between differences

**Correlation between total differences**

In [None]:
# Pairwise scatter plots of the total differences; the trailing semicolon suppresses
# the notebook's text output for the returned object.
sns.pairplot(diff_df);

**Correlation between standardized differences**

In [None]:
# Standardize every difference column to z-scores (via scipy) before plotting.
diff_std = pd.DataFrame(
    {col: stats.zscore(diff_df[col]) for col in diff_df.columns}
)

sns.pairplot(diff_std);

**Correlation between percent differences**

In [None]:
# Pairwise scatter plots of the percent differences; the trailing semicolon suppresses
# the notebook's text output for the returned object.
sns.pairplot(pct_diff_df);

**Correlation between standardized percent differences**

In [None]:
# Standardize every percent-difference column to z-scores (via scipy) before plotting.
pct_diff_std = pd.DataFrame()

for c in pct_diff_df.columns:
    pct_diff_std[c] = stats.zscore(pct_diff_df[c])

# pairplot draws a grid of subplots; plt.title would only label the last one drawn,
# so the title is set on the figure. y > 1 keeps it clear of the top row.
grid = sns.pairplot(pct_diff_std)

grid.figure.suptitle('Correlation between differences', y=1.02);

## Compare ranks

In [None]:
# Rank tables for every metric; the long tables built further down cover
# infrastructure length, number of components and components per km.

def _rank_table(values, label):
    """Rank each metric in descending order (ties broken by order of appearance) and tag the source."""
    ranked = pd.DataFrame()
    for metric in diff_cols:
        ranked[metric] = values[metric].rank(ascending=False, method="first")
    ranked['source'] = label
    return ranked


def _long_ranks(metric):
    """Stack the OSM and reference ranks of one metric, exposing them in a 'rank' column."""
    stacked = pd.concat(
        [
            osm_rank[[metric, 'source']].reset_index(),
            ref_rank[[metric, 'source']].reset_index(),
        ]
    )
    return stacked.rename(columns={metric: 'rank'})


osm_rank = _rank_table(osm_muni, 'OSM')
ref_rank = _rank_table(ref_muni, reference_name)

infra_ranks = _long_ranks('infra_km')
comp_ranks = _long_ranks('component_count')
comp_km_ranks = _long_ranks('comp_per_km')

In [None]:
# Side-by-side comparison of the OSM and reference rank tables; keep_equal=True keeps
# values that are equal in both tables instead of showing them as NaN.
osm_rank.compare(ref_rank, keep_equal=True)

In [None]:
from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator

# Bump chart of infrastructure-length ranks: one line per 'navn' value connecting its
# rank in each data source.
# based on https://stackoverflow.com/questions/68095438/how-to-make-a-bump-chart

# The long rank table lists all OSM rows first, followed by the reference rows, so the
# split point is the number of OSM rows rather than a fixed municipality count.
n_osm = len(osm_rank)
rank_values = infra_ranks["rank"].to_list()
rank_names = infra_ranks["navn"].to_list()

fig, ax = plt.subplots(figsize=(20,20))

ax.xaxis.set_major_locator(MultipleLocator(1))
ax.yaxis.set_major_locator(MultipleLocator(1))

# Hide the default y axis; names are shown on two secondary axes instead.
ax.get_yaxis().set_visible(False)

# Left axis: names placed at their OSM rank.
yax1 = ax.secondary_yaxis("left")
yax1.yaxis.set_major_locator(FixedLocator(rank_values[:n_osm]))
yax1.yaxis.set_major_formatter(FixedFormatter(rank_names[:n_osm]))

# Right axis: names placed at their reference rank.
yax2 = ax.secondary_yaxis("right")
yax2.yaxis.set_major_locator(FixedLocator(rank_values[n_osm:]))
yax2.yaxis.set_major_formatter(FixedFormatter(rank_names[n_osm:]))

for _, group in infra_ranks.groupby("navn"):
    ax.plot("source", "rank", "o-", data=group, mfc="w")

# Rank 1 at the top.
ax.invert_yaxis()
ax.set(xlabel="Data source", ylabel="Rank", title="Rank of infrastructure length")
ax.grid(axis="x")
plt.tight_layout()

In [None]:
from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator

# Bump chart of component-count ranks: one line per 'navn' value connecting its rank
# in each data source.
# based on https://stackoverflow.com/questions/68095438/how-to-make-a-bump-chart

# The long rank table lists all OSM rows first, followed by the reference rows, so the
# split point is the number of OSM rows rather than a fixed municipality count.
n_osm = len(osm_rank)
rank_values = comp_ranks["rank"].to_list()
rank_names = comp_ranks["navn"].to_list()

fig, ax = plt.subplots(figsize=(20, 20))

ax.xaxis.set_major_locator(MultipleLocator(1))
ax.yaxis.set_major_locator(MultipleLocator(1))

# Hide the default y axis; names are shown on two secondary axes instead.
ax.get_yaxis().set_visible(False)

# Left axis: names placed at their OSM rank.
yax1 = ax.secondary_yaxis("left")
yax1.yaxis.set_major_locator(FixedLocator(rank_values[:n_osm]))
yax1.yaxis.set_major_formatter(FixedFormatter(rank_names[:n_osm]))

# Right axis: names placed at their reference rank.
yax2 = ax.secondary_yaxis("right")
yax2.yaxis.set_major_locator(FixedLocator(rank_values[n_osm:]))
yax2.yaxis.set_major_formatter(FixedFormatter(rank_names[n_osm:]))

for _, group in comp_ranks.groupby("navn"):
    ax.plot("source", "rank", "o-", data=group, mfc="w")

# Rank 1 at the top.
ax.invert_yaxis()
ax.set(xlabel="Data source", ylabel="Rank", title="Rank of component count")
ax.grid(axis="x")
plt.tight_layout()


In [None]:
from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator

# Bump chart of components-per-km ranks: one line per 'navn' value connecting its rank
# in each data source.
# based on https://stackoverflow.com/questions/68095438/how-to-make-a-bump-chart

# The long rank table lists all OSM rows first, followed by the reference rows, so the
# split point is the number of OSM rows rather than a fixed municipality count.
n_osm = len(osm_rank)
rank_values = comp_km_ranks["rank"].to_list()
rank_names = comp_km_ranks["navn"].to_list()

fig, ax = plt.subplots(figsize=(20,20))

ax.xaxis.set_major_locator(MultipleLocator(1))
ax.yaxis.set_major_locator(MultipleLocator(1))

# Hide the default y axis; names are shown on two secondary axes instead.
ax.get_yaxis().set_visible(False)

# Left axis: names placed at their OSM rank.
yax1 = ax.secondary_yaxis("left")
yax1.yaxis.set_major_locator(FixedLocator(rank_values[:n_osm]))
yax1.yaxis.set_major_formatter(FixedFormatter(rank_names[:n_osm]))

# Right axis: names placed at their reference rank.
yax2 = ax.secondary_yaxis("right")
yax2.yaxis.set_major_locator(FixedLocator(rank_values[n_osm:]))
yax2.yaxis.set_major_formatter(FixedFormatter(rank_names[n_osm:]))

for _, group in comp_km_ranks.groupby("navn"):
    ax.plot("source", "rank", "o-", data=group, mfc="w")

# Rank 1 at the top.
ax.invert_yaxis()
ax.set(xlabel="Data source", ylabel="Rank", title="Rank of components per km")
ax.grid(axis="x")
plt.tight_layout()