# Import and load

In [41]:
import pandas as pd
import numpy as np
import os
import json

In [22]:
path_to_data = os.getcwd() + "/20221125_path_data.json"

'/Users/anamamatelashvili/PycharmProjects/cyclist_path_complexity/20221125_path_data.json'

In [26]:
with open(path_to_data) as f:
    json_data = json.load(f)

In [33]:
cycling_paths_df = pd.DataFrame(json_data)
cycling_paths_df.set_index("id", inplace=True)

# Summary stats

In [34]:
cycling_paths_df.head()

Unnamed: 0_level_0,segregated_portion_m,elevation_gain_total_m,max_steepness_gradient,length_m,lit_portion,cyclists_average,stoplights_total
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.272663,74.558967,5.696159,624.395554,0.171733,"[3, 1, 6, 4, 2, 8, 14, 6, 54, 22, 16, 21, 41, ...",6
1,0.321567,-70.671561,5.573763,773.466732,0.202954,"[4, 3, 8, 2, 1, 7, 14, 5, 28, 35, 20, 31, 40, ...",4
2,0.221246,-35.172413,1.378596,638.563193,0.256841,"[1, 0, 6, 4, 5, 9, 9, 11, 33, 18, 29, 30, 23, ...",5
3,0.266797,-63.608671,3.071868,853.136713,0.36044,"[4, 5, 6, 0, 9, 7, 10, 12, 59, 50, 25, 46, 17,...",5
4,0.257422,-170.341477,2.682898,800.187257,0.136058,"[3, 0, 6, 4, 5, 1, 10, 6, 33, 28, 50, 35, 29, ...",1


In [36]:
cycling_paths_df.isna().any()

segregated_portion_m      False
elevation_gain_total_m    False
max_steepness_gradient    False
length_m                  False
lit_portion               False
cyclists_average          False
stoplights_total          False
dtype: bool

In [35]:
cycling_paths_df.describe()

Unnamed: 0,segregated_portion_m,elevation_gain_total_m,max_steepness_gradient,length_m,lit_portion,stoplights_total
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.195898,-4.689172,3.349055,745.083229,0.165199,4.04
std,0.125393,81.171714,1.822165,95.732283,0.102036,3.014845
min,0.025245,-175.094522,0.125318,391.512465,0.002481,0.0
25%,0.096995,-64.713789,1.764582,686.904234,0.09161,1.0
50%,0.173,-5.424127,3.614644,756.449222,0.137292,3.0
75%,0.261119,58.292482,5.086246,808.259692,0.206105,7.0
max,0.574913,155.560535,5.907504,996.086604,0.455219,10.0


In [43]:
np.mean([3, 1, 6, 4, 2, 8, 14, 6, 54, 22, 16, 21])

13.083333333333334

In [51]:
cycling_paths_df.cyclists_average.apply(np.mean).describe()

count    100.000000
mean      22.758750
std        2.013774
min       18.750000
25%       21.312500
50%       22.729167
75%       24.166667
max       28.041667
Name: cyclists_average, dtype: float64

In [52]:
cycling_paths_df.cyclists_average.apply(np.max).describe()

count    100.000000
mean      60.650000
std        2.958893
min       48.000000
25%       59.000000
50%       61.000000
75%       63.000000
max       64.000000
Name: cyclists_average, dtype: float64

In [57]:
cycling_paths_df.apply(lambda row: np.max(row.cyclists_average) / row.length_m, axis=1).describe()

count    100.000000
mean       0.082870
std        0.012498
min        0.059417
25%        0.074056
50%        0.081263
75%        0.090701
max        0.145589
dtype: float64

## Conclusions/questions
- No missing values
- Reasonable slopes
- No zero length paths, so divisions are fine 
- All paths are reasonably short -- maybe individual streets so that we can assemble different paths as needed
- What does "average munber of cyclists on path" mean? On path near me or on the entire path? -- perhaps because the paths are short number of cyclists on the entire path are meant


## Plan 
- Develop three measures:
    - Physical effort measure 
        - length, total elevation gain divided by the length (positive gets more weight than negative), max steepness gradient (give this at least a quadratic transform,  maybe even exponential, what is max steepenss what you can even ride up on -- research that)
    - Time efficiency measure (effort already translates into extra time so here only pure extra time components)
        - segregated portion (low weight), number of stoplights
    - Safety measure
        - unsegregated absolute length, number of cyclists, unlit absolute length   
- Bring all these onto the same scale so we can make meaningful comparisons and make sense of weights
- Research what would be a good way to pick weights and validate them
- Adjust weights by the time of day: e.g. unlit does not matter during the day, unsegregated might not matter as much during the night
- Document the functions


## End measure

- Star chart with these three indicators per path 
- Aggrerate these three measures into a single number (what are good weights?) 
- Aggregate these three measures into one of these categories: "easy", "medium", "hard"
- Consider having day/night/peak time ratings

# Implement one measure 

In [68]:
# Define safety measure, score will be 0 to 3 with higher score indicating less safe. 
def get_safety_measure(row, segregated_weight = 1, unlit_weight = 1, unsafe_weight = .5, crowded_weight = .2, unsafe_for_too_long_weight = .3):
    # portion of unsafe meters (possibly doubled):
    unsafe_length = segregated_weight * (1 - row.segregated_portion_m) * row.length_m + unlit_weight * (1 - row.lit_portion) * row.length_m
    
    is_unsafe_too_long = unsafe_length > 100
    unsafe_portion = max(row.length_m, unsafe_length) / row.length_m
    
    # people closer than 5 meters apart is too much 
    is_ever_crowded = int(np.max(row.cyclists_average) / row.length_m > .2)
    
    return unsafe_portion * unsafe_weight + is_ever_crowded * crowded_weight + unsafe_for_too_long_weight * is_unsafe_too_long

In [69]:
get_safety_measure(cycling_paths_df.loc[0,:])

1.0778021176451449