In [13]:
import imp  # NOTE: deprecated and removed in Python 3.12; drop once no cell uses it
import importlib

import numpy as np
import pandas as pd

import functions

In [14]:
# Load the coded image observations through the project helper.
# The schema is defined in functions.py (not shown here); later cells
# filter this frame on its `useful`, `classified` and `lane_max_int` columns.
df_coded = functions.get_df_coded()

# Average obstruction rate

In [15]:
# Pick up edits to functions.py without restarting the kernel.
# importlib.reload replaces imp.reload: the imp module is deprecated and
# was removed in Python 3.12.
importlib.reload(functions)

# Thresholds used below
Z_95 = 1.96      # normal critical value for a two-sided 95% interval
MIN_IMAGES = 10  # combinations with this many images or fewer are not reported

# Keep rows with lane_max_int strictly between 0 and 4 that are flagged
# both `useful` and `classified`
df_some_lanes = df_coded.query('0 < lane_max_int < 4').query('useful').query('classified').copy()

# Store this in a dictionary keyed by the two levels:
# (split variable, obstruction type)
d = {}
# Loop over the split var
for split_var in functions.cols_classified:
    # query() on a bare column name keeps the rows where that column is truthy
    df_var = df_some_lanes.query(f'{split_var}')
    # Record the observation count and the obstruction sum by obstruction type
    for obstruction in functions.obstructions:
        vals = df_var[obstruction]
        # (non-null observations, sum of the obstruction column)
        # NOTE(review): presumably a 0/1 or boolean flag, so the sum is a count -- confirm
        d[(split_var, obstruction)] = vals.count(), vals.sum()

df = pd.DataFrame(d).T.reset_index()
df.columns = ['lane_type', 'obstruction', 'num_images', 'num']
df['mean_obstruction'] = df.num / df.num_images

# Add the confidence interval for the mean, using the normal-approximation
# formula for a binomial proportion: z * sqrt(p * (1 - p) / n)
df['ci'] = Z_95 * np.sqrt(df.mean_obstruction * (1 - df.mean_obstruction) / df.num_images)

# Add upper and lower bounds, clipped to the valid range of a proportion
df['lower'] = df.mean_obstruction - df.ci
# If lower is below 0 replace with 0
df.loc[df.lower < 0, 'lower'] = 0
df['upper'] = df.mean_obstruction + df.ci
# Symmetric to the lower clip: a proportion cannot exceed 1
df.loc[df.upper > 1, 'upper'] = 1

# Create nicely formatted "mean [lower, upper]" string; the interval is
# omitted when the observed rate is exactly 0
df['lift'] = df.apply(lambda r: f"{r.mean_obstruction*100:.0f}%" + (f" [{r.lower*100:.0f}%, {r.upper*100:.0f}%]" if r.mean_obstruction>0 else ""), axis=1)

# Pivot to one row per lane type and one column per obstruction type,
# dropping combinations with too few images
mean_obstruction = df.query('num_images > @MIN_IMAGES').groupby(['lane_type', 'obstruction']).lift.first().unstack(1)
# Smallest per-obstruction observation count for each lane type
mean_obstruction['num_images'] = df.groupby('lane_type').num_images.min()
mean_obstruction.loc[functions.cols_classified, ['num_images'] + functions.obstructions]

obstruction,num_images,construction,cars_standing,cars_moving,humans
lane_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sharrow,117,"3% [0%, 5%]","4% [1%, 8%]","23% [15%, 31%]",0%
unprotected,253,"1% [0%, 2%]","11% [7%, 14%]","8% [5%, 11%]","2% [0%, 3%]"
partially_protected,90,0%,"2% [0%, 5%]","2% [0%, 5%]",0%
protected,36,0%,0%,0%,"14% [3%, 25%]"


# Streets

In [19]:
df_streets = functions.get_streets()

# Read the reviewed image ids, presumably one per line.
# Blank lines are skipped: splitting the raw text on '\n' would turn a
# trailing newline into an empty-string "id" and inflate the unique count.
with open('data/ids_reviewed.txt', 'r') as f:
    ids_reviewed = [line.strip() for line in f if line.strip()]
# Number of distinct reviewed ids
len(set(ids_reviewed))

2724

In [20]:
# Total street length in miles by borough (rows) and lane type (columns),
# with columns ordered as in functions.lanes_dict
df_agg = (df_streets
          .groupby(['borough', 'lane_max'])
          .length_miles.sum()
          .unstack(1)
          # A borough with no streets of some lane type unstacks to NaN,
          # which astype(int) cannot convert; count it as 0 miles
          .fillna(0)
          .astype(int)[list(functions.lanes_dict.values())])
print(df_agg.to_markdown())

KM_TO_MILES = 0.621371
max_dist_meters = 200  # spacing used to define "full coverage": one image per this many meters
obs = len(df_coded.query('useful'))
kms = int(df_streets.length_km.sum())
# Number of images needed for one image every max_dist_meters of street
full_coverage = kms * 1000 / max_dist_meters

# NOTE(review): every id in ids_reviewed is counted as not useful here --
# confirm the file does not also list ids of useful images
num_not_useful = len(set(ids_reviewed))

# Summarize this in a dictionary
d = {
    'useful': obs,
    'not_useful': num_not_useful,
    'total_reviewed': obs + num_not_useful,
    'kilometers': kms,
    'miles': int(kms * KM_TO_MILES),
    'meters_per_image': max_dist_meters,
    'images_full_coverage': int(full_coverage),
    'percent_coverage': int(obs / full_coverage * 100),
}
summary = pd.Series(d)
summary

| borough       |   None |   Sharrows |   Standard |   Protected Path |   Greenway |
|:--------------|-------:|-----------:|-----------:|-----------------:|-----------:|
| Bronx         |     17 |         15 |         66 |               14 |         32 |
| Brooklyn      |     36 |         40 |        147 |               30 |         41 |
| Manhattan     |     28 |         22 |         58 |               67 |         51 |
| Queens        |     38 |         43 |        100 |               39 |         28 |
| Staten_Island |     22 |         12 |         16 |                2 |         19 |


useful                   786
not_useful              2724
total_reviewed          3510
kilometers              1601
miles                    994
meters_per_image         200
images_full_coverage    8005
percent_coverage           9
dtype: int64