# Flow vs obs flow & speed vs obs speed

Side-by-side comparison of what seem to be related columns.

## What we learned
* **use `flow`, `speed` columns** and ignore `obs_flow`, `obs_speed`
* `flow` is always >= `obs_flow`
* Most of the time, `flow = obs_flow`
* Why would imputation be more than what's observed? Is this only true when `obs_flow == 0`?
   * It does appear that when `obs_flow` has mean = 0, `flow` is higher and holds non-zero values.
* We'll just use `flow` for now, and use imputed values always?
* Most of the time, `speed = obs_speed`. When they differ, `speed < obs_speed` is slightly more common than `speed > obs_speed` (so imputed tends to be less than what the detector says).
* Looking at the descriptives, these cases occur where the observed speed is perceived to be too high and is adjusted down, though the means don't differ by much for the subset where `speed < obs_speed`.

In [1]:
import pandas as pd

from utils import PROCESSED_GCS

In [2]:
def metric_vs_observed(metric: str, filtering: tuple) -> pd.DataFrame:
    """
    Load a metric's table and its `obs_` counterpart and join them.

    Both parquet files are read from under `PROCESSED_GCS` using the
    `station_weekday_hour` file prefix. `filtering` is forwarded unchanged
    as the `filters` argument of `pd.read_parquet` (the calls in this
    notebook pass `None`).

    Returns the outer merge of the two tables on station_uuid, year,
    month, weekday and hour, so rows found in only one file are kept.
    """
    file_stem = "station_weekday_hour"
    join_keys = ["station_uuid", "year", "month", "weekday", "hour"]

    def load(table: str) -> pd.DataFrame:
        # Same path layout and filters for both the metric and obs tables.
        return pd.read_parquet(
            f"{PROCESSED_GCS}{file_stem}_{table}.parquet",
            filters = filtering
        )

    metric_table = load(metric)
    observed_table = load(f"obs_{metric}")

    return metric_table.merge(observed_table, on = join_keys, how = "outer")

In [3]:
# Load the merged metric/observed tables for both metrics, with no parquet filters.
flow_df, speed_df = [
    metric_vs_observed(metric_name, filtering = None)
    for metric_name in ("flow", "speed")
]

In [4]:
def lane_comparisons(df: pd.DataFrame, lane_number: int, metric: str) -> None:
    """
    Print diagnostics comparing one lane's metric column to its observed one.

    The columns compared are `lane_{lane_number}_{metric}` (labelled
    "imputed" in the output) and `lane_{lane_number}_obs_{metric}`
    (labelled "obs").

    Output, in order:
    * `describe()` of the two columns
    * the row count, and the counts / shares of rows where the metric is
      equal to, greater than, or less than the observed value. Shares use
      all rows of `df` as the denominator, so rows where either value is
      missing count toward none of the three groups.
    * for `metric == "speed"` only, `describe()` of the rows where the
      metric is less than the observed value
    * among rows where the metric exceeds the observed value, how many
      have an observed value of exactly 0

    Nothing is returned; everything is displayed or printed.
    """
    n_rows = len(df)

    def rounded(numerator, denominator):
        # An empty frame has no meaningful share; report NaN instead of
        # raising ZeroDivisionError.
        if denominator == 0:
            return float("nan")
        return round(numerator / denominator, 3)

    # `display` is only injected by IPython/Jupyter. Fall back to `print`
    # so the function also runs in a plain Python session.
    try:
        show = display
    except NameError:
        show = print

    col = f"lane_{lane_number}_{metric}"
    obs_col = f"lane_{lane_number}_obs_{metric}"

    show(df[[col, obs_col]].describe())

    # Build each comparison mask once and count with sum(), rather than
    # copying the whole frame just to read its length.
    equal_mask = df[col] == df[obs_col]
    more_mask = df[col] > df[obs_col]
    less_mask = df[col] < df[obs_col]

    n_equal = int(equal_mask.sum())
    n_more = int(more_mask.sum())
    n_less = int(less_mask.sum())

    print(f"# rows: {n_rows}")
    print(f"equal: {n_equal}, imputed > obs: {n_more}, imputed < obs: {n_less}")
    print(f"% equal {rounded(n_equal, n_rows)}")
    print(f"greater: {rounded(n_more, n_rows)}, less: {rounded(n_less, n_rows)}")

    if metric == "speed":
        print("**** values when imputed < obs ****")

        show(df.loc[less_mask, [col, obs_col]].describe())

    print("****values when imputed > obs *****")

    # Vectorized replacement for a row-wise apply: is the observed value
    # exactly zero on the rows where the metric is larger?
    obs_col_zero = df.loc[more_mask, obs_col].eq(0).rename("obs_col_zero")

    print(obs_col_zero.value_counts())


## Flow vs observed flow diagnostics

In [5]:
# Run the flow diagnostics for lanes 1 through 8.
for lane_number in range(1, 9):
    lane_comparisons(flow_df, lane_number, "flow")

Unnamed: 0,lane_1_flow,lane_1_obs_flow
count,5490744.0,5524535.0
mean,761.029763,722.35622
std,564.397175,562.738313
min,0.0,0.0
25%,246.4,207.4
50%,699.0,645.75
75%,1210.0,1164.333333
max,5198.0,5198.0


# rows: 5524535
equal: 4826068, imputed > obs: 664676, imputed < obs: 0
% equal 0.874
greater: 0.12, less: 0.0
****values when imputed > obs *****
False    566253
True      98423
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_2_flow,lane_2_obs_flow
count,3668484.0,5524535.0
mean,912.07882,577.289599
std,544.438406,606.562733
min,0.0,0.0
25%,400.4,0.0
50%,965.0,347.0
75%,1354.5,1131.333333
max,4737.0,4737.0


# rows: 5524535
equal: 3219391, imputed > obs: 449093, imputed < obs: 0
% equal 0.583
greater: 0.081, less: 0.0
****values when imputed > obs *****
False    372805
True      76288
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_3_flow,lane_3_obs_flow
count,3346786.0,5524535.0
mean,829.965098,478.92817
std,480.593977,540.801227
min,0.0,0.0
25%,386.75,0.0
50%,861.25,233.0
75%,1217.0,952.75
max,4496.0,4496.0


# rows: 5524535
equal: 2929831, imputed > obs: 416955, imputed < obs: 0
% equal 0.53
greater: 0.075, less: 0.0
****values when imputed > obs *****
False    342808
True      74147
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_4_flow,lane_4_obs_flow
count,2577647.0,5524535.0
mean,768.204926,342.366352
std,474.421297,490.061659
min,0.0,0.0
25%,331.75,0.0
50%,770.25,0.0
75%,1139.333333,663.25
max,4235.0,4235.0


# rows: 5524535
equal: 2259657, imputed > obs: 317990, imputed < obs: 0
% equal 0.409
greater: 0.058, less: 0.0
****values when imputed > obs *****
False    272609
True      45381
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_5_flow,lane_5_obs_flow
count,834680.0,5524535.0
mean,676.559778,97.664623
std,483.344518,297.848786
min,0.0,0.0
25%,246.0,0.0
50%,614.0,0.0
75%,1036.0,0.0
max,2755.5,2755.5


# rows: 5524535
equal: 728520, imputed > obs: 106160, imputed < obs: 0
% equal 0.132
greater: 0.019, less: 0.0
****values when imputed > obs *****
False    92664
True     13496
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_6_flow,lane_6_obs_flow
count,157727.0,5524535.0
mean,549.264937,15.030314
std,435.67858,114.333933
min,0.0,0.0
25%,191.25,0.0
50%,458.333333,0.0
75%,813.0,0.0
max,2692.0,2692.0


# rows: 5524535
equal: 137721, imputed > obs: 20006, imputed < obs: 0
% equal 0.025
greater: 0.004, less: 0.0
****values when imputed > obs *****
False    17530
True      2476
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_7_flow,lane_7_obs_flow
count,4202.0,5524535.0
mean,481.056592,0.350383
std,215.572484,14.063007
min,18.0,0.0
25%,320.6875,0.0
50%,549.45,0.0
75%,643.25,0.0
max,1051.666667,1027.5


# rows: 5524535
equal: 3624, imputed > obs: 578, imputed < obs: 0
% equal 0.001
greater: 0.0, less: 0.0
****values when imputed > obs *****
False    543
True      35
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_8_flow,lane_8_obs_flow
count,0.0,5524535.0
mean,,0.0
std,,0.0
min,,0.0
25%,,0.0
50%,,0.0
75%,,0.0
max,,0.0


# rows: 5524535
equal: 0, imputed > obs: 0, imputed < obs: 0
% equal 0.0
greater: 0.0, less: 0.0
****values when imputed > obs *****
Series([], Name: obs_col_zero, dtype: int64)


## Speed vs observed speed diagnostics

In [6]:
# Run the speed diagnostics for lanes 1 through 8.
for lane_number in range(1, 9):
    lane_comparisons(speed_df, lane_number, "speed")

Unnamed: 0,lane_1_speed,lane_1_obs_speed
count,5490744.0,5410636.0
mean,65.605164,65.664043
std,10.380544,10.444326
min,3.0,3.0
25%,63.625,63.675
50%,66.075,66.233333
75%,73.1,73.3
max,90.5,90.5


# rows: 5524535
equal: 4874919, imputed > obs: 262289, imputed < obs: 273428
% equal 0.882
greater: 0.047, less: 0.049
**** values when imputed < obs ****


Unnamed: 0,lane_1_speed,lane_1_obs_speed
count,273428.0,273428.0
mean,66.618133,67.894924
std,9.446861,9.228313
min,3.45,4.1
25%,64.4,64.9
50%,68.75,70.9
75%,73.133333,74.525
max,84.725,88.4


****values when imputed > obs *****
False    262289
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_2_speed,lane_2_obs_speed
count,3668484.0,3606628.0
mean,64.512993,64.604668
std,9.864657,9.928083
min,3.0,3.0
25%,62.85,63.0
50%,67.45,67.65
75%,70.45,70.525
max,89.675,89.675


# rows: 5524535
equal: 3241065, imputed > obs: 167026, imputed < obs: 198537
% equal 0.587
greater: 0.03, less: 0.036
**** values when imputed < obs ****


Unnamed: 0,lane_2_speed,lane_2_obs_speed
count,198537.0,198537.0
mean,65.291033,66.774167
std,8.507366,8.210417
min,3.9,4.3
25%,63.775,65.0
50%,67.475,69.3
75%,70.1,70.966667
max,85.125,89.3


****values when imputed > obs *****
False    167026
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_3_speed,lane_3_obs_speed
count,3346786.0,3289997.0
mean,60.460405,60.582831
std,10.022042,10.065959
min,3.0,3.0
25%,57.475,57.65
50%,62.62,62.675
75%,66.85,66.925
max,90.7,90.7


# rows: 5524535
equal: 2944499, imputed > obs: 156529, imputed < obs: 188969
% equal 0.533
greater: 0.028, less: 0.034
**** values when imputed < obs ****


Unnamed: 0,lane_3_speed,lane_3_obs_speed
count,188969.0,188969.0
mean,60.943249,62.748345
std,8.947731,8.648396
min,3.133333,3.25
25%,58.4,60.866667
50%,62.625,64.3
75%,66.52,67.4
max,85.675,89.05


****values when imputed > obs *****
False    156529
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_4_speed,lane_4_obs_speed
count,2577647.0,2540089.0
mean,58.184358,58.260981
std,9.704024,9.78274
min,3.0,3.0
25%,55.2,55.275
50%,61.12,61.24
75%,62.8,62.875
max,88.05,88.05


# rows: 5524535
equal: 2270198, imputed > obs: 126301, imputed < obs: 143590
% equal 0.411
greater: 0.023, less: 0.026
**** values when imputed < obs ****


Unnamed: 0,lane_4_speed,lane_4_obs_speed
count,143590.0,143590.0
mean,58.944181,60.458667
std,8.565516,8.504824
min,3.02,3.025
25%,56.725,58.375
50%,61.033333,62.35
75%,63.0,65.033333
max,80.625,80.975


****values when imputed > obs *****
False    126301
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_5_speed,lane_5_obs_speed
count,834680.0,824582.0
mean,58.82221,58.882487
std,8.907645,8.989742
min,3.0,3.0
25%,56.925,57.02
50%,62.7,62.866667
75%,64.2,64.225
max,88.0,88.0


# rows: 5524535
equal: 733822, imputed > obs: 42517, imputed < obs: 48243
% equal 0.133
greater: 0.008, less: 0.009
**** values when imputed < obs ****


Unnamed: 0,lane_5_speed,lane_5_obs_speed
count,48243.0,48243.0
mean,59.320598,60.755841
std,7.83319,7.553725
min,3.133333,3.2
25%,58.0,60.2
50%,62.3,63.75
75%,63.75,64.5
max,74.15,84.0


****values when imputed > obs *****
False    42517
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_6_speed,lane_6_obs_speed
count,157727.0,155293.0
mean,59.833939,59.845897
std,8.40468,8.450482
min,3.0,3.0
25%,59.2,59.2
50%,63.4,63.466667
75%,64.3,64.3
max,77.1,77.1


# rows: 5524535
equal: 139096, imputed > obs: 7865, imputed < obs: 8332
% equal 0.025
greater: 0.001, less: 0.002
**** values when imputed < obs ****


Unnamed: 0,lane_6_speed,lane_6_obs_speed
count,8332.0,8332.0
mean,60.685959,61.573892
std,7.157928,6.932292
min,3.133333,3.15
25%,60.775,62.2
50%,63.175,64.0
75%,64.05,64.46
max,68.75,73.9


****values when imputed > obs *****
False    7865
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_7_speed,lane_7_obs_speed
count,4202.0,4167.0
mean,57.523356,57.620076
std,12.663946,12.560834
min,3.333333,3.333333
25%,58.85,58.958333
50%,63.0,63.05
75%,63.9,63.933333
max,73.3,73.3


# rows: 5524535
equal: 3636, imputed > obs: 208, imputed < obs: 323
% equal 0.001
greater: 0.0, less: 0.0
**** values when imputed < obs ****


Unnamed: 0,lane_7_speed,lane_7_obs_speed
count,323.0,323.0
mean,58.024721,59.346352
std,12.739368,12.055794
min,5.8,6.05
25%,61.7,62.473333
50%,63.1,63.666667
75%,63.755,64.229167
max,65.9,72.6


****values when imputed > obs *****
False    208
Name: obs_col_zero, dtype: int64


Unnamed: 0,lane_8_speed,lane_8_obs_speed
count,0.0,0.0
mean,,
std,,
min,,
25%,,
50%,,
75%,,
max,,


# rows: 5524535
equal: 0, imputed > obs: 0, imputed < obs: 0
% equal 0.0
greater: 0.0, less: 0.0
**** values when imputed < obs ****


Unnamed: 0,lane_8_speed,lane_8_obs_speed
count,0.0,0.0
mean,,
std,,
min,,
25%,,
50%,,
75%,,
max,,


****values when imputed > obs *****
Series([], Name: obs_col_zero, dtype: int64)
