## Check out Results

In [1]:
import geopandas as gpd
import pandas as pd

In [2]:
from segment_speed_utils.project_vars import (
    GCS_FILE_PATH,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
)
from segment_speed_utils import helpers, wrangle_shapes,sched_rt_utils
from shared_utils import schedule_rt_utils, portfolio_utils, geography_utils

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
GCS_PATH = 'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/trip_level_metrics/'

In [5]:
def check_out(df:pd.DataFrame):
    display(df.spatial_accuracy_pct.describe())
    display(df.pings_per_min.describe())
    display(df.rt_triptime_w_gtfs_pct.describe())
    display(df.rt_v_scheduled_trip_time_pct.describe())

In [6]:
mar_df = pd.read_parquet(f'{GCS_PATH}2023-03-15_metrics.parquet')

In [7]:
mar_df.columns

Index(['trip_instance_key', 'rt_service_min', 'min_w_atleast2_trip_updates',
       'total_pings_for_trip', 'total_min_w_gtfs', 'total_vp', 'vp_in_shape',
       'speed_mph', 'route_id', 'time_of_day', 'service_minutes',
       'pings_per_min', 'spatial_accuracy_pct', 'rt_triptime_w_gtfs_pct',
       'rt_v_scheduled_trip_time_pct'],
      dtype='object')

In [8]:
check_out(mar_df)

count   69494.00
mean       94.10
std        12.34
min         0.00
25%        95.24
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   83620.00
mean        2.46
std         0.68
min         0.00
25%         1.91
50%         2.86
75%         2.96
max         4.75
Name: pings_per_min, dtype: float64

count   83620.00
mean       95.20
std        14.55
min         0.28
25%        97.99
50%        99.58
75%       100.25
max       108.43
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   71797.00
mean       60.68
std       333.76
min       -87.37
25%        10.76
50%        25.19
75%        44.44
max     17909.79
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [9]:
apr_df = pd.read_parquet(f'{GCS_PATH}2023-04-12_metrics.parquet')

In [10]:
check_out(apr_df)

count   71094.00
mean       94.06
std        12.64
min         0.00
25%        95.45
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   84516.00
mean        2.45
std         0.67
min         0.01
25%         1.91
50%         2.83
75%         2.95
max         5.18
Name: pings_per_min, dtype: float64

count   84516.00
mean       95.23
std        14.48
min         0.56
25%        97.88
50%        99.53
75%       100.20
max       108.11
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   73471.00
mean       61.42
std       349.18
min       -86.02
25%        10.46
50%        25.10
75%        44.38
max     15903.70
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [11]:
may_df = pd.read_parquet(f'{GCS_PATH}2023-05-17_metrics.parquet')

In [12]:
check_out(may_df)

count   65385.00
mean       94.07
std        12.45
min         0.00
25%        95.00
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   83606.00
mean        2.47
std         0.68
min         0.00
25%         1.95
50%         2.86
75%         2.95
max         5.14
Name: pings_per_min, dtype: float64

count   83606.00
mean       95.47
std        13.78
min         0.27
25%        97.76
50%        99.51
75%       100.21
max       108.43
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   67864.00
mean       60.61
std       329.38
min       -90.09
25%        10.96
50%        25.64
75%        44.61
max     17907.92
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [13]:
jun_df = pd.read_parquet(f'{GCS_PATH}2023-06-14_metrics.parquet')

In [14]:
check_out(jun_df)

count   64270.00
mean       93.92
std        12.67
min         0.00
25%        94.77
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   80331.00
mean        2.51
std         0.67
min         0.01
25%         1.98
50%         2.90
75%         2.96
max         4.93
Name: pings_per_min, dtype: float64

count   80331.00
mean       95.92
std        13.48
min         0.28
25%        98.43
50%        99.67
75%       100.28
max       107.78
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   66748.00
mean       63.20
std       345.10
min       -90.25
25%        11.69
50%        25.91
75%        45.15
max     12997.42
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [15]:
jul_df = pd.read_parquet(f'{GCS_PATH}2023-07-12_metrics.parquet')

In [16]:
check_out(jul_df)

count   67091.00
mean       93.83
std        13.15
min         0.00
25%        94.96
50%       100.00
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   82044.00
mean        2.49
std         0.67
min         0.00
25%         1.95
50%         2.90
75%         2.97
max         5.76
Name: pings_per_min, dtype: float64

count   82044.00
mean       96.42
std        11.07
min         0.43
25%        98.28
50%        99.65
75%       100.25
max       108.60
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   69488.00
mean       56.85
std       291.52
min       -91.05
25%        11.82
50%        26.10
75%        45.69
max     17880.62
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [17]:
aug_df = pd.read_parquet(f'{GCS_PATH}2023-08-15_metrics.parquet')

In [18]:
sept_df = pd.read_parquet(f'{GCS_PATH}2023-09-13_metrics.parquet')

In [19]:
oct_df = pd.read_parquet(f'{GCS_PATH}2023-10-11_metrics.parquet')

In [20]:
nov_df = pd.read_parquet(f'{GCS_PATH}2023-11-15_metrics.parquet')

In [21]:
check_out(nov_df)

count   74891.00
mean       93.53
std        13.12
min         0.00
25%        93.94
50%        99.68
75%       100.00
max       100.00
Name: spatial_accuracy_pct, dtype: float64

count   86832.00
mean        2.51
std         0.63
min         0.00
25%         2.09
50%         2.86
75%         2.96
max         5.29
Name: pings_per_min, dtype: float64

count   86832.00
mean       95.93
std        12.15
min         0.45
25%        98.40
50%        99.67
75%       100.26
max       108.93
Name: rt_triptime_w_gtfs_pct, dtype: float64

count   77194.00
mean       42.53
std       196.80
min       -88.16
25%        11.44
50%        26.10
75%        46.17
max     11797.08
Name: rt_v_scheduled_trip_time_pct, dtype: float64

In [22]:
len(nov_df[nov_df.rt_triptime_w_gtfs_pct > 100])

30287

In [23]:
len(nov_df)

86832

In [24]:
nov_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86832 entries, 0 to 86831
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   trip_instance_key             86832 non-null  object 
 1   rt_service_min                86832 non-null  float64
 2   min_w_atleast2_trip_updates   86832 non-null  int64  
 3   total_pings_for_trip          86832 non-null  int64  
 4   total_min_w_gtfs              86832 non-null  int64  
 5   total_vp                      74891 non-null  float64
 6   vp_in_shape                   74891 non-null  float64
 7   speed_mph                     77194 non-null  float64
 8   route_id                      75619 non-null  object 
 9   time_of_day                   77194 non-null  object 
 10  service_minutes               77194 non-null  float64
 11  pings_per_min                 86832 non-null  float64
 12  spatial_accuracy_pct          74891 non-null  float64
 13  r

### Test aggregating with November

In [25]:
analysis_date = "2023-11-15"

In [26]:
nov_df.rt_triptime_w_gtfs_pct = nov_df.rt_triptime_w_gtfs_pct.mask(nov_df.rt_triptime_w_gtfs_pct > 100).fillna(100)

In [27]:
nov_df.columns

Index(['trip_instance_key', 'rt_service_min', 'min_w_atleast2_trip_updates',
       'total_pings_for_trip', 'total_min_w_gtfs', 'total_vp', 'vp_in_shape',
       'speed_mph', 'route_id', 'time_of_day', 'service_minutes',
       'pings_per_min', 'spatial_accuracy_pct', 'rt_triptime_w_gtfs_pct',
       'rt_v_scheduled_trip_time_pct'],
      dtype='object')

In [28]:
nov_df2= nov_df.drop(columns = ['route_id','service_minutes', 'time_of_day'])

In [29]:
f"{SEGMENT_GCS}trip_summary/trip_speeds_2023-11-15.parquet"

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/trip_summary/trip_speeds_2023-11-15.parquet'

In [30]:
speeds = pd.read_parquet(
    f"{SEGMENT_GCS}trip_summary/trip_speeds_2023-11-15.parquet")

#### https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/avg_speeds_by_segment.py#L135

In [31]:
keep_cols = [
        "gtfs_dataset_key",
        "direction_id", 
        "route_id", "route_short_name", "route_long_name", "route_desc", "trip_instance_key"
    ]
        
crosswalk = helpers.import_scheduled_trips(
        analysis_date, 
        columns = keep_cols, 
        get_pandas = True
    )

In [32]:
nov_df.head()

Unnamed: 0,trip_instance_key,rt_service_min,min_w_atleast2_trip_updates,total_pings_for_trip,total_min_w_gtfs,total_vp,vp_in_shape,speed_mph,route_id,time_of_day,service_minutes,pings_per_min,spatial_accuracy_pct,rt_triptime_w_gtfs_pct,rt_v_scheduled_trip_time_pct
0,a3647253d4cc8f847e972ed8c83d1b9b,22.62,22,65,23,,,,,,,2.87,,100.0,
1,7029f592047be84e5bb1d28d299be35d,16.93,16,48,17,,,,,,,2.83,,100.0,
2,1040196034fd380818a2cbcf1eafd9b8,40.95,40,118,41,,,,,,,2.88,,100.0,
3,5c6d43026fe5f02e5b31c18fcb8c0bf5,62.95,61,176,63,,,,,,,2.8,,100.0,
4,ee2f1fd83d87e85119f66014da5d74d5,14.07,13,37,15,,,,,,,2.63,,100.0,


In [33]:
crosswalk.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,route_id,route_short_name,route_long_name,route_desc,trip_instance_key
0,1770249a5a2e770ca90628434d4934b1,1.0,3402,Route 11,Route 11,PACIFIC VIEW MALL via TELEPHONE RD,ed6aa732d8c47e4df3a2f7ba9d24415c
1,1770249a5a2e770ca90628434d4934b1,1.0,3402,Route 11,Route 11,PACIFIC VIEW MALL via TELEPHONE RD,72b2c8bdebbf14039af7c57133cc23e5


In [34]:
common_shape = sched_rt_utils.most_common_shape_by_route_direction(analysis_date)

In [35]:
analysis_date

'2023-11-15'

In [36]:
common_shape.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,shape_array_key
0,014d0998350083249a9eb310635548c2,10866826,1.0,10866826:1,a7f294e50a9a8ff179d4c82cd9136625
1,014d0998350083249a9eb310635548c2,10866849,1.0,10866849:1,af9aea31d387f59024bf25fb1d9334a3


In [37]:
crosswalk2 = pd.merge(
        crosswalk,
        common_shape,
        on = ["schedule_gtfs_dataset_key", "route_id", "direction_id"],
        how = "inner"
    ).astype({"direction_id": "Int64"})

In [38]:
time_of_day = sched_rt_utils.get_trip_time_buckets(analysis_date)

In [39]:
time_of_day.head()

Unnamed: 0,trip_instance_key,service_hours,trip_first_departure_datetime_pacific,time_of_day,service_minutes
0,ed6aa732d8c47e4df3a2f7ba9d24415c,0.6,2023-11-15 15:10:00,PM Peak,36.0
1,72b2c8bdebbf14039af7c57133cc23e5,0.55,2023-11-15 16:40:00,PM Peak,33.0
2,e829a24393004fd97a0c6da2efd2b38b,0.53,2023-11-15 06:40:00,Early AM,32.0
3,05c66c28220d0a7982980194c8d479f6,0.55,2023-11-15 19:05:00,PM Peak,33.0
4,169af6136355af8248800d0c1c0916b7,0.55,2023-11-15 17:10:00,PM Peak,33.0


In [40]:
crosswalk2 = portfolio_utils.add_route_name(
        crosswalk2
    ).drop(columns = ["route_short_name", "route_long_name", "route_desc"])


In [41]:
df = pd.merge(
        nov_df2,
        crosswalk2,
        on = "trip_instance_key",
        how = "left",
    ).merge(
        time_of_day,
        on = "trip_instance_key",
        how = "left"
    )

In [42]:
df.head()

Unnamed: 0,trip_instance_key,rt_service_min,min_w_atleast2_trip_updates,total_pings_for_trip,total_min_w_gtfs,total_vp,vp_in_shape,speed_mph,pings_per_min,spatial_accuracy_pct,rt_triptime_w_gtfs_pct,rt_v_scheduled_trip_time_pct,schedule_gtfs_dataset_key,direction_id,route_id,common_shape_id,shape_array_key,route_name_used,service_hours,trip_first_departure_datetime_pacific,time_of_day,service_minutes
0,a3647253d4cc8f847e972ed8c83d1b9b,22.62,22,65,23,,,,2.87,,100.0,,,,,,,,,NaT,,
1,7029f592047be84e5bb1d28d299be35d,16.93,16,48,17,,,,2.83,,100.0,,,,,,,,,NaT,,
2,1040196034fd380818a2cbcf1eafd9b8,40.95,40,118,41,,,,2.88,,100.0,,,,,,,,,NaT,,
3,5c6d43026fe5f02e5b31c18fcb8c0bf5,62.95,61,176,63,,,,2.8,,100.0,,,,,,,,,NaT,,
4,ee2f1fd83d87e85119f66014da5d74d5,14.07,13,37,15,,,,2.63,,100.0,,,,,,,,,NaT,,


#### https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/export.py#L150
* Should I still drop overly quick speeds?

In [43]:
df2 = df.loc[df.speed_mph <= 70].reset_index(drop = True)

In [44]:
len(df2), len(df)

(77170, 86832)

In [45]:
route_cols = [
        "schedule_gtfs_dataset_key", "time_of_day",
        "route_id", "direction_id",
        "route_name_used",
        "common_shape_id", "shape_array_key"
    ]

In [46]:
df2.sample(2)

Unnamed: 0,trip_instance_key,rt_service_min,min_w_atleast2_trip_updates,total_pings_for_trip,total_min_w_gtfs,total_vp,vp_in_shape,speed_mph,pings_per_min,spatial_accuracy_pct,rt_triptime_w_gtfs_pct,rt_v_scheduled_trip_time_pct,schedule_gtfs_dataset_key,direction_id,route_id,common_shape_id,shape_array_key,route_name_used,service_hours,trip_first_departure_datetime_pacific,time_of_day,service_minutes
62094,e77edc665d02c0c34d16290e6d3caad6,38.32,38,112,39,112.0,111.0,12.84,2.92,99.11,100.0,53.27,7cc0cb1871dfd558f11a2885c145d144,1,37,3751,347b53a6f1c73d92b970c56cee4aa2d1,Weekdays 6am-9pm Weekends 9am-9pm,0.42,2023-11-15 22:00:00,Evening,25.0
25552,ecb5631c1349a15c70b673e25fc54827,47.93,47,141,48,141.0,140.0,10.13,2.94,99.29,100.0,22.91,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,0,690-13168,6900006_JUNE23,1a819c320e5f644df0a9bbabc869b049,SYLMAR OLIVE VIEW HOSP. - SUNLAND VIA FOOTHILL BL,0.65,2023-11-15 16:32:00,PM Peak,39.0


In [47]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77170 entries, 0 to 77169
Data columns (total 22 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   trip_instance_key                      77170 non-null  object        
 1   rt_service_min                         77170 non-null  float64       
 2   min_w_atleast2_trip_updates            77170 non-null  int64         
 3   total_pings_for_trip                   77170 non-null  int64         
 4   total_min_w_gtfs                       77170 non-null  int64         
 5   total_vp                               74245 non-null  float64       
 6   vp_in_shape                            74245 non-null  float64       
 7   speed_mph                              77170 non-null  float64       
 8   pings_per_min                          77170 non-null  float64       
 9   spatial_accuracy_pct                   74245 non-null  float6

In [48]:
df3 = (df2.groupby(route_cols)
           .agg({
               "service_minutes": "mean",
               "rt_service_min": "mean",
               "speed_mph": "mean",
               "pings_per_min":"mean",
               "total_vp":"mean",
               "vp_in_shape":"mean",
               "trip_instance_key": "count"
           }).reset_index()
)     

In [49]:
df4 = df3.assign(
        rt_service_min = df2.rt_service_min.round(1),
        service_minutes = df2.service_minutes.round(1),
        speed_mph = df2.speed_mph.round(1)
    ).rename(columns = {
        "service_minutes": "avg_sched_trip_min",
        "rt_service_min": "avg_rt_trip_min",
        "trip_instance_key": "n_trips",
        "route_name_used": "route_name",
        "schedule_gtfs_dataset_key": "gtfs_dataset_key"
    })

In [50]:
org_crosswalk = (
        schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk(
            df4,
            analysis_date,
            quartet_data = "schedule",
            dim_gtfs_dataset_cols = ["key", "base64_url"],
            dim_organization_cols = ["source_record_id", 
                                     "name", "caltrans_district"])
    )

In [60]:
org_crosswalk.shape, org_crosswalk.schedule_gtfs_dataset_key.nunique()

((82, 5), 82)

In [59]:
org_crosswalk.head()

Unnamed: 0,schedule_gtfs_dataset_key,base64_url,organization_source_record_id,organization_name,caltrans_district
0,015d67d5b75b5cf2b710bbadadfb75f5,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland
1,07d3b79f14cec8099119e1eb649f065b,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3Rmcy90YWhvZS1jYS11cy90YWhvZS1jYS11cy56aXA=,rec3u4aMplqObcoTR,Tahoe Transportation District,03 - Marysville
2,0881af3822466784992a49f1cc57d38f,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1TQQ==,recUmm4gcNXaqrwpn,Sonoma-Marin Area Rail Transit District,04 - Oakland
3,09a703757d1ed14ca9580b1385e39315,aHR0cHM6Ly9yaWRlbGF3bmRhbGViZWF0LmNvbS9ndGZz,recj8LXdeSurpSRNU,City of Lawndale,07 - Los Angeles
4,09e16227fc42c4fe90204a9d11581034,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1TTw==,recRM3c9Zfaft4V2B,Cloverdale Transit,04 - Oakland


In [51]:
df_with_org = pd.merge(
        df4,
        org_crosswalk.rename(columns = {
            "schedule_gtfs_dataset_key": "gtfs_dataset_key"}),
        on = "gtfs_dataset_key",
        how = "inner"
    )

In [52]:
df_with_org.shape

(11229, 18)

In [53]:
df_with_org.head()

Unnamed: 0,gtfs_dataset_key,time_of_day,route_id,direction_id,route_name,common_shape_id,shape_array_key,avg_sched_trip_min,avg_rt_trip_min,speed_mph,pings_per_min,total_vp,vp_in_shape,n_trips,base64_url,organization_source_record_id,organization_name,caltrans_district
0,015d67d5b75b5cf2b710bbadadfb75f5,AM Peak,17,0,Downtown San Rafael - Sausalito,104,4362491ca6006e88a076e2eaaf693762,58.0,62.4,22.2,2.75,205.0,162.8,5,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland
1,015d67d5b75b5cf2b710bbadadfb75f5,AM Peak,17,1,Downtown San Rafael - Sausalito,109,fe2f063891238d0132d4543c537cf574,58.0,67.7,21.6,2.73,181.6,166.6,5,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland
2,015d67d5b75b5cf2b710bbadadfb75f5,AM Peak,219,0,Tiburon - Strawberry,56,168629b6e90c8b94b78dd60c8eeaf0ca,58.0,127.4,8.1,2.91,119.0,113.25,4,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland
3,015d67d5b75b5cf2b710bbadadfb75f5,AM Peak,219,1,Tiburon - Strawberry,58,6457a064626d1b0abdce22dd159fbb2d,58.0,152.0,9.4,2.93,86.75,85.75,4,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland
4,015d67d5b75b5cf2b710bbadadfb75f5,AM Peak,22,0,Downtown San Rafael - Marin City,50,0e10aded9dede712f3c623c5deae87a4,55.0,76.3,28.6,2.95,90.33,53.33,6,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland


In [54]:
shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns = ["shape_array_key", "geometry"],
        get_pandas = True,
        crs = geography_utils.WGS84
    )
    

In [55]:
shapes.shape

(7303, 2)

In [67]:
df_with_shape = pd.merge(
        shapes,
        df_with_org,
        on = "shape_array_key", # once merged, can drop shape_array_key
        how = "inner"
    )

In [68]:
final_df['avg_pct_vp_shape'] = final_df.vp_in_shape / final_df.total_vp * 100

In [69]:
final_df['avg_pct_rt_v_sched'] = (final_df.avg_rt_trip_min / final_df.avg_sched_trip_min - 1) * 100

In [70]:
final_df = final_df.drop(columns = ['total_vp','vp_in_shape'])

In [71]:
final_df = final_df.rename(columns = {'pings_per_min':'avg_pings_per_min'})

In [72]:
final_df = final_df.rename(
        columns = {"organization_source_record_id": "org_id",
                   "organization_name": "agency", 
                   "caltrans_district": "district_name"
                  })

In [74]:
final_df.shape

(11229, 19)

In [75]:
final_df.avg_pings_per_min.describe()

count   11229.00
mean        2.46
std         0.55
min         0.04
25%         1.97
50%         2.75
75%         2.93
max         3.10
Name: avg_pings_per_min, dtype: float64

In [76]:
final_df.drop(columns = ['geometry', 'base64_url']).head()

Unnamed: 0,shape_array_key,gtfs_dataset_key,time_of_day,route_id,direction_id,route_name,common_shape_id,avg_sched_trip_min,avg_rt_trip_min,speed_mph,avg_pings_per_min,n_trips,org_id,agency,district_name,avg_pct_vp_shape,avg_pct_rt_v_sched
0,f73c9e5905f12d7392b4fff9a31c83ce,f1b35a50955aeb498533c1c6fdafbe44,AM Peak,51,0,LONG BEACH BLVD,510078,57.0,79.2,4.9,1.62,13,rec00qSzZL8KqiXAo,Long Beach Transit,07 - Los Angeles,92.61,38.95
1,f73c9e5905f12d7392b4fff9a31c83ce,f1b35a50955aeb498533c1c6fdafbe44,Early AM,51,0,LONG BEACH BLVD,510078,47.0,52.4,6.0,1.83,8,rec00qSzZL8KqiXAo,Long Beach Transit,07 - Los Angeles,87.61,11.49
2,f73c9e5905f12d7392b4fff9a31c83ce,f1b35a50955aeb498533c1c6fdafbe44,Evening,51,0,LONG BEACH BLVD,510078,44.0,68.6,4.8,1.87,4,rec00qSzZL8KqiXAo,Long Beach Transit,07 - Los Angeles,96.53,55.91
3,f73c9e5905f12d7392b4fff9a31c83ce,f1b35a50955aeb498533c1c6fdafbe44,Midday,51,0,LONG BEACH BLVD,510078,51.0,45.5,16.8,1.69,20,rec00qSzZL8KqiXAo,Long Beach Transit,07 - Los Angeles,96.03,-10.78
4,f73c9e5905f12d7392b4fff9a31c83ce,f1b35a50955aeb498533c1c6fdafbe44,PM Peak,51,0,LONG BEACH BLVD,510078,75.0,72.4,20.3,1.65,16,rec00qSzZL8KqiXAo,Long Beach Transit,07 - Los Angeles,95.01,-3.47
