# Adding & Calulating Door Data for 5311 Agencies

### Script work and analysis for caculating fleet age and doors for agencies

* adding door counts using [Eric Dasmalchi's Notebook](https://github.com/cal-itp/data-analyses/blob/572e255f997e7fa0969c88caf23a7791070ad28a/lossan_validators/lossan_validators.ipynb) count for the known 5311 Agencies
 
 
`({'bus':2, 'articulated_bus':3,
'over_the_road_bus':1,
'vintage_historic_trolley':2,
'trolleybus':2,
'van':1,
'cutaway':1, 
'automobile':1,
minivan':1,
'sport_utility_vehicle:':1,
'lrt_stn':6, 'hrt_stn':15,
'la_union_stn':40,
'commuter_rail_stn':6,
'interchange':20, ## interchange is for places like 7th/Metro
'ferry_stn':4})`

In [1]:
import numpy as np
import pandas as pd
from siuba import *
import altair as alt
import altair_saver
from calitp import *
from plotnine import *
import intake



In [2]:
from shared_utils import geography_utils



In [3]:
import data_prep

In [4]:
#df = data_prep.load_grantprojects()
vehicles = data_prep.load_vehiclesdata()
cw1 = pd.read_parquet("test_crosswalk_both.parquet")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
vehicles.sample()

Unnamed: 0,agency,city,state,legacy_ntd_id,ntd_id,organization_type,reporter_type,primary_uza_population,agency_voms,vehicle_type,years_old:,_13_15,_16_20,_21_25,_26_30,_31_60,_60+,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle,_0_9,_10_12,vehicle_groups
4449,San Luis Obispo Council of Governments,San Luis Obispo,CA,,90297,"MPO, COG or Other Planning Agency",Reduced Reporter,59219,13,Sports Utility Vehicle,,0,0,0,0,0,0,3,1.333333,30471.3333,3,0,Automobiles


In [6]:
vehicles.primary_uza_population.unique()

array([12150996,  3281212,  2956746,    87941,  1664496,  1723634,
         328454,  1932666,   654628,   615968,   370583,   114237,
         163703,   345580,   277634,   195861,   523994,   341219,
         258653,        0,   367260,   308231,   358172,   136969,
          59219,    83913,    98176,   219454,   133683,   165074,
          72794,   116719,   130447,   214811,   107672,   117731,
          78413,   583681,    71772,   125206,    93141,    68738,
          64078,    87569,    70272,    51509,    99904,    54372,
          83578,    65088])

In [7]:
vehicles>>group_by(_.reporter_type)>>summarize(n=_.agency.nunique()) >> arrange(-_.n)

Unnamed: 0,reporter_type,n
1,Full Reporter,82
3,Reduced Reporter,76
4,Rural Reporter,52
2,Reduced Asset Reporter,7
0,Asset Subrecipient,1


In [8]:
cw1.head()

Unnamed: 0,agency_x,ntd_id,itp_id
41,Fresno County Rural Transit Agency,9R02-91007,117.0
58,Eastern Sierra Transit Authority,9R02-91062,99.0
61,Kern Regional Transit,9R02-91059,146.0
73,Lake Transit Authority,9R02-91053,159.0
81,Mendocino Transit Authority,9R02-91047,198.0


In [9]:
cw1 = cw1.rename(columns={"agency_x": "agency"})

In [10]:
df = cw1.merge(vehicles, how="outer", on="agency", indicator=True)

In [11]:
df>>count(_._merge)

Unnamed: 0,_merge,n
0,left_only,0
1,right_only,587
2,both,120


In [12]:
df>>filter(_._merge=="right_only")>>count(_.agency)

Unnamed: 0,agency,n
0,Access Services,6
1,"Alameda-Contra Costa Transit District, dba: AC...",8
2,Altamont Corridor Express,5
3,Anaheim Transportation Network,4
4,Antelope Valley Transit Authority,6
...,...,...
169,"Wasco, City of",2
170,Western Contra Costa Transit Authority,6
171,"Yolo County Transportation District, dba: Yolobus",5
172,Yuba-Sutter Transit Authority,4


In [13]:
df>>filter(_._merge=="both")>>count(_.agency)>>arrange(-_.n)

Unnamed: 0,agency,n
24,Fresno County Rural Transit Agency,6
1,Amador Regional Transit System,5
20,Colusa County Transit Agency,5
23,Eastern Sierra Transit Authority,5
33,"Mountain Area Regional Transit Authority, dba:...",5
38,Tehama County,5
2,City of Arcata,4
8,City of Dinuba,4
22,"County of Siskiyou, dba: Siskiyou County Transit",4
26,Humboldt Transit Authority,4


## Put it in a script:

In [14]:
def get_age_and_doors(df):   
    d = {
        'Articulated Bus':3,
        'Automobile':1,
        'Automated Guideway Vehicle':2,
        'Automobiles (Service)':0,
        'Bus':2,
        'Cable Car':2,
        'Commuter Rail Locomotive':0,
        'Commuter Rail Passenger Coach':2,
        'Commuter Rail Self-Propelled Passenger Car':2,
        'Cutaway':1,
        'Double Decker Bus':2,
        'Ferryboat':2,
        'Heavy Rail Passenger Car':2,
        'Light Rail Vehicle':2,
        'Minivan':1,
        'Other':0,
        'Over-the-road Bus':1,
        'Sports Utility Vehicle':1,
        'Steel Wheel Vehicles (Service)':0,
        'Trolleybus':2,
        'Trucks and other Rubber Tire Vehicles (Service)':0,
        'Van':1,
        'Vintage Trolley':2
    }

    age = geography_utils.aggregate_by_geography(df, 
                           group_cols = ["agency"],
                           sum_cols = ["total_vehicles", "_0_9","_10_12", "_13_15", "_16_20","_21_25","_26_30","_31_60","_60+"],
                           mean_cols = ["average_age_of_fleet__in_years_", "average_lifetime_miles_per_vehicle"]
                                          ).sort_values(["agency","total_vehicles"], ascending=[True, True])
    age = age.rename(columns={'_60+': '_60plus'})

    older = (age.query('_21_25 != 0 or _26_30 != 0 or _31_60 != 0 or _60plus!=0'))
    older["sum_15plus"] = older[["_16_20","_21_25","_26_30","_31_60","_60plus"]].sum(axis=1)
    older = (older>>select(_.agency, _.sum_15plus))

    age = pd.merge(age, older, on=['agency'], how='left')

    df['doortype'] = (df['vehicle_type'].map(d))
    df['door_count']= df['total_vehicles']*df['doortype']

    doors = geography_utils.aggregate_by_geography(df, 
                           group_cols = ["agency"],
                           sum_cols = ["door_count"],
                           mean_cols = ["door_count"], 
                           nunique_cols = ["vehicle_type"])
    doors.rename(columns={'door_count_x': 'sum_doors', 'door_count_y': 'avg_doors'}, inplace=True)

    agency_counts = pd.merge(age, doors, on=['agency'], how='left')

    return agency_counts

In [15]:
agency_counts = get_age_and_doors(vehicles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
#agency_counts.to_parquet("agency_door_vehicle_counts.parquet")

In [16]:
agency_counts

Unnamed: 0,agency,_0_9,_10_12,_13_15,_16_20,_21_25,_26_30,_31_60,_60plus,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle,sum_15plus,sum_doors,avg_doors,vehicle_type
0,Access Services,918,9,2,0,0,0,0,0,929,5.964027,154171.849925,,912,152.00,6
1,"Alameda-Contra Costa Transit District, dba: AC...",708,109,103,112,8,2,0,0,1042,8.244583,256035.690100,122.0,1568,196.00,8
2,Alpine County Local Transportation Commission,1,1,0,0,0,0,0,0,2,9.000000,0.000000,,2,1.00,2
3,Altamont Corridor Express,8,4,5,16,10,0,0,0,43,19.000000,499140.935350,26.0,58,11.60,5
4,Amador Regional Transit System,18,3,1,1,1,0,0,0,24,4.065359,0.000000,2.0,21,4.20,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,Western Contra Costa Transit Authority,56,12,7,6,1,0,0,0,82,7.480000,223090.097300,7.0,117,19.50,6
214,"Yolo County Transportation District, dba: Yolobus",48,21,0,8,8,0,0,0,85,7.554598,328419.482767,16.0,134,26.80,5
215,Yosemite Area Regional Transportation System,9,1,0,0,0,0,0,0,10,7.600000,0.000000,,10,10.00,1
216,Yuba-Sutter Transit Authority,48,3,0,1,0,0,0,0,52,4.035256,134500.657200,,73,18.25,4


## Script work

### How many agencies have vehicles over 15 years old?

In [15]:
# summary of the vehicle counts

In [16]:
age =geography_utils.aggregate_by_geography(vehicles, 
                       group_cols = ["agency"],
                       sum_cols = ["total_vehicles", "_0_9","_10_12", "_13_15", "_16_20","_21_25","_26_30","_31_60","_60+"],
                       mean_cols = ["average_age_of_fleet__in_years_", "average_lifetime_miles_per_vehicle"]
                                      ).sort_values(["agency","total_vehicles"], ascending=[True, True])

In [17]:
age = age.rename(columns={'_60+': '_60plus'})

In [18]:
age

Unnamed: 0,agency,_0_9,_10_12,_13_15,_16_20,_21_25,_26_30,_31_60,_60plus,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle
2,Access Services,918,9,2,0,0,0,0,0,929,5.964027,154171.849925
6,"Alameda-Contra Costa Transit District, dba: AC...",708,109,103,112,8,2,0,0,1042,8.244583,256035.690100
203,Alpine County Local Transportation Commission,1,1,0,0,0,0,0,0,2,9.000000,0.000000
67,Altamont Corridor Express,8,4,5,16,10,0,0,0,43,19.000000,499140.935350
125,Amador Regional Transit System,18,3,1,1,1,0,0,0,24,4.065359,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
47,Western Contra Costa Transit Authority,56,12,7,6,1,0,0,0,82,7.480000,223090.097300
50,"Yolo County Transportation District, dba: Yolobus",48,21,0,8,8,0,0,0,85,7.554598,328419.482767
132,Yosemite Area Regional Transportation System,9,1,0,0,0,0,0,0,10,7.600000,0.000000
66,Yuba-Sutter Transit Authority,48,3,0,1,0,0,0,0,52,4.035256,134500.657200


In [19]:
age>>select(_.agency,_.average_age_of_fleet__in_years_)>>arrange(-_.average_age_of_fleet__in_years_)

Unnamed: 0,agency,average_age_of_fleet__in_years_
25,"Peninsula Corridor Joint Powers Board, dba: Ca...",28.864771
3,"City and County of San Francisco, dba: San Fra...",20.247938
8,Santa Clara Valley Transportation Authority,19.841354
4,San Diego Metropolitan Transit System,19.713625
67,Altamont Corridor Express,19.000000
...,...,...
26,San Joaquin Council,1.034883
157,City of Bell Gardens,1.000000
199,Elk Valley Rancheria,1.000000
7,San Diego Association of Governments,0.783883


In [20]:
print(f"the average fleet age of Rural Operators in California is {age.average_age_of_fleet__in_years_.mean()}")

the average fleet age of Rural Operators in California is 7.0317539832406135


In [21]:
print(f"there are {(len(age>>filter(_.average_age_of_fleet__in_years_<=5)))} rural operators with an average fleet age less than 5 years in California")

there are 58 rural operators with an average fleet age less than 5 years in California


In [22]:
print(f"there are {(len(age>>filter(_.average_age_of_fleet__in_years_>=10)))} rural operators with an average fleet age more than 5 years in California")

there are 31 rural operators with an average fleet age more than 5 years in California


In [23]:
age>>filter(_.average_age_of_fleet__in_years_>=10)>>select(_.agency)


Unnamed: 0,agency
67,Altamont Corridor Express
197,Chemehuevi Indian Tribe
3,"City and County of San Francisco, dba: San Fra..."
167,City of Bellflower
168,City of Beverly Hills
188,City of California City
97,City of Cerritos
151,"City of Corcoran, dba: Corcoran Area Transit"
149,City of Cudahy
215,City of El Segundo


In [24]:
older = (age.query('_21_25 != 0 or _26_30 != 0 or _31_60 != 0 or _60plus!=0'))

In [25]:
print(f"There are {len(older>>count(_.agency))} agencies with vehicles over 15 years old")

There are 36 agencies with vehicles over 15 years old


In [78]:
older.describe()

Unnamed: 0,_0_9,_10_12,_13_15,_16_20,_21_25,_26_30,_31_60,_60plus,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,283.166667,58.027778,23.361111,28.694444,16.138889,6.361111,19.388889,2.305556,437.444444,10.537363,286812.1
std,650.194059,134.723948,56.848245,61.27867,40.30041,16.54285,82.316906,12.644184,919.584951,5.521069,258076.7
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.439955,0.0
25%,21.0,4.75,1.0,0.75,1.0,0.0,0.0,0.0,41.5,6.996267,149048.8
50%,52.0,19.5,6.5,3.0,2.0,0.5,0.0,0.0,84.5,8.726812,253338.9
75%,178.0,53.25,16.0,32.75,8.5,1.25,1.0,0.0,384.25,12.676372,344382.7
max,3668.0,798.0,324.0,283.0,218.0,84.0,479.0,76.0,5142.0,28.864771,1293594.0


In [79]:
older["sum_15plus"] = older[["_16_20","_21_25","_26_30","_31_60","_60plus"]].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [80]:
older>>select(_.agency, _.sum_15plus)>>arrange(-_.sum_15plus)

Unnamed: 0,agency,sum_15plus
9,San Francisco Bay Area Rapid Transit District,624
3,"City and County of San Francisco, dba: San Fra...",385
0,Los Angeles County Metropolitan Transportation...,352
8,Santa Clara Valley Transportation Authority,319
25,"Peninsula Corridor Joint Powers Board, dba: Ca...",217
19,"Southern California Regional Rail Authority, d...",149
6,"Alameda-Contra Costa Transit District, dba: AC...",122
12,"Sacramento Regional Transit District, dba: Sac...",114
17,North County Transit District,100
4,San Diego Metropolitan Transit System,60


* Nine agencies have more than 100 vehicles older than 15 years

In [81]:
older = (older>>select(_.agency, _.sum_15plus))

In [82]:
age = pd.merge(age, older, on=['agency'], how='left')

In [83]:
age

Unnamed: 0,agency,_0_9,_10_12,_13_15,_16_20,_21_25,_26_30,_31_60,_60plus,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle,sum_15plus
0,Access Services,918,9,2,0,0,0,0,0,929,5.964027,154171.849925,
1,"Alameda-Contra Costa Transit District, dba: AC...",708,109,103,112,8,2,0,0,1042,8.244583,256035.690100,122.0
2,Alpine County Local Transportation Commission,1,1,0,0,0,0,0,0,2,9.000000,0.000000,
3,Altamont Corridor Express,8,4,5,16,10,0,0,0,43,19.000000,499140.935350,26.0
4,Amador Regional Transit System,18,3,1,1,1,0,0,0,24,4.065359,0.000000,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,Western Contra Costa Transit Authority,56,12,7,6,1,0,0,0,82,7.480000,223090.097300,7.0
214,"Yolo County Transportation District, dba: Yolobus",48,21,0,8,8,0,0,0,85,7.554598,328419.482767,16.0
215,Yosemite Area Regional Transportation System,9,1,0,0,0,0,0,0,10,7.600000,0.000000,
216,Yuba-Sutter Transit Authority,48,3,0,1,0,0,0,0,52,4.035256,134500.657200,


### Get a count of doors

In [84]:
vehicles.sample()

Unnamed: 0,agency,city,state,legacy_ntd_id,ntd_id,organization_type,reporter_type,primary_uza_population,agency_voms,vehicle_type,years_old:,_13_15,_16_20,_21_25,_26_30,_31_60,_60+,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle,_0_9,_10_12,vehicle_groups,doortype,door_count
2794,Imperial County Transportation Commission,El Centro,CA,9226,90226,Independent Public Agency or Authority of Tran...,Full Reporter,107672,28,Cutaway,,0,0,0,0,0,0,26,3.807692,115903.7308,26,0,Vans,1,26


In [85]:
vehicles>>count(_.vehicle_type)>>arrange(-_.n)

Unnamed: 0,vehicle_type,n
9,Cutaway,170
4,Bus,133
20,Trucks and other Rubber Tire Vehicles (Service),94
3,Automobiles (Service),79
21,Van,64
14,Minivan,63
16,Over-the-road Bus,24
0,Articulated Bus,16
2,Automobile,11
17,Sports Utility Vehicle,10


In [86]:
#counting the number of doors on each individual vehicle. 
d = {
    'Articulated Bus':3,
    'Automobile':1,
    'Automated Guideway Vehicle':2,
    'Automobiles (Service)':0,
    'Bus':2,
    'Cable Car':2,
    'Commuter Rail Locomotive':0,
    'Commuter Rail Passenger Coach':2,
    'Commuter Rail Self-Propelled Passenger Car':2,
    'Cutaway':1,
    'Double Decker Bus':2,
    'Ferryboat':2,
    'Heavy Rail Passenger Car':2,
    'Light Rail Vehicle':2,
    'Minivan':1,
    'Other':0,
    'Over-the-road Bus':1,
    'Sports Utility Vehicle':1,
    'Steel Wheel Vehicles (Service)':0,
    'Trolleybus':2,
    'Trucks and other Rubber Tire Vehicles (Service)':0,
    'Van':1,
    'Vintage Trolley':2
}

# dictionary = {
#     'Articulated Bus':'articulated_bus',
#     'Automobiles':'automobile',
#     'Automated Guideway Vehicle':'automated_guideway_vehicle',
#     'Automobiles (Service)':'automobile_service',
#     'Bus':'bus',
#     'Cable Car':'cable_car',
#     'Commuter Rail Locomotive':'commuter_rail_locomotive',
#     'Commuter Rail Passenger Coach':'commuter_rail_passenger_coach',
#     'Commuter Rail Self-Propelled Passenger Car':'commuter_rail_selfp_passenger_coach',
#     'Cutaway':'cutaway',
#     'Double Decker Bus':'double_decker_bus',
#     'Ferryboat':'ferryboat',
#     'Heavy Rail Passenger Car':'heavy_rail_passenger_car',
#     'Light Rail Vehicle':'light_rail_vehicle',
#     'Minivan':'minivan',
#     'Other':'other',
#     'Over-the-road Bus':'over_the_road_bus',
#     'Sports Utility Vehicle':'sport_utility_vehicle',
#     'Steel Wheel Vehicles (Service)':'steel_wheel_vehicles_service',
#     'Trolleybus':'trolleybus',
#     'Trucks and other Rubber Tire Vehicles (Service)':'trucks_and_other_rubber_tire_vehicles_service',
#     'Van':'van',
#     'Vintage Trolley':'vintage_historic_trolley'
# }

In [87]:
vehicles['doortype'] = (vehicles['vehicle_type'].map(d))

In [88]:
#pd.to_numeric(vehicles['doortype'])

In [89]:
vehicles['door_count']= vehicles['total_vehicles']*vehicles['doortype']

In [90]:
vehicles.sample()

Unnamed: 0,agency,city,state,legacy_ntd_id,ntd_id,organization_type,reporter_type,primary_uza_population,agency_voms,vehicle_type,years_old:,_13_15,_16_20,_21_25,_26_30,_31_60,_60+,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle,_0_9,_10_12,vehicle_groups,doortype,door_count
2728,Lake Transit Authority,Lower Lake,CA,9R02-001,9R02-91053,Independent Public Agency or Authority of Tran...,Rural Reporter,0,29,Minivan,,0,0,0,0,0,0,4,4.0,0.0,4,0,Vans,1,4


In [91]:
doors = geography_utils.aggregate_by_geography(vehicles, 
                       group_cols = ["agency"],
                       sum_cols = ["door_count"],
                       mean_cols = ["door_count"], 
                       nunique_cols = ["vehicle_type"])
doors.rename(columns={'door_count_x': 'sum_doors', 'door_count_y': 'avg_doors'}, inplace=True)

In [92]:
doors.sample(5)

Unnamed: 0,agency,sum_doors,avg_doors,vehicle_type
94,"Mountain Area Regional Transit Authority, dba:...",39,7.8,5
125,Amador Regional Transit System,21,4.2,5
119,Santa Barbara County Association of Governments,17,17.0,1
121,Morongo Basin Transit Authority,56,14.0,4
187,City of Woodlake,3,3.0,1


### Adding dfs together

In [94]:
agency_counts = pd.merge(age, doors, on=['agency'], how='left')

In [95]:
agency_counts

Unnamed: 0,agency,_0_9,_10_12,_13_15,_16_20,_21_25,_26_30,_31_60,_60plus,total_vehicles,average_age_of_fleet__in_years_,average_lifetime_miles_per_vehicle,sum_15plus,sum_doors,avg_doors,vehicle_type
0,Access Services,918,9,2,0,0,0,0,0,929,5.964027,154171.849925,,912,152.00,6
1,"Alameda-Contra Costa Transit District, dba: AC...",708,109,103,112,8,2,0,0,1042,8.244583,256035.690100,122.0,1568,196.00,8
2,Alpine County Local Transportation Commission,1,1,0,0,0,0,0,0,2,9.000000,0.000000,,2,1.00,2
3,Altamont Corridor Express,8,4,5,16,10,0,0,0,43,19.000000,499140.935350,26.0,58,11.60,5
4,Amador Regional Transit System,18,3,1,1,1,0,0,0,24,4.065359,0.000000,2.0,21,4.20,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,Western Contra Costa Transit Authority,56,12,7,6,1,0,0,0,82,7.480000,223090.097300,7.0,117,19.50,6
214,"Yolo County Transportation District, dba: Yolobus",48,21,0,8,8,0,0,0,85,7.554598,328419.482767,16.0,134,26.80,5
215,Yosemite Area Regional Transportation System,9,1,0,0,0,0,0,0,10,7.600000,0.000000,,10,10.00,1
216,Yuba-Sutter Transit Authority,48,3,0,1,0,0,0,0,52,4.035256,134500.657200,,73,18.25,4
