In [1]:
import pandas as pd
from siuba import *
import numpy as np
from calitp import *

In [10]:
# Load the DLA place-name table; the first CSV column becomes the row index.
dla = pd.read_csv(
    'gs://calitp-analytics-data/data-analyses/dla/dla_placenames.csv',
    index_col=[0],
)

In [11]:
dla

Unnamed: 0,ct_code,dla_name,data_origin,organization_type
0,6302,Humboldt Bay Harbor Recreation & Conservation ...,DLA Locode,
1,6330,Willow Creek Community Services District,DLA Locode,
2,5036,Trinidad,DLA Locode,City/Town
3,5049,Ukiah,DLA Locode,City/Town
4,5082,Willits,DLA Locode,City/Town
...,...,...,...,...
1036,5465,Leave Blank,DLA Locode,
1037,6250,U.S. Fish and Wildlife Service,DLA Locode,
1038,6031,Tidewater Southern Railway Company,DLA Locode,
1039,6176,Tri-Counties Regional Park Group,DLA Locode,


In [12]:
# Keep only the locode, the agency name and the provenance label;
# every other column of the loaded table is dropped.
dla = dla[['ct_code', 'dla_name', 'data_origin']]

In [13]:
## importing previous NTD/ITP merge from 5310 analysis

In [14]:
# Load the earlier NTD/ITP merge and tidy it in a single non-mutating chain.
# The previous version called `fillna(..., inplace=True)` / `replace(..., inplace=True)`
# on a column reached through the frame, a chained in-place pattern that pandas
# deprecates and that no longer updates the frame under Copy-on-Write.
m1 = (
    pd.read_csv("gs://calitp-analytics-data/data-analyses/dla/ntd_itp_merge_.csv")
    # Fill gaps in ntd_id_x with the value from ntd_id_y.
    .assign(ntd_id_x=lambda df: df["ntd_id_x"].fillna(df["ntd_id_y"]))
    # Keep only the columns used downstream (ntd_id_y is no longer needed).
    .loc[:, [
        "ntd_id_x",
        "agency_name_ntd",
        "name_airtable",
        "doing_business_as",
        "mobility_services_operated",
        "itp_id",
        "_merge",
    ]]
    # Translate the merge indicator into human-readable provenance labels.
    .assign(
        _merge=lambda df: df["_merge"].replace(
            {"right_only": "In Airtable", "left_only": "In NTD", "both": "In NTD, Airtable"}
        )
    )
    .rename(columns={"_merge": "data_origin", "ntd_id_x": "ntd_id"})
)

In [15]:
m1.sample()

Unnamed: 0,ntd_id,agency_name_ntd,name_airtable,doing_business_as,mobility_services_operated,itp_id,data_origin
45,90238,City of Delano,City of Delano,,"Delano Area Rapid Transit,Delano Dial-A-Ride",91.0,"In NTD, Airtable"


In [16]:
# Sanity check: count rows per agency_name_ntd and show only names that occur
# more than once. The result should be empty, i.e. no NTD agency name is repeated.
m1>>count(_.agency_name_ntd)>>arrange(-_.n)>>filter(_.n>1)

Unnamed: 0,agency_name_ntd,n


In [17]:
m1.sample()

Unnamed: 0,ntd_id,agency_name_ntd,name_airtable,doing_business_as,mobility_services_operated,itp_id,data_origin
757,,,Solano Transportation Authority,,Solano Mobility Local Taxi Card Program,,In Airtable


In [18]:
m1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 853 entries, 0 to 852
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ntd_id                      242 non-null    object 
 1   agency_name_ntd             238 non-null    object 
 2   name_airtable               825 non-null    object 
 3   doing_business_as           86 non-null     object 
 4   mobility_services_operated  442 non-null    object 
 5   itp_id                      372 non-null    float64
 6   data_origin                 853 non-null    object 
dtypes: float64(1), object(6)
memory usage: 46.8+ KB


In [19]:
# condensing agency name column. using NTD first, then filling in blanks with Airtable

In [20]:
# Use the NTD agency name where present and fall back to the Airtable name.
# The result is assigned back explicitly: a chained `fillna(..., inplace=True)`
# on a column accessed through the frame is deprecated in pandas and does not
# update the frame once Copy-on-Write is enabled.
m1['agency_name_ntd'] = m1['agency_name_ntd'].fillna(m1['name_airtable'])


In [21]:
# The backfilled column is now the single agency name, so the separate
# Airtable name column is removed.
m1 = (
    m1
    .rename(columns={'agency_name_ntd': 'name'})
    .drop(columns=['name_airtable'])
)

## Merging with DLA Locode

##### first on `name`

In [22]:
# Outer-join the DLA table to the NTD/Airtable table on exact agency name.
# `merge_status` records whether a row came from dla only, m1 only, or both.
# Both inputs have a `data_origin` column, so the result carries
# `data_origin_x` (from dla) and `data_origin_y` (from m1).
cw1 = dla.merge(
    m1,
    how='outer',
    left_on='dla_name',
    right_on='name',
    indicator='merge_status',
)

In [23]:
cw1.merge_status.value_counts()

left_only     936
right_only    747
both          106
Name: merge_status, dtype: int64

In [24]:
cw1

Unnamed: 0,ct_code,dla_name,data_origin_x,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,data_origin_y,merge_status
0,6302.0,Humboldt Bay Harbor Recreation & Conservation ...,DLA Locode,,,,,,,left_only
1,6330.0,Willow Creek Community Services District,DLA Locode,,,,,,,left_only
2,5036.0,Trinidad,DLA Locode,,,,,,,left_only
3,5049.0,Ukiah,DLA Locode,,,,,,,left_only
4,5082.0,Willits,DLA Locode,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...
1784,,,,,Willits Seniors Inc,,"Willits Paratransit,Willits Harrah Senior Center",,In Airtable,right_only
1785,,,,,Work Training Center,,,471.0,In Airtable,right_only
1786,,,,,Worldpay,,,,In Airtable,right_only
1787,,,,,Yellow Cab CA,,"Aliso Viejo Senior Mobility Program,Garden Gro...",,In Airtable,right_only


In [25]:
# DLA rows that found no partner on `name`, cut back to the DLA-side columns
# so they can be matched against a different m1 column.
cw_nomatch_dla = cw1.loc[
    cw1['merge_status'] == 'left_only',
    ['ct_code', 'dla_name', 'data_origin_x'],
]


In [26]:
# Keep only the rows matched on name. `.copy()` makes the subset an independent
# frame, so relabelling it no longer raises SettingWithCopyWarning.
cw1 = cw1.loc[cw1['merge_status'] == 'both'].copy()
# The merge indicator is a categorical column; cast it to plain strings before
# relabelling (replacing values on a categorical is deprecated in recent pandas)
# and assign the result back instead of using a chained `inplace=True`.
cw1['merge_status'] = cw1['merge_status'].astype(str).replace({"both": "DLA match on name"})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cw1.merge_status.replace({"both":"DLA match on name"}, inplace=True)


In [27]:
cw1.sample()

Unnamed: 0,ct_code,dla_name,data_origin_x,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,data_origin_y,merge_status
642,5953.0,Los Angeles County,DLA Locode,,Los Angeles County,"LA DPW,LA County,Los Angeles DPW,Los Angeles C...","the Link Florence-Firestone/Walnut Park,the Li...",179.0,In Airtable,DLA match on name


##### second on `doing_business_as`

In [28]:
# Second pass: retry the still-unmatched DLA names against `doing_business_as`.
# NOTE(review): displayed samples show `doing_business_as` can hold several
# comma-separated aliases in one cell; an exact-equality join cannot match
# those rows. Consider splitting the aliases first -- TODO confirm.
cw2 = cw_nomatch_dla.merge(
    m1,
    how='outer',
    left_on='dla_name',
    right_on='doing_business_as',
    indicator='merge_status',
)

In [29]:
cw2.merge_status.value_counts()


left_only     932
right_only    849
both            4
Name: merge_status, dtype: int64

In [30]:
# Rows matched on `doing_business_as`. `.copy()` detaches the subset from cw2 so
# the assignment below does not raise SettingWithCopyWarning.
add1 = cw2.loc[cw2['merge_status'] == 'both'].copy()
# Cast the categorical merge indicator to strings before relabelling; replacing
# values on a categorical column is deprecated in recent pandas.
add1['merge_status'] = add1['merge_status'].astype(str).replace({"both":"DLA match on doing business as"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  add1['merge_status']= add1['merge_status'].replace({"both":"DLA match on doing business as"})


In [31]:
add1

Unnamed: 0,ct_code,dla_name,data_origin_x,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,data_origin,merge_status
237,6039.0,Petaluma Transit,DLA Locode,90213,City of Petaluma,Petaluma Transit,"Petaluma Transit,Petaluma Paratransit",247.0,"In NTD, Airtable",DLA match on doing business as
289,6331.0,Sonoma County Transit,DLA Locode,90089,County of Sonoma,Sonoma County Transit,"Sonoma County Transit,Healdsburg Shuttle,Sonom...",314.0,"In NTD, Airtable",DLA match on doing business as
291,6341.0,Tri Delta Transit,DLA Locode,90162,The Eastern Contra Costa Transit Authority,Tri Delta Transit,,336.0,"In NTD, Airtable",DLA match on doing business as
490,6008.0,Fresno Area Express,DLA Locode,90027,City of Fresno,Fresno Area Express,"Fresno Area Express,FAX Handy Ride",116.0,"In NTD, Airtable",DLA match on doing business as


In [32]:
# DLA rows still unmatched after the `doing_business_as` pass, reduced to the
# DLA-side columns for the next matching attempt.
cw_nomatch_dla_2 = cw2.loc[
    cw2['merge_status'] == 'left_only',
    ['ct_code', 'dla_name', 'data_origin_x'],
]

##### third on `mobility_services_operated`

In [33]:
# Third pass: retry the remaining DLA names against `mobility_services_operated`.
# NOTE(review): displayed samples show this column often lists several
# comma-separated services per agency; only cells holding exactly one service
# name can match here -- TODO confirm whether splitting the list is wanted.
cw3 = cw_nomatch_dla_2.merge(
    m1,
    how='outer',
    left_on='dla_name',
    right_on='mobility_services_operated',
    indicator='merge_status',
)


In [34]:
cw3.merge_status.value_counts()


left_only     931
right_only    852
both            1
Name: merge_status, dtype: int64

In [35]:
# Rows matched on `mobility_services_operated`. Taking an explicit copy makes
# add2 an independent frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
add2 = cw3.loc[cw3['merge_status'] == 'both'].copy()

In [36]:
# Relabel the match type. `assign` builds a new frame rather than writing into
# a filtered slice (the source of the SettingWithCopyWarning), and the
# categorical merge indicator is cast to strings first because replacing values
# on a categorical column is deprecated in recent pandas.
add2 = add2.assign(
    merge_status=add2['merge_status'].astype(str).replace({"both":"DLA match on mobility services operated"})
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  add2['merge_status']= add2['merge_status'].replace({"both":"DLA match on mobility services operated"})


In [37]:
add2

Unnamed: 0,ct_code,dla_name,data_origin_x,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,data_origin,merge_status
301,6295.0,Capitol Corridor,DLA Locode,,Capitol Corridor Joint Powers Authority,,Capitol Corridor,56.0,In Airtable,DLA match on mobility services operated


In [38]:
# Everything left unmatched after the third pass, from either side of the merge.
# An explicit copy makes nomatch independent of cw3, so later column
# assignments on it do not raise SettingWithCopyWarning.
nomatch = cw3.loc[cw3['merge_status'] != 'both'].copy()

In [39]:
# Collapse both one-sided indicator values into a single "No Match" label.
# `assign` returns a new frame instead of writing into a filtered slice (the
# source of the SettingWithCopyWarning). The categorical indicator is cast to
# strings first: two categories cannot be renamed to the same label, and
# replacing values on a categorical column is deprecated in recent pandas.
nomatch = nomatch.assign(
    merge_status=nomatch['merge_status'].astype(str).replace({"left_only":"No Match", "right_only":"No Match"})
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomatch['merge_status']= nomatch['merge_status'].replace({"left_only":"No Match", "right_only":"No Match"})


In [40]:
nomatch

Unnamed: 0,ct_code,dla_name,data_origin_x,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,data_origin,merge_status
0,6302.0,Humboldt Bay Harbor Recreation & Conservation ...,DLA Locode,,,,,,,No Match
1,6330.0,Willow Creek Community Services District,DLA Locode,,,,,,,No Match
2,5036.0,Trinidad,DLA Locode,,,,,,,No Match
3,5049.0,Ukiah,DLA Locode,,,,,,,No Match
4,5082.0,Willits,DLA Locode,,,,,,,No Match
...,...,...,...,...,...,...,...,...,...,...
1779,,,,,Vivalon Inc,,Golden Gate Transit Paratransit,,In Airtable,No Match
1780,,,,,Volunteer Transportation Center Inc,,Volunteers on the Go,,In Airtable,No Match
1781,,,,,West Berkeley Transportation Management Agency,,West Berkeley Shuttle,365.0,In Airtable,No Match
1782,,,,,Willits Seniors Inc,,"Willits Paratransit,Willits Harrah Senior Center",,In Airtable,No Match


### Concatenating all matches

In [41]:
# Stack the three rounds of matches: name, doing-business-as, mobility service.
# cw1 carries its m1 provenance in `data_origin_y`, while add1/add2 carry it in
# `data_origin`, so the stacked frame has both columns, each partly empty.
cw = pd.concat(
    [cw1, add1],
    sort=False,
    ignore_index=True,
)
cw_match = pd.concat(
    [cw, add2],
    sort=False,
    ignore_index=True,
)

In [42]:
cw_match.sample(5)

Unnamed: 0,ct_code,dla_name,data_origin_x,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,data_origin_y,merge_status,data_origin
79,6165.0,Riverside Transit Agency,DLA Locode,90031.0,Riverside Transit Agency,,"Riverside Transit,Riverside Transit Dial-A-Ride",269.0,"In NTD, Airtable",DLA match on name,
32,6497.0,Solano County Transit,DLA Locode,90232.0,Solano County Transit,,"Solano Express,SolTrans,SolTrans ADA Paratransit",310.0,"In NTD, Airtable",DLA match on name,
83,5948.0,Inyo County,DLA Locode,,Inyo County,,,,In Airtable,DLA match on name,
1,6140.0,Mendocino Council of Governments,DLA Locode,,Mendocino Council of Governments,,,,In Airtable,DLA match on name,
16,6129.0,Colusa County Transportation Commission,DLA Locode,,Colusa County Transportation Commission,,,,In Airtable,DLA match on name,


In [43]:
# Merge the two provenance columns into `data_origin_y`. The result is assigned
# back explicitly, because a chained `fillna(..., inplace=True)` on a column
# accessed through the frame is deprecated and does not update the frame under
# pandas Copy-on-Write.
cw_match['data_origin_y'] = cw_match['data_origin_y'].fillna(cw_match['data_origin'])

In [44]:
# `data_origin` has been folded into `data_origin_y`; remove the leftover column.
cw_match = cw_match.drop('data_origin', axis='columns')

In [45]:
cw_match.data_origin_y.value_counts()

In Airtable         57
In NTD, Airtable    53
In NTD               1
Name: data_origin_y, dtype: int64

In [46]:
# Every row in cw_match also exists in DLA, so extend the NTD/Airtable
# provenance label accordingly.
cw_match = cw_match.assign(
    data_origin=lambda df: df['data_origin_y'].astype(str) + ' and DLA'
)

In [47]:
# The per-source provenance columns are superseded by the combined `data_origin`.
cw_match = cw_match.drop(columns=['data_origin_x', 'data_origin_y'])

In [48]:
cw_match.sample(5)

Unnamed: 0,ct_code,dla_name,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,merge_status,data_origin
62,6094.0,Tulare County Association of Governments,,Tulare County Association of Governments,,,,DLA match on name,In Airtable and DLA
71,5953.0,Los Angeles County,,Los Angeles County,"LA DPW,LA County,Los Angeles DPW,Los Angeles C...","the Link Florence-Firestone/Walnut Park,the Li...",179.0,DLA match on name,In Airtable and DLA
35,6000.0,San Francisco Bay Area Rapid Transit District,90003,San Francisco Bay Area Rapid Transit District,,Bay Area Rapid Transit,279.0,DLA match on name,"In NTD, Airtable and DLA"
59,6285.0,Kern Regional Transit,9R02-91059,Kern Regional Transit,,"Kern Transit,Mojave Dial-A-Ride,Rosamonde Dial...",146.0,DLA match on name,"In NTD, Airtable and DLA"
54,5949.0,San Luis Obispo County,,San Luis Obispo County,,,,DLA match on name,In Airtable and DLA


In [68]:
#cw_match.to_csv(f"{GCS_FILE_PATH}dla_ntd_itp_matches.csv")

In [50]:
# Append the unmatched rows to the matched ones so every record is represented.
cw_all = pd.concat(
    [cw_match, nomatch],
    sort=False,
    ignore_index=True,
)

In [51]:
cw_all.sample(5)

Unnamed: 0,ct_code,dla_name,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,merge_status,data_origin,data_origin_x
405,6257.0,Crockett Communuty Foundation,,,,,,No Match,,DLA Locode
1748,,,,Ecolane,,"RTD Van Go!,Cruz On-demand",,No Match,In Airtable,
737,6438.0,Koreatown Youth and Community Center,,,,,,No Match,,DLA Locode
1255,,,,Cloverdale Transit,,,,No Match,In Airtable,
566,5044.0,Visalia,,,,,,No Match,,DLA Locode


In [52]:
cw_all.data_origin.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1894 entries, 0 to 1893
Series name: data_origin
Non-Null Count  Dtype 
--------------  ----- 
963 non-null    object
dtypes: object(1)
memory usage: 14.9+ KB


In [53]:
# Unmatched DLA rows carry their provenance in `data_origin_x` and their agency
# name in `dla_name`; fold both into the shared columns. Results are assigned
# back explicitly: chained `fillna(..., inplace=True)` through attribute access
# is deprecated and does not update the frame under pandas Copy-on-Write.
# Bracket access also avoids relying on `cw_all.name` resolving to a column.
cw_all['data_origin'] = cw_all['data_origin'].fillna(cw_all['data_origin_x'])
cw_all['name'] = cw_all['name'].fillna(cw_all['dla_name'])
# The source columns are now redundant.
cw_all = cw_all.drop(columns=['data_origin_x', 'dla_name'])


In [54]:
cw_all.data_origin.value_counts()

DLA Locode                  931
In Airtable                 615
In NTD, Airtable            209
In Airtable and DLA          57
In NTD, Airtable and DLA     53
In NTD                       28
In NTD and DLA                1
Name: data_origin, dtype: int64

In [55]:
# Check: list rows whose data_origin is still missing after the backfill above;
# an empty result means every record has a provenance label.
cw_all>>filter(_.data_origin.isnull())

Unnamed: 0,ct_code,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,merge_status,data_origin


In [56]:
cw_all.sample(5)

Unnamed: 0,ct_code,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,merge_status,data_origin
524,5146.0,,Coalinga,,,,No Match,DLA Locode
903,6468.0,,San Elijo Lagoon Conservancy,,,,No Match,DLA Locode
558,6265.0,,Kern County Parks and Recreation Department,,,,No Match,DLA Locode
345,6057.0,,Port Of Oakland,,,,No Match,DLA Locode
957,5410.0,,Irvine,,,,No Match,DLA Locode


In [57]:
cw_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1894 entries, 0 to 1893
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ct_code                     1042 non-null   float64
 1   ntd_id                      298 non-null    object 
 2   name                        1894 non-null   object 
 3   doing_business_as           99 non-null     object 
 4   mobility_services_operated  497 non-null    object 
 5   itp_id                      433 non-null    float64
 6   merge_status                1894 non-null   object 
 7   data_origin                 1894 non-null   object 
dtypes: float64(2), object(6)
memory usage: 118.5+ KB


In [58]:
# Inspect the rows that have a value in mobility_services_operated.
cw_all>>filter(_.mobility_services_operated.notnull())

Unnamed: 0,ct_code,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,merge_status,data_origin
0,6241.0,9R02-91047,Mendocino Transit Authority,,"Mendocino Transit Authority,Ukiah Dial-A-Ride,...",198.0,DLA match on name,"In NTD, Airtable and DLA"
3,6428.0,9R02-91097,Redwood Coast Transit Authority,,"Redwood Coast Transit,Redwood Coast Dial-A-Ride",261.0,DLA match on name,"In NTD, Airtable and DLA"
4,6436.0,9R02-91053,Lake Transit Authority,,"Lake Transit,Clearlake/Lower Lake Dial-A-Ride,...",159.0,DLA match on name,"In NTD, Airtable and DLA"
5,6162.0,9R02-91036,Humboldt Transit Authority,,"Humboldt Transit Authority Dial-A-Ride,Redwood...",135.0,DLA match on name,"In NTD, Airtable and DLA"
6,5908.0,9R02-91089,Tehama County,,"Tehama Rural Area eXpress,ParaTRAX",334.0,DLA match on name,"In NTD, Airtable and DLA"
...,...,...,...,...,...,...,...,...
1889,,,Vivalon Inc,,Golden Gate Transit Paratransit,,No Match,In Airtable
1890,,,Volunteer Transportation Center Inc,,Volunteers on the Go,,No Match,In Airtable
1891,,,West Berkeley Transportation Management Agency,,West Berkeley Shuttle,365.0,No Match,In Airtable
1892,,,Willits Seniors Inc,,"Willits Paratransit,Willits Harrah Senior Center",,No Match,In Airtable


In [66]:
cw_all.data_origin.value_counts()

DLA Locode                  931
In Airtable                 615
In NTD, Airtable            209
In Airtable and DLA          57
In NTD, Airtable and DLA     53
In NTD                       28
In NTD and DLA                1
Name: data_origin, dtype: int64

In [59]:
cw_all.merge_status.value_counts()

No Match                                   1783
DLA match on name                           106
DLA match on doing business as                4
DLA match on mobility services operated       1
Name: merge_status, dtype: int64

In [60]:
# Inspect the rows whose DLA match came from the mobility_services_operated pass.
cw_all>>filter(_.merge_status=='DLA match on mobility services operated')

Unnamed: 0,ct_code,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,merge_status,data_origin
110,6295.0,,Capitol Corridor Joint Powers Authority,,Capitol Corridor,56.0,DLA match on mobility services operated,In Airtable and DLA


In [61]:
# Inspect the rows whose DLA match came from the doing_business_as pass
# (restricted to rows where doing_business_as is populated).
cw_all>>filter(_.doing_business_as.notnull())>>filter(_.merge_status=='DLA match on doing business as')

Unnamed: 0,ct_code,ntd_id,name,doing_business_as,mobility_services_operated,itp_id,merge_status,data_origin
106,6039.0,90213,City of Petaluma,Petaluma Transit,"Petaluma Transit,Petaluma Paratransit",247.0,DLA match on doing business as,"In NTD, Airtable and DLA"
107,6331.0,90089,County of Sonoma,Sonoma County Transit,"Sonoma County Transit,Healdsburg Shuttle,Sonom...",314.0,DLA match on doing business as,"In NTD, Airtable and DLA"
108,6341.0,90162,The Eastern Contra Costa Transit Authority,Tri Delta Transit,,336.0,DLA match on doing business as,"In NTD, Airtable and DLA"
109,6008.0,90027,City of Fresno,Fresno Area Express,"Fresno Area Express,FAX Handy Ride",116.0,DLA match on doing business as,"In NTD, Airtable and DLA"


In [62]:
# Final crosswalk layout: the three identifiers first, then the agency name
# and the match/provenance labels. The alias and service columns are dropped.
cw_all = cw_all.loc[
    :,
    ['ct_code', 'ntd_id', 'itp_id', 'name', 'merge_status', 'data_origin'],
]

In [63]:
cw_all

Unnamed: 0,ct_code,ntd_id,itp_id,name,merge_status,data_origin
0,6241.0,9R02-91047,198.0,Mendocino Transit Authority,DLA match on name,"In NTD, Airtable and DLA"
1,6140.0,,,Mendocino Council of Governments,DLA match on name,In Airtable and DLA
2,6133.0,,,Humboldt County Association of Governments,DLA match on name,In Airtable and DLA
3,6428.0,9R02-91097,261.0,Redwood Coast Transit Authority,DLA match on name,"In NTD, Airtable and DLA"
4,6436.0,9R02-91053,159.0,Lake Transit Authority,DLA match on name,"In NTD, Airtable and DLA"
...,...,...,...,...,...,...
1889,,,,Vivalon Inc,No Match,In Airtable
1890,,,,Volunteer Transportation Center Inc,No Match,In Airtable
1891,,,365.0,West Berkeley Transportation Management Agency,No Match,In Airtable
1892,,,,Willits Seniors Inc,No Match,In Airtable


In [67]:
#cw_all.to_csv(f"{GCS_FILE_PATH}dla_ntd_itp_crosswalk.csv")