# Add DLA to TIRCP and Blackcat Organizations

In [1]:
import pandas as pd
from siuba import *
import numpy as np
from calitp_data_analysis.sql import *

## Read in Data & rename cols for easier merge

In [2]:
dla = pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx')

In [3]:
dla = to_snakecase(dla)

In [4]:
dt = pd.read_parquet('gs://calitp-analytics-data/data-analyses/grants/BlackCat_TIRCP_Dec_2022.parquet')

In [5]:
dla = dla>>select(_.agency_locode, _.agency_name, _.district)

In [6]:
## changing string format to match dt
dla['agency_name'] = dla['agency_name'].str.title()

In [7]:
dla>>filter(_.agency_name.str.contains('Alameda'))

Unnamed: 0,agency_locode,agency_name,district
196,6002,Alameda - Contra Costa Transit District,4
197,5014,Alameda,4
198,5933,Alameda County,4
199,6068,Alameda County Transportation Authority,4
221,6380,Alameda County Social Services Agency,4
228,6430,Alameda County Transportation Improvement Auth...,4
238,6480,Alameda County Transportation Commission,4
239,6481,Alameda County Waste Management Authority,4
329,6273,Alameda County Congestion Management Agency,4
330,6017,Alameda County Transit District,4


In [8]:
dla['agency_name'] = dla['agency_name'].str.replace( "- ", "" )

In [9]:
dla['agency_name'] = dla['agency_name'].str.replace( " -", " " )

In [10]:
dla['agency_name'] = dla['agency_name'].str.replace( "-", " " )

In [11]:
dla = dla.rename(columns={"agency_locode":"locode","agency_name":"name", "district":"caltrans_district"})

In [12]:
## look at DT

In [13]:
dt>>filter(_.BC_TIRCP_merge =='Found in both TIRCP and BlackCat')

Unnamed: 0,BlackCat_Orgs,TIRCP_Orgs,BC_TIRCP_merge,calitp_itp_id,caltrans_district
2,Fresno County Rural Transit Agency,Fresno County Rural Transit Agency,Found in both TIRCP and BlackCat,117.0,06 - Fresno
8,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,Found in both TIRCP and BlackCat,278.0,11 - San Diego
9,Humboldt Transit Authority,Humboldt Transit Authority,Found in both TIRCP and BlackCat,135.0,01 - Eureka
28,City Of Wasco,City Of Wasco,Found in both TIRCP and BlackCat,,District 6: Fresno / Bakersfield
45,Lake Transit Authority,Lake Transit Authority,Found in both TIRCP and BlackCat,159.0,01 - Eureka
47,Livermore Amador Valley Transit Authority,Livermore Amador Valley Transit Authority,Found in both TIRCP and BlackCat,,District 4: Bay Area / Oakland
60,San Joaquin Regional Transit District,San Joaquin Regional Transit District,Found in both TIRCP and BlackCat,284.0,10 - Stockton
62,San Mateo County Transit District,San Mateo County Transit District,Found in both TIRCP and BlackCat,,District 4: Bay Area / Oakland
63,Santa Clara Valley Transportation Authority,Santa Clara Valley Transportation Authority,Found in both TIRCP and BlackCat,294.0,04 - Oakland
71,Transit Joint Powers Authority Of Merced County,Transit Joint Powers Authority Of Merced County,Found in both TIRCP and BlackCat,,District 10: Stockton


## Create new col for Blackcat and TIRCP org names

In [14]:
dt['name']= np.NaN

In [15]:
dt.name.fillna(dt['BlackCat_Orgs'], inplace=True)
dt.name.fillna(dt['TIRCP_Orgs'], inplace=True)

In [16]:
##filter down cols

In [17]:
dt2 = dt>>select(_.name, _.BC_TIRCP_merge, _.caltrans_district)

In [18]:
dt2.sample()

Unnamed: 0,name,BC_TIRCP_merge,caltrans_district
35,County Of Shasta Department Of Public Works,Black Cat Only,


## First Merge

In [19]:
merge = pd.merge(dla, dt2, on="name", how="outer", indicator=True)

In [20]:
merge

Unnamed: 0,locode,name,caltrans_district_x,BC_TIRCP_merge,caltrans_district_y,_merge
0,6302.0,Humboldt Bay Harbor Recreation & Conservation ...,1.0,,,left_only
1,6330.0,Willow Creek Community Services District,1.0,,,left_only
2,5036.0,Trinidad,1.0,,,left_only
3,5049.0,Ukiah,1.0,,,left_only
4,5082.0,Willits,1.0,,,left_only
...,...,...,...,...,...,...
1230,,San Francisco Bay Area Water Emergency Transpo...,,TIRCP Only,District 4: Bay Area / Oakland,right_only
1231,,City Of Cupertino,,TIRCP Only,District 4: Bay Area / Oakland,right_only
1232,,City Of Glendale,,TIRCP Only,District 7: Los Angeles,right_only
1233,,City Of Oakland,,TIRCP Only,District 4: Bay Area / Oakland,right_only


In [21]:
merge._merge.value_counts()

left_only     988
right_only    194
both           53
Name: _merge, dtype: int64

In [22]:
merge>>filter(_._merge=='both')

Unnamed: 0,locode,name,caltrans_district_x,BC_TIRCP_merge,caltrans_district_y,_merge
11,6241.0,Mendocino Transit Authority,1.0,Black Cat Only,01 - Eureka,both
42,6428.0,Redwood Coast Transit Authority,1.0,Black Cat Only,,both
43,6436.0,Lake Transit Authority,1.0,Found in both TIRCP and BlackCat,01 - Eureka,both
48,6162.0,Humboldt Transit Authority,1.0,Found in both TIRCP and BlackCat,01 - Eureka,both
73,6147.0,Plumas County Transportation Commission,2.0,Black Cat Only,,both
84,6427.0,Lassen Transit Service Agency,2.0,Black Cat Only,02 - Redding,both
90,6496.0,Shasta Regional Transportation Agency,2.0,TIRCP Only,District 2:Redding,both
112,6225.0,El Dorado County Transit Authority,3.0,Black Cat Only,,both
125,6478.0,Tahoe Transportation District,3.0,Black Cat Only,,both
149,6195.0,Yolo County Transportation District,3.0,Black Cat Only,03 - Marysville,both


In [23]:
(merge>>filter(_._merge=='both')).BC_TIRCP_merge.value_counts()

Black Cat Only                      31
TIRCP Only                          12
Found in both TIRCP and BlackCat    10
Name: BC_TIRCP_merge, dtype: int64

In [24]:
(merge>>filter(_._merge=='right_only')).BC_TIRCP_merge.value_counts()

Black Cat Only                      166
TIRCP Only                           23
Found in both TIRCP and BlackCat      5
Name: BC_TIRCP_merge, dtype: int64

In [25]:
(merge>>filter(_._merge=='both')).BC_TIRCP_merge.value_counts()

Black Cat Only                      31
TIRCP Only                          12
Found in both TIRCP and BlackCat    10
Name: BC_TIRCP_merge, dtype: int64

In [26]:
(merge>>filter(_._merge!='both')).sample(5)

Unnamed: 0,locode,name,caltrans_district_x,BC_TIRCP_merge,caltrans_district_y,_merge
476,6189.0,Desert Tortoise Preserve Committee,6.0,,,left_only
756,5384.0,Desert Hot Springs,8.0,,,left_only
624,5283.0,Palos Verdes Est,7.0,,,left_only
993,6383.0,Council For Tribal Employment Rights,53.0,,,left_only
13,6140.0,Mendocino Council Of Governments,1.0,,,left_only


In [27]:
match = merge[(merge['_merge'] == 'both')]

In [28]:
nomatch = merge[(merge['_merge'] == 'right_only')]

In [29]:
nomatch_total = merge[(merge['_merge'] != 'both')]

## Second Merge with other versions of same name 
* we have cities that have names entered as "City of X" and "X"

In [30]:
dla2 = pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/lst_Agencies_20211208.xlsx')

In [31]:
dla2 = dla2>>select(_.AgencyID,
                    #_.Agency, 
                    _.Agency2, _.CT_Districts) 

In [32]:
dla2= dla2.rename(columns={"AgencyID":"locode",
    "CT_Districts":"caltrans_district"})

In [33]:
dla2['Agency2'] = dla2['Agency2'].str.title()
#dla2['Agency'] = dla2['Agency'].str.title()

In [34]:
dla2.sample()

Unnamed: 0,locode,Agency2,caltrans_district
24,115,,


In [35]:
#using nomatches from leftover first merge

In [36]:
nomatch = nomatch>>select(_.name, _.BC_TIRCP_merge, _.caltrans_district_y)

In [37]:
nomatch.sample()

Unnamed: 0,name,BC_TIRCP_merge,caltrans_district_y
1056,City Of Mcfarland,Black Cat Only,


In [38]:
#merge2_1 = pd.merge(dla2, nomatch, left_on='Agency', right_on="name", how="outer", indicator='second_merge')

In [39]:
#merge2_1.second_merge.value_counts()

In [40]:
#merge2_1>>filter(_.second_merge=='both')

In [41]:
merge2_2 = pd.merge(dla2, nomatch, left_on='Agency2', right_on="name", how="outer", indicator='second_merge')

In [42]:
merge2_2.second_merge.value_counts()

left_only     930
right_only    147
both           47
Name: second_merge, dtype: int64

In [43]:
merge2_match = merge2_2>>filter(_.second_merge=='both')

In [44]:
#merge2_match = pd.concat([merge2_match, (merge2_1>>filter(_.second_merge=='both'))])


In [45]:
merge2_match>>arrange(_.Agency2)

Unnamed: 0,locode,Agency2,caltrans_district,name,BC_TIRCP_merge,caltrans_district_y,second_merge
976,M016,Anaheim Transportation Network,12,Anaheim Transportation Network,TIRCP Only,District 12: Orange County,both
327,998,Capitol Corridor Joint Powers Authority,"03, 04",Capitol Corridor Joint Powers Authority,TIRCP Only,District 4: Bay Area / Oakland,both
15,5021,City Of Arcata,1,City Of Arcata,Black Cat Only,,both
654,5370,City Of Arvin,6,City Of Arvin,Black Cat Only,,both
380,5077,City Of Auburn,3,City Of Auburn,Black Cat Only,,both
846,5399,City Of California City,9,City Of California City,Black Cat Only,,both
644,5258,City Of Chowchilla,6,City Of Chowchilla,Black Cat Only,,both
505,5318,City Of Cupertino,4,City Of Cupertino,TIRCP Only,District 4: Bay Area / Oakland,both
627,5143,City Of Dinuba,6,City Of Dinuba,Black Cat Only,,both
455,5056,City Of Dixon,4,City Of Dixon,Black Cat Only,,both


## Third Merge with CalSMART 

In [46]:
nomatch2 = (merge2_2>>filter(_.second_merge=='right_only')>>select(_.name,
                                                                _.BC_TIRCP_merge,
                                                                _.caltrans_district_y))

In [47]:
#reread
dla3 = pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/lst_Agencies_20211208.xlsx')

In [48]:
dla3 = dla3>>select(_.AgencyID, _.CalSMART_Agency, _.CT_Districts) 

In [49]:
dla3= dla3.rename(columns={"AgencyID":"locode",
    "CT_Districts":"caltrans_district"})

In [50]:
dla3.sample()

Unnamed: 0,locode,CalSMART_Agency,caltrans_district
259,5137,City of Richmond,4


In [51]:
dla3['CalSMART_Agency'] = dla3['CalSMART_Agency'].str.title()

In [52]:
## merging

In [53]:
merge3 = pd.merge(dla3, nomatch2, left_on='CalSMART_Agency', right_on="name", how="outer", indicator='third_merge')

In [54]:
merge3.third_merge.value_counts()

left_only     970
right_only    140
both            7
Name: third_merge, dtype: int64

In [55]:
merge3_match = merge3>>filter(_.third_merge=='both')

In [56]:
merge3>>filter(_.third_merge=='right_only')

Unnamed: 0,locode,CalSMART_Agency,caltrans_district,name,BC_TIRCP_merge,caltrans_district_y,third_merge
977,,,,Yuba-Sutter Transit Authority,Black Cat Only,03 - Marysville,right_only
978,,,,Butte County Association Of Governments Butte ...,Black Cat Only,,right_only
979,,,,Calaveras County Public Works,Black Cat Only,,right_only
980,,,,County Of Los Angeles - Department Of Public W...,Black Cat Only,,right_only
981,,,,County Of Mariposa,Black Cat Only,,right_only
...,...,...,...,...,...,...,...
1112,,,,Foothill Transit,TIRCP Only,District 7: Los Angeles,right_only
1113,,,,Bay Area Rapid Transit,TIRCP Only,District 4: Bay Area / Oakland,right_only
1114,,,,Long Beach Transit,TIRCP Only,District 7: Los Angeles,right_only
1115,,,,Santa Monica Big Blue Bus,TIRCP Only,District 7: Los Angeles,right_only


### fixing county names

In [57]:
nomatch3 = merge3>>filter(_.third_merge=='right_only')

In [58]:
nomatch3 = nomatch3>>select(_.name, _.BC_TIRCP_merge, _.caltrans_district_y)

In [59]:
nomatch_county_city = nomatch3[(nomatch3['name'].str.contains('County')) | (nomatch3['name'].str.contains('City'))]

In [60]:
nomatch_county_city.sample()

Unnamed: 0,name,BC_TIRCP_merge,caltrans_district_y
993,Alpine County Community Development,Black Cat Only,


In [61]:
df_copy = nomatch_county_city>>select(_['name'])

In [62]:
df_copy['str_len'] = df_copy['name'].str.split().str.len()

In [63]:
df_copy_fix = df_copy>>filter(_.str_len<=3)

In [64]:
df_copy_fix[['name_pt1', 'name_pt2']] = df_copy_fix['name'].str.split(' Of ', 1, expand=True)


  df_copy_fix[['name_pt1', 'name_pt2']] = df_copy_fix['name'].str.split(' Of ', 1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy_fix[['name_pt1', 'name_pt2']] = df_copy_fix['name'].str.split(' Of ', 1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy_fix[['name_pt1', 'name_pt2']] = df_copy_fix['name'].str.split(' Of ', 1, expand=True)


In [65]:
df_copy_fix['new_name'] = df_copy_fix['name_pt2'] + ' ' + df_copy_fix['name_pt1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy_fix['new_name'] = df_copy_fix['name_pt2'] + ' ' + df_copy_fix['name_pt1']


In [66]:
new_name_mapping = (dict(df_copy_fix[['name', 'new_name']].values))
    
nomatch_county_city['new_name'] = nomatch_county_city['name'].map(new_name_mapping)

nomatch_county_city['new_name'] = nomatch_county_city['new_name'].fillna(nomatch_county_city['name'])
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomatch_county_city['new_name'] = nomatch_county_city['name'].map(new_name_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomatch_county_city['new_name'] = nomatch_county_city['new_name'].fillna(nomatch_county_city['name'])


In [67]:
new_name_mapping

{'County Of Mariposa': 'Mariposa County',
 'County Of Siskiyou': 'Siskiyou County',
 'County Of Tulare': 'Tulare County',
 'County Of Sonoma': 'Sonoma County',
 'County Of Ventura': 'Ventura County'}

In [68]:
nomatch_county_city

Unnamed: 0,name,BC_TIRCP_merge,caltrans_district_y,new_name
978,Butte County Association Of Governments Butte ...,Black Cat Only,,Butte County Association Of Governments Butte ...
979,Calaveras County Public Works,Black Cat Only,,Calaveras County Public Works
980,County Of Los Angeles - Department Of Public W...,Black Cat Only,,County Of Los Angeles - Department Of Public W...
981,County Of Mariposa,Black Cat Only,,Mariposa County
982,County Of Nevada Public Works,Black Cat Only,,County Of Nevada Public Works
983,County Of Sacramento Department Of Transportation,Black Cat Only,,County Of Sacramento Department Of Transportation
984,County Of Shasta Department Of Public Works,Black Cat Only,,County Of Shasta Department Of Public Works
985,County Of Siskiyou,Black Cat Only,,Siskiyou County
986,County Of Tulare,Black Cat Only,,Tulare County
988,Placer County Public Works,Black Cat Only,,Placer County Public Works


In [69]:
nomatch_county_city = nomatch_county_city>>select(_.new_name, _.BC_TIRCP_merge, _.caltrans_district_y)

In [70]:
merge4 = pd.merge(dla2, nomatch_county_city, left_on='Agency2', right_on="new_name", how="outer", indicator='fourth_merge')

In [71]:
merge4.fourth_merge.value_counts()

left_only     972
right_only     20
both            5
Name: fourth_merge, dtype: int64

In [72]:
merge4_match = merge4>>filter(_.fourth_merge=='both')

## Concating Merge matches

In [73]:
matches = pd.concat([match, merge2_match, merge3_match, merge4_match])

In [74]:
matches

Unnamed: 0,locode,name,caltrans_district_x,BC_TIRCP_merge,caltrans_district_y,_merge,Agency2,caltrans_district,second_merge,CalSMART_Agency,third_merge,new_name,fourth_merge
11,6241.0,Mendocino Transit Authority,1.0,Black Cat Only,01 - Eureka,both,,,,,,,
42,6428.0,Redwood Coast Transit Authority,1.0,Black Cat Only,,both,,,,,,,
43,6436.0,Lake Transit Authority,1.0,Found in both TIRCP and BlackCat,01 - Eureka,both,,,,,,,
48,6162.0,Humboldt Transit Authority,1.0,Found in both TIRCP and BlackCat,01 - Eureka,both,,,,,,,
73,6147.0,Plumas County Transportation Commission,2.0,Black Cat Only,,both,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,5902,,,Black Cat Only,,,Siskiyou County,2,,,,Siskiyou County,both
534,5920,,,Black Cat Only,,,Sonoma County,4,,,,Sonoma County,both
659,5946,,,Black Cat Only,,,Tulare County,6,,,,Tulare County,both
767,5952,,,Black Cat Only,,,Ventura County,7,,,,Ventura County,both


In [75]:
matches._merge.fillna(matches['second_merge'], inplace=True)
matches._merge.fillna(matches['third_merge'], inplace=True)
matches._merge.fillna(matches['fourth_merge'], inplace=True)


In [76]:
matches.name.fillna(matches['new_name'], inplace=True)

In [77]:
matches.caltrans_district.fillna(matches['caltrans_district_x'], inplace=True)
matches.caltrans_district.fillna(matches['caltrans_district_y'], inplace=True)

In [78]:
matches = matches.drop(columns={'caltrans_district_x', 'caltrans_district_y', 'Agency2', 'second_merge', 'CalSMART_Agency', 'third_merge', 'fourth_merge', 'new_name'})

In [79]:
matches._merge.value_counts()

both          112
left_only       0
right_only      0
Name: _merge, dtype: int64

In [80]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112 entries, 11 to 890
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   locode             112 non-null    object  
 1   name               112 non-null    object  
 2   BC_TIRCP_merge     112 non-null    object  
 3   _merge             112 non-null    category
 4   caltrans_district  109 non-null    object  
dtypes: category(1), object(4)
memory usage: 4.6+ KB


In [81]:
matches

Unnamed: 0,locode,name,BC_TIRCP_merge,_merge,caltrans_district
11,6241.0,Mendocino Transit Authority,Black Cat Only,both,1.0
42,6428.0,Redwood Coast Transit Authority,Black Cat Only,both,1.0
43,6436.0,Lake Transit Authority,Found in both TIRCP and BlackCat,both,1.0
48,6162.0,Humboldt Transit Authority,Found in both TIRCP and BlackCat,both,1.0
73,6147.0,Plumas County Transportation Commission,Black Cat Only,both,2.0
...,...,...,...,...,...
355,5902,Siskiyou County,Black Cat Only,both,2
534,5920,Sonoma County,Black Cat Only,both,4
659,5946,Tulare County,Black Cat Only,both,6
767,5952,Ventura County,Black Cat Only,both,7


## Concat with non-matches

In [82]:
##from first full merge
nomatch_total.sample()

Unnamed: 0,locode,name,caltrans_district_x,BC_TIRCP_merge,caltrans_district_y,_merge
459,5416.0,Marina,5.0,,,left_only


In [83]:
## add in the few name changes we used to get matches to the non_match_total so we dont have the originals in the final dataframe

##mapping to new column
nomatch_total['new_name'] = nomatch_total['name'].map(new_name_mapping)
## fill names from other column
nomatch_total['new_name'] = nomatch_total['new_name'].fillna(nomatch_total['name'])

## rename newname col and delete columns
nomatch_total = nomatch_total.drop(columns={'name'})
nomatch_total = nomatch_total.rename(columns= {'new_name':'name'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomatch_total['new_name'] = nomatch_total['name'].map(new_name_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomatch_total['new_name'] = nomatch_total['new_name'].fillna(nomatch_total['name'])


In [84]:
#concattt
test = pd.concat([nomatch_total, matches])

In [85]:
test>>arrange(_.name)

Unnamed: 0,locode,caltrans_district_x,BC_TIRCP_merge,caltrans_district_y,_merge,name,caltrans_district
1133,,,Black Cat Only,,right_only,Able Industries,
89,736,,Black Cat Only,,both,Able Industries,
563,6312.0,7.0,,,left_only,Access Services,
765,5408.0,8.0,,,left_only,Adelanto,
680,5435.0,7.0,,,left_only,Agoura Hills,
...,...,...,...,...,...,...,...
150,6224.0,3.0,,,left_only,Yuba Sutter Transit Authority,
1042,,,Black Cat Only,03 - Marysville,right_only,Yuba-Sutter Transit Authority,
744,5457.0,8.0,,,left_only,Yucaipa,
745,5466.0,8.0,,,left_only,Yucca Valley,


In [86]:
len(test)

1294

In [87]:
test.name.duplicated().sum()

79

In [88]:
## get one district col instead of three
test.caltrans_district.fillna(test['caltrans_district_x'], inplace=True)
test.caltrans_district.fillna(test['caltrans_district_y'], inplace=True)

In [89]:
test = test.drop(columns={'caltrans_district_x', 'caltrans_district_y'})

### Find and remove duplicates

In [90]:
duplicates= (test>>group_by(_.name)>>count(_.name)).rename(columns={"n":"n_name"})

In [91]:
test2 = pd.merge(test, duplicates, on='name')

In [92]:
## looking at the Caltrans entries
## we dont want to drop caltrans becasue opf the districts, but cannot group by district yet
(test2>>filter(_.n_name>1)>>arrange(_.name)).iloc[0:40]

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name
1117,,Black Cat Only,right_only,Able Industries,,2
1118,736,Black Cat Only,both,Able Industries,,2
1217,,TIRCP Only,right_only,Anaheim Transportation Network,District 12: Orange County,2
1218,M016,TIRCP Only,both,Anaheim Transportation Network,12,2
20,6201.0,,left_only,Caltrans,1.0,13
21,6202.0,,left_only,Caltrans,2.0,13
22,6203.0,,left_only,Caltrans,3.0,13
23,6204.0,,left_only,Caltrans,4.0,13
24,6205.0,,left_only,Caltrans,5.0,13
25,6206.0,,left_only,Caltrans,6.0,13


In [93]:
def remove_duplicates(df):
    if (df['name'] == 'Caltrans'):
        return "keep"
    elif (df['name'] != 'Caltrans') & (df['n_name'] > 1) & (df['_merge']=='both'):
        return "keep"
    elif (df['name'] != 'Caltrans') & (df['n_name'] > 1) & (df['_merge']!='both'):
        return "drop"
    elif (df['name'] != 'Caltrans') & (df['n_name'] == 1):
        return "keep"
        

In [94]:
test2['keep_or_drop'] = test2.apply(remove_duplicates, axis = 1)

In [95]:
(test2>>filter(_.n_name>1)>>arrange(_.name)).iloc[0:40]

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop
1117,,Black Cat Only,right_only,Able Industries,,2,drop
1118,736,Black Cat Only,both,Able Industries,,2,keep
1217,,TIRCP Only,right_only,Anaheim Transportation Network,District 12: Orange County,2,drop
1218,M016,TIRCP Only,both,Anaheim Transportation Network,12,2,keep
20,6201.0,,left_only,Caltrans,1.0,13,keep
21,6202.0,,left_only,Caltrans,2.0,13,keep
22,6203.0,,left_only,Caltrans,3.0,13,keep
23,6204.0,,left_only,Caltrans,4.0,13,keep
24,6205.0,,left_only,Caltrans,5.0,13,keep
25,6206.0,,left_only,Caltrans,6.0,13,keep


In [96]:
finaldf = test2>>filter(_.keep_or_drop=='keep')

In [97]:
duplicates2= ((finaldf>>group_by(_.locode)>>count(_.locode))>>filter(_.n.notnull())).rename(columns={"n":"n_locode"})

In [98]:
finaldf2 = pd.merge(finaldf, duplicates2, on='locode')

In [99]:
(finaldf2>>filter(_.n_locode==2))>>arrange(_.locode)

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop,n_locode
637,5006.0,,left_only,Los Angeles,7.0,1,keep,2
638,5006,TIRCP Only,both,City Of Los Angeles,7,2,keep,2
273,5012.0,,left_only,Oakland,4.0,1,keep,2
274,5012,TIRCP Only,both,City Of Oakland,4,2,keep,2
18,5021.0,,left_only,Arcata,1.0,1,keep,2
...,...,...,...,...,...,...,...,...
862,6305,Black Cat Only,both,Yosemite Area Regional Transportation System,"06, 09, 10",2,keep,2
863,6308.0,,left_only,Transit Joint Powers Authority For Merced County,10.0,1,keep,2
864,6308,Found in both TIRCP and BlackCat,both,Transit Joint Powers Authority Of Merced County,10,2,keep,2
320,6365.0,,left_only,San Francisco Bay Area Water Transit Authority,4.0,1,keep,2


In [100]:
def remove_duplicates2(df):
    if (df['n_locode'] != 2):
        return "keep"
    elif (df['n_locode'] == 2) & (df['_merge']=='both'):
        return "keep"
    elif (df['n_locode'] == 2) & (df['_merge'] == 'left_only'):
        return "drop"
        

In [101]:
finaldf2['keep_or_drop2'] = finaldf2.apply(remove_duplicates2, axis = 1)

In [102]:
finaldf2>>filter(_.n_locode==2)>>arrange(_.locode)

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop,n_locode,keep_or_drop2
637,5006.0,,left_only,Los Angeles,7.0,1,keep,2,drop
638,5006,TIRCP Only,both,City Of Los Angeles,7,2,keep,2,keep
273,5012.0,,left_only,Oakland,4.0,1,keep,2,drop
274,5012,TIRCP Only,both,City Of Oakland,4,2,keep,2,keep
18,5021.0,,left_only,Arcata,1.0,1,keep,2,drop
...,...,...,...,...,...,...,...,...,...
862,6305,Black Cat Only,both,Yosemite Area Regional Transportation System,"06, 09, 10",2,keep,2,keep
863,6308.0,,left_only,Transit Joint Powers Authority For Merced County,10.0,1,keep,2,drop
864,6308,Found in both TIRCP and BlackCat,both,Transit Joint Powers Authority Of Merced County,10,2,keep,2,keep
320,6365.0,,left_only,San Francisco Bay Area Water Transit Authority,4.0,1,keep,2,drop


In [103]:
finaldf3 = (finaldf2>>filter(_.keep_or_drop2=='keep'))

In [104]:
finaldf3

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop,n_locode,keep_or_drop2
0,6302.0,,left_only,Humboldt Bay Harbor Recreation & Conservation ...,1.0,1,keep,1,keep
1,6330.0,,left_only,Willow Creek Community Services District,1.0,1,keep,1,keep
2,5036.0,,left_only,Trinidad,1.0,1,keep,1,keep
3,5049.0,,left_only,Ukiah,1.0,1,keep,1,keep
4,5082.0,,left_only,Willits,1.0,1,keep,1,keep
...,...,...,...,...,...,...,...,...,...
1219,6376.0,Black Cat Only,both,North County Transit District,11.0,1,keep,1,keep
1220,6471.0,Black Cat Only,both,Imperial County Transportation Commission,11.0,1,keep,1,keep
1221,6066.0,TIRCP Only,both,San Diego Association Of Governments,11.0,1,keep,1,keep
1222,6071.0,TIRCP Only,both,Orange County Transportation Authority,12.0,1,keep,1,keep


In [105]:
finaldf3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1179 entries, 0 to 1223
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   locode             1046 non-null   object  
 1   BC_TIRCP_merge     245 non-null    object  
 2   _merge             1179 non-null   category
 3   name               1179 non-null   object  
 4   caltrans_district  1057 non-null   object  
 5   n_name             1179 non-null   int64   
 6   keep_or_drop       1179 non-null   object  
 7   n_locode           1179 non-null   int64   
 8   keep_or_drop2      1179 non-null   object  
dtypes: category(1), int64(2), object(6)
memory usage: 84.2+ KB


## Add New Flag

In [106]:
def recategorize(df):   
    if (df['_merge']=='right_only') and (df['BC_TIRCP_merge'] == 'Black Cat Only'):
        return 'BlackCat Only'
    elif (df['_merge']=='right_only') and (df['BC_TIRCP_merge'] == 'TIRCP Only'):
        return 'TIRCP Only'
    elif (df['_merge']=='right_only') and (df['BC_TIRCP_merge'] == 'Found in both TIRCP and BlackCat'):
        return 'TIRCP and BlackCat'
    elif (df['_merge']=='left_only'):
        return 'DLA Only'
    elif (df['_merge']=='both') and (df['BC_TIRCP_merge'] == 'TIRCP Only'):
        return 'TIRCP and DLA'
    elif (df['_merge']=='both') and (df['BC_TIRCP_merge'] == 'Black Cat Only'):
        return 'BlackCat and DLA'
    elif (df['_merge']=='both') and (df['BC_TIRCP_merge'] == 'Found in both TIRCP and BlackCat'):
        return "TIRCP, BlackCat and DLA"
    else: 
        return ""

In [107]:
finaldf3['BC_TIRCP_DLA_merge'] = finaldf3.apply(recategorize, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finaldf3['BC_TIRCP_DLA_merge'] = finaldf3.apply(recategorize, axis = 1)


In [108]:
len(finaldf3)

1179

In [109]:
finaldf3._merge.value_counts()

left_only     934
right_only    133
both          112
Name: _merge, dtype: int64

In [110]:
finaldf3.BC_TIRCP_DLA_merge.value_counts()

DLA Only                   934
BlackCat Only              123
BlackCat and DLA            74
TIRCP and DLA               25
TIRCP, BlackCat and DLA     13
TIRCP Only                  10
Name: BC_TIRCP_DLA_merge, dtype: int64

In [111]:
(finaldf3>>filter(_._merge=='both'))

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop,n_locode,keep_or_drop2,BC_TIRCP_DLA_merge
19,5021,Black Cat Only,both,City Of Arcata,1,2,keep,2,keep,BlackCat and DLA
62,5902,Black Cat Only,both,Siskiyou County,2,3,keep,1,keep,BlackCat and DLA
92,5077,Black Cat Only,both,City Of Auburn,3,2,keep,2,keep,BlackCat and DLA
149,5473,Black Cat Only,both,Town Of Truckee,3,2,keep,2,keep,BlackCat and DLA
162,5182,Black Cat Only,both,City Of Roseville,3,2,keep,2,keep,BlackCat and DLA
...,...,...,...,...,...,...,...,...,...,...
1219,6376.0,Black Cat Only,both,North County Transit District,11.0,1,keep,1,keep,BlackCat and DLA
1220,6471.0,Black Cat Only,both,Imperial County Transportation Commission,11.0,1,keep,1,keep,BlackCat and DLA
1221,6066.0,TIRCP Only,both,San Diego Association Of Governments,11.0,1,keep,1,keep,TIRCP and DLA
1222,6071.0,TIRCP Only,both,Orange County Transportation Authority,12.0,1,keep,1,keep,TIRCP and DLA


In [112]:
finaldf3>>filter(_._merge=='right_only')

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop,n_locode,keep_or_drop2,BC_TIRCP_DLA_merge
1029,,Black Cat Only,right_only,Yuba-Sutter Transit Authority,03 - Marysville,1,keep,133,keep,BlackCat Only
1030,,Black Cat Only,right_only,Butte County Association Of Governments Butte ...,,1,keep,133,keep,BlackCat Only
1031,,Black Cat Only,right_only,Calaveras County Public Works,,1,keep,133,keep,BlackCat Only
1032,,Black Cat Only,right_only,County Of Los Angeles - Department Of Public W...,,1,keep,133,keep,BlackCat Only
1033,,Black Cat Only,right_only,County Of Nevada Public Works,,1,keep,133,keep,BlackCat Only
...,...,...,...,...,...,...,...,...,...,...
1157,,TIRCP Only,right_only,Foothill Transit,District 7: Los Angeles,1,keep,133,keep,TIRCP Only
1158,,TIRCP Only,right_only,Bay Area Rapid Transit,District 4: Bay Area / Oakland,1,keep,133,keep,TIRCP Only
1159,,TIRCP Only,right_only,Long Beach Transit,District 7: Los Angeles,1,keep,133,keep,TIRCP Only
1160,,TIRCP Only,right_only,Santa Monica Big Blue Bus,District 7: Los Angeles,1,keep,133,keep,TIRCP Only


In [113]:
finaldf3>>filter(_._merge=='left_only')

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop,n_locode,keep_or_drop2,BC_TIRCP_DLA_merge
0,6302.0,,left_only,Humboldt Bay Harbor Recreation & Conservation ...,1.0,1,keep,1,keep,DLA Only
1,6330.0,,left_only,Willow Creek Community Services District,1.0,1,keep,1,keep,DLA Only
2,5036.0,,left_only,Trinidad,1.0,1,keep,1,keep,DLA Only
3,5049.0,,left_only,Ukiah,1.0,1,keep,1,keep,DLA Only
4,5082.0,,left_only,Willits,1.0,1,keep,1,keep,DLA Only
...,...,...,...,...,...,...,...,...,...,...
1024,5465.0,,left_only,Leave Blank,53.0,1,keep,1,keep,DLA Only
1025,6250.0,,left_only,U.S. Fish And Wildlife Service,53.0,1,keep,1,keep,DLA Only
1026,6031.0,,left_only,Tidewater Southern Railway Company,53.0,1,keep,1,keep,DLA Only
1027,6176.0,,left_only,Tri Counties Regional Park Group,53.0,1,keep,1,keep,DLA Only


In [114]:
finaldf3>>filter(_.name.str.contains("Los Angeles"))

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop,n_locode,keep_or_drop2,BC_TIRCP_DLA_merge
636,5953.0,,left_only,Los Angeles County,7.0,1,keep,1,keep,DLA Only
638,5006.0,TIRCP Only,both,City Of Los Angeles,7,2,keep,2,keep,TIRCP and DLA
639,6033.0,,left_only,Los Angeles Junction Railway Company,7.0,1,keep,1,keep,DLA Only
660,6065.0,,left_only,Los Angeles County Metropolitan Transportation...,7.0,1,keep,1,keep,DLA Only
661,6080.0,,left_only,Los Angeles Conservation Corps,7.0,1,keep,1,keep,DLA Only
694,6384.0,,left_only,"Cal State Univeristy Los Angeles, Auxillary Se...",7.0,1,keep,1,keep,DLA Only
709,6508.0,,left_only,Los Angeles Unified School District,7.0,1,keep,1,keep,DLA Only
1032,,Black Cat Only,right_only,County Of Los Angeles - Department Of Public W...,,1,keep,133,keep,BlackCat Only
1152,,TIRCP Only,right_only,Los Angeles County Metropolitan Transportation,District 7: Los Angeles,1,keep,133,keep,TIRCP Only
1153,,TIRCP Only,right_only,Los Angeles-San Diego-San Luis Obispo Rail Cor...,Various,1,keep,133,keep,TIRCP Only


* still some that are duplicates 
    * first because of hyphens - but now that we took out hyphens, they are still not merging
    * one we know:
        * Los Angeles County Metropolitan District
        
    * manually changing this for now after export 

### Fix district number to name 

In [115]:
district_dictionary = {
    1: "District 1: Eureka",
    2: "District 2: Redding",
    3: "District 3: Marysville",
    4: "District 4: Oakland",
    5: "District 5: San Luis Obispo",
    6: "District 6: Fresno",
    7: "District 7: Los Angeles",
    8: "District 8: San Bernardino",
    9: "District 9: Bishop",
    10: "District 10: Stockton",
    11: "District 11: San Diego",
    12: "District 12: Orange County",
    "District 4: Bay Area / Oakland" : "District 4: Oakland",
    "10 - Stockton":"District 10: Stockton", 
    "01 - Eureka" : "District 1: Eureka", 
    "03 - Marysville" : "District 3: Marysville",
    "04 - Oakland" : "District 4: Oakland", 
    "District 5: San Luis Obispo / Santa Barbara" : "District 5: San Luis Obispo"
}

In [116]:
finaldf3["caltrans_district"] = finaldf3["caltrans_district"].replace(district_dictionary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finaldf3["caltrans_district"] = finaldf3["caltrans_district"].replace(district_dictionary)


In [117]:
finaldf3>>filter( _.caltrans_district.isnull())

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop,n_locode,keep_or_drop2,BC_TIRCP_DLA_merge
1030,,Black Cat Only,right_only,Butte County Association Of Governments Butte ...,,1,keep,133,keep,BlackCat Only
1031,,Black Cat Only,right_only,Calaveras County Public Works,,1,keep,133,keep,BlackCat Only
1032,,Black Cat Only,right_only,County Of Los Angeles - Department Of Public W...,,1,keep,133,keep,BlackCat Only
1033,,Black Cat Only,right_only,County Of Nevada Public Works,,1,keep,133,keep,BlackCat Only
1034,,Black Cat Only,right_only,County Of Sacramento Department Of Transportation,,1,keep,133,keep,BlackCat Only
...,...,...,...,...,...,...,...,...,...,...
1150,,Black Cat Only,right_only,Roderick Hayfork Senior Nutrition Center,,1,keep,133,keep,BlackCat Only
1151,,Black Cat Only,right_only,Yurok Tribe Transit,,1,keep,133,keep,BlackCat Only
1163,6905,Black Cat Only,both,Colusa County Transit Agency,,2,keep,1,keep,BlackCat and DLA
1165,709,Black Cat Only,both,Oparc,,2,keep,1,keep,BlackCat and DLA


In [118]:
## check to see if the name change worked
## if check is successful,there will be only a Mariposa County 
## if check is NOT successful, there will be a Mariposa County and a County of Mariposa
# finaldf3>>filter(_.name.str.contains('Mariposa'))

In [119]:
finaldf3.sample(5)

Unnamed: 0,locode,BC_TIRCP_merge,_merge,name,caltrans_district,n_name,keep_or_drop,n_locode,keep_or_drop2,BC_TIRCP_DLA_merge
887,6445.0,,left_only,Chula Vista Elementary School District,District 11: San Diego,1,keep,1,keep,DLA Only
403,6254.0,,left_only,"California State University,Montery Bay",District 5: San Luis Obispo,1,keep,1,keep,DLA Only
1033,,Black Cat Only,right_only,County Of Nevada Public Works,,1,keep,133,keep,BlackCat Only
826,6127.0,,left_only,Amador County Transportation Commission,District 10: Stockton,1,keep,1,keep,DLA Only
438,5289.0,,left_only,Pismo Beach,District 5: San Luis Obispo,1,keep,1,keep,DLA Only


In [120]:
finaldf3 = finaldf3>>select(_.locode, _.name, _.caltrans_district, _.BC_TIRCP_DLA_merge)

In [121]:
finaldf3

Unnamed: 0,locode,name,caltrans_district,BC_TIRCP_DLA_merge
0,6302.0,Humboldt Bay Harbor Recreation & Conservation ...,District 1: Eureka,DLA Only
1,6330.0,Willow Creek Community Services District,District 1: Eureka,DLA Only
2,5036.0,Trinidad,District 1: Eureka,DLA Only
3,5049.0,Ukiah,District 1: Eureka,DLA Only
4,5082.0,Willits,District 1: Eureka,DLA Only
...,...,...,...,...
1219,6376.0,North County Transit District,District 11: San Diego,BlackCat and DLA
1220,6471.0,Imperial County Transportation Commission,District 11: San Diego,BlackCat and DLA
1221,6066.0,San Diego Association Of Governments,District 11: San Diego,TIRCP and DLA
1222,6071.0,Orange County Transportation Authority,District 12: Orange County,TIRCP and DLA


## Save final

In [122]:
# finaldf3.info()

In [123]:
## converting locode to string so that parquet will work
# finaldf3['caltrans_district']= finaldf3['caltrans_district'].astype(str)
# finaldf3['locode']= finaldf3['locode'].astype(str)

In [124]:
## find what row was creating the problem
# finaldf3[(finaldf3 == 'M016').any(axis=1)]

In [125]:
# finaldf3.to_parquet('gs://calitp-analytics-data/data-analyses/grants/BlackCat_TIRCP_DLA.parquet')

In [126]:
# finaldf3.to_csv('gs://calitp-analytics-data/data-analyses/grants/BlackCat_TIRCP_DLA.csv')