# Connecting Locode info with exisitng E-76 data

In [3]:
#!pip install openpyxl

In [1]:
import pandas as pd
from siuba import *

import numpy as np

from datetime import date
from IPython.display import Markdown, HTML, display_html

from calitp import *

import ipywidgets as widgets
from ipywidgets import *
from IPython.display import Markdown
from IPython.core.display import display

## Importing Data
* using data from read in GCS:
    * [E-76 Obligated List](https://dot.ca.gov/programs/local-assistance/reports/e-76-obligated)
    * Obtained Locode and Agency list 

In [2]:
df = pd.read_csv('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/function_data.csv', low_memory=False).drop('Unnamed: 0', axis=1)



In [3]:
df.head()

Unnamed: 0,prefix,project_no,agency,prepared_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,ftip_no,project_location,type_of_work,seq,mpo,prepared_y
0,BPMPL,5904(121),Humboldt County,2018-12-18,0.0,0.0,0.0,Authorized,5904,1,HBPLOCAL,14 Bridges In Humboldt County,Bridge Preventive Maintenance - Deck Joints,3,NON-MPO,2018-01-01
1,ER,32D0(008),Mendocino County,2018-12-17,11508.0,0.0,13000.0,Authorized,5910,1,,"Comptche Ukiah Road, Cr 223 Pm 17.25",Permanent Restoration,3,NON-MPO,2018-01-01
2,ER,4820(004),Humboldt County,2018-12-07,45499.64,0.0,51394.58,Authorized,5904,1,,Mattole Rd Pm 43.17,Permanent Restoration,5,NON-MPO,2018-01-01
3,CML,5924(244),Sacramento County,2018-12-11,207002.0,0.0,247002.0,Authorized,5924,3,SAC25086,Fair Oaks Blvd. Between Howe Ave And Munroe St,Create A Smart Growth Corridor With Barrier Se...,1,SACOG,2018-01-01
4,CML,5924(214),Sacramento County,2018-12-05,0.0,5680921.0,5702041.0,Authorized,5924,3,SAC24753,Florin Rd Between Power Inn Rd. And Florin Per...,Streetscape (tc),3,SACOG,2018-01-01


In [4]:
locode = pd.concat(pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx', sheet_name=None), ignore_index=True)


In [5]:
locode.head()

Unnamed: 0,Agency Locode,Agency Name,District,County Name,RTPA Name,MPO Name,MPO Locode FADS,"""Active"" E76s (7-12-2021)"
0,6302,Humboldt Bay Harbor Recreation & Conservation ...,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
1,6330,Willow Creek Community Services District,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
2,5036,Trinidad,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
3,5049,Ukiah,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,Yes
4,5082,Willits,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,


In [6]:
locode_df = to_snakecase(locode)

In [7]:
locode_df.rename(columns={'agency_locode':'locode'}, inplace=True)

In [8]:
locode_df.head()

Unnamed: 0,locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_
0,6302,Humboldt Bay Harbor Recreation & Conservation ...,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
1,6330,Willow Creek Community Services District,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
2,5036,Trinidad,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
3,5049,Ukiah,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,Yes
4,5082,Willits,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,


In [9]:
locode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041 entries, 0 to 1040
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   locode                       1041 non-null   int64 
 1   agency_name                  1041 non-null   object
 2   district                     1041 non-null   int64 
 3   county_name                  942 non-null    object
 4   rtpa_name                    1041 non-null   object
 5   mpo_name                     1041 non-null   object
 6   mpo_locode_fads              1041 non-null   object
 7   active_e76s______7_12_2021_  464 non-null    object
dtypes: int64(2), object(6)
memory usage: 65.2+ KB


In [10]:
#should be possible based on these two. 

In [11]:
locode_df>>filter(_.agency_name.str.contains('Humboldt'))>>count(_.locode)

Unnamed: 0,locode,n
0,5904,1
1,6133,1
2,6162,1
3,6302,1
4,6487,1


In [12]:
df>>filter(_.agency.str.contains('Humboldt'))>>count(_.locode)

Unnamed: 0,locode,n
0,5904,627
1,5940,1
2,6302,1


## Create Agency/Locode list from Obligated Data

In [13]:
# creating locode crosswalk from the original data to see if that works. 

In [14]:
group = df.groupby('agency')
crosswalk_all = group.apply(lambda x: x['locode'].unique())
crosswalk_all.head()

agency
Access Services                                 [6312]
Agoura Hills                               [5435, nan]
Ala-Con Costa T                                 [6002]
Alameda                                         [5014]
Alameda - Contra Costa Transit District         [6002]
dtype: object

In [15]:
crosswalk_all >> pipe(_.explode)

<bound method Series.explode of agency
Access Services                                             [6312]
Agoura Hills                                           [5435, nan]
Ala-Con Costa T                                             [6002]
Alameda                                                     [5014]
Alameda - Contra Costa Transit District                     [6002]
                                                    ...           
Yrts                                                         [nan]
Yuba City                                                   [5163]
Yuba County                                                 [5916]
Yucaipa                                    [5457, nan, 5954, NBIL]
Yucca Valley                                           [5466, nan]
Length: 671, dtype: object>

In [16]:
cw = pd.DataFrame([crosswalk_all])


In [17]:
cw

agency,Access Services,Agoura Hills,Ala-Con Costa T,Alameda,Alameda - Contra Costa Transit District,Alameda Corridor Transportation Authority,Alameda County,Alameda County Congestion Management Agency,Alameda County Transportation Commission,Alameda County Transportation Improvement Authority,...,Yolo County Transportation District,Yorba Linda,Yosemite Area Regional Transportation System Jpa,Yountville,Yreka City,Yrts,Yuba City,Yuba County,Yucaipa,Yucca Valley
0,[6312],"[5435, nan]",[6002],[5014],[6002],[6246],[5933],[6273],[6480],[6430],...,[6195],"[5402, nan]",[6305],[5395],[5020],[nan],[5163],[5916],"[5457, nan, 5954, NBIL]","[5466, nan]"


In [18]:
cw = cw >> gather('agency', 'locode', _["Access Services":"Yucca Valley"])

### Add rows for agencies with more than one locode

In [19]:
cw2 = (cw 
     >> pipe(_.explode('locode'))
) 

In [20]:
cw2

Unnamed: 0,agency,locode
0,Access Services,6312
1,Agoura Hills,5435
1,Agoura Hills,
2,Ala-Con Costa T,6002
3,Alameda,5014
...,...,...
669,Yucaipa,
669,Yucaipa,5954
669,Yucaipa,NBIL
670,Yucca Valley,5466


In [21]:
cw2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 885 entries, 0 to 670
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   agency  885 non-null    object
 1   locode  680 non-null    object
dtypes: object(2)
memory usage: 20.7+ KB


### Convert locode to numeric

In [22]:
locode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041 entries, 0 to 1040
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   locode                       1041 non-null   int64 
 1   agency_name                  1041 non-null   object
 2   district                     1041 non-null   int64 
 3   county_name                  942 non-null    object
 4   rtpa_name                    1041 non-null   object
 5   mpo_name                     1041 non-null   object
 6   mpo_locode_fads              1041 non-null   object
 7   active_e76s______7_12_2021_  464 non-null    object
dtypes: int64(2), object(6)
memory usage: 65.2+ KB


In [23]:
def get_num(x):
    try:
        return int(x)
    except Exception:
        try:
            return float(x)
        except Exception:
            return x


In [24]:
cw2['locode'] = cw2['locode'].apply(get_num)

## Joining Dataframes

In [25]:
join = (cw2 >> full_join(_, locode_df, on = "locode")) 

In [26]:
join.head()

Unnamed: 0,agency,locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_
0,Access Services,6312.0,Access Services,7.0,Los Angeles County,Los Angeles County Metropolitan Transportation...,Southern California Association Of Governments,SCAG,
1,Agoura Hills,5435.0,Agoura Hills,7.0,Los Angeles County,NON-RTPA,Southern California Association Of Governments,SCAG,Yes
2,Agoura Hills,,,,,,,,
3,Alhambra,,,,,,,,
4,Amador,,,,,,,,


## Analysis of the Locodes

### How many have no Locode?

In [27]:
join>>count(_.locode)>>arrange(-_.n)

Unnamed: 0,locode,n
1038,7500,12
538,5953,5
488,5903,4
539,5954,4
501,5916,3
...,...,...
1042,7504,1
1043,7505,1
1044,32L0,1
1045,38R0,1


In [28]:
print(len(join>>filter(_.locode==0)))

0


In [29]:
join>>filter(_.locode==0)

Unnamed: 0,agency,locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_


In [30]:
join>>filter(_.locode==0)>>count(_.agency)>>arrange(-_.n)

Unnamed: 0,agency,n


#### Dropping agencies with no locode

In [31]:
join.drop(join[join.locode == 0].index, inplace=True)

In [32]:
join>>filter(_.locode==0)

Unnamed: 0,agency,locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_


### What Agencies have incorrect locodes?
Possible from manual entries

In [33]:
multiple_id = (join>>count(_.locode)>>arrange(-_.n)>>filter(_.n>1))

In [48]:
print(len(multiple_id))

30


In [49]:
(join
    >>filter(_.agency_name.str.contains('Los Angeles County Metropolitan Transportatio'))
     >>select(_.agency_name)
)

Unnamed: 0,agency_name
486,Los Angeles County Metropolitan Transportation...
487,Los Angeles County Metropolitan Transportation...


In [50]:
@interact
def find_agencies(locode=multiple_id.locode.unique().tolist()):
    
    filtering = (join >> filter(_.locode== locode))
    
    display(Markdown(f"**Agencies using Locode number {locode}**"))
    
    display(filtering)


interactive(children=(Dropdown(description='locode', options=(7500, 5953, 5903, 5954, 5916, 5940, 5957, 6190, …

### Checking the Obligated df for project location

In [56]:
df>>filter(_.locode=='5954')>>filter(_.agency=="Tehama County")

Unnamed: 0,prefix,project_no,agency,prepared_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,ftip_no,project_location,type_of_work,seq,mpo,prepared_y
19512,ACSTP,40A0(067),Tehama County,2021-06-03,0.0,5478.37,5660.65,Authorized,5954,2,,Dusty Way Closure And Monitoring.,Ca19-2 Eo Work To Remove Debris And Cleanup. ...,1,SCAG,2021-01-01


In [57]:
locode_df>>filter(_.agency_name=='Tehama County')

Unnamed: 0,locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_
45,5908,Tehama County,2,Tehama County,Tehama County Transportation Commission,NON-MPO,NON-MPO,Yes


##### ---
* is it worth manually changing the locodes? or not because it might somehow be connected? 

In [58]:
df>>filter(_.locode=='5953')>>filter(_.agency=="Marin County")

Unnamed: 0,prefix,project_no,agency,prepared_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,ftip_no,project_location,type_of_work,seq,mpo,prepared_y
19568,ACSTP,40A0(087),Marin County,2021-06-16,0.0,676463.01,764107.59,Authorized,5953,4,,Lucas Valley Rd At Mp 3.92,Lucas Valley Rd At Mp 3.92 Roadway Damage. Un...,2,SCAG,2021-01-01


In [59]:
df>>filter(_.locode=='5953')>>filter(_.agency=="Paradise")

Unnamed: 0,prefix,project_no,agency,prepared_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,ftip_no,project_location,type_of_work,seq,mpo,prepared_y
16249,ACSTP,38Y0(025),Paradise,2020-04-06,0.0,107607.5,143000.0,Authorized,5953,3,,In Paradise On Neal Rd From Wayland Rd To Skyway.,Pavement Repair Of 1.63 Miles Of Neal Rd.,1,SCAG,2020-01-01
17589,ER,38Y0(025),Paradise,2020-10-29,107607.5,-107607.5,0.0,Authorized,5953,3,,In Paradise On Neal Rd From Wayland Rd To Skyway.,Pavement Repair Of 1.63 Miles Of Neal Rd.,2,SCAG,2020-01-01
18969,ER,38Y0(025),Paradise,2021-05-14,0.0,0.0,0.0,Authorized,5953,3,,In Paradise On Neal Rd From Wayland Rd To Skyway.,Pavement Repair Of 1.63 Miles Of Neal Rd.,3,SCAG,2021-01-01
19548,ER,38Y0(025),Paradise,2021-08-13,0.0,0.0,0.0,Authorized,5953,3,,In Paradise On Neal Rd From Wayland Rd To Skyway.,Pavement Repair Of 1.63 Miles Of Neal Rd.,4,SCAG,2021-01-01


In [61]:
#location is in Paradise, while Locode is Los Angeles County

In [62]:
locode_df>>filter(_.agency_name=='Paradise')

Unnamed: 0,locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_
175,5425,Paradise,3,Butte County,Butte County Association of Governments,Butte County Association Of Governments,BCAG,Yes


* Some agencies with same locodes are the same on the oblgiated list, but just have different spellings. 
    * for example: Locode 6002 Alameda-Contra Costa Transit District spelled
        * Ala-Con Consta T 
        * Alameda-Contra Costa Transit District
* Documenting these allows us to have a correct list of agencies instead of having duplicates      
        


## Agencies to Change

| Locodes  |Agency in Locode List | Agencies to Change |
| ------------- | ------------- | ------------- |
| 5953 | Los Angeles County | Los Angeles, Marin County, Paradise, Trinity County  |
| 5903 |Modoc County | Alpine County, Monterey County, Nevada County  |
| 5954 |San Bernardino County | San Bernardino, Tehama County, Yucaipa    |
| 5916 |Yuba County | Shasta County, Tuolumne County|
| 5940 | Mariposa County | Humboldt, Mariposa| 
| 5957 | San Diego County | San Diego, Cathedral City|
| 6190 | U.S. Forest Service, Pacific Southwest Region | Usda Forest Ser, United States Forest Service| 
| 5020 |Yreka City | Sonoma County|
| 5275 | Indio | Palm Springs | 
| 5351 | Pico Rivera | Los Angeles County |
| 5391 | Morro Bay | Ora Co Trans Au | 
| 5463 | Calabasas | Calaveras |
| 5912 | Butte County | Santa Barbara County | 
| 5921 | Napa County | Shasta County |
| 5930 | Calaveras County | Los Angeles County | 
| 5936 | Santa Cruz County | Monterey County |
| 5939 | Merced County | Merced | 
| 5956 | Riverside County | Riverside |
| 5958 | Imperial County | Imperial | 
| 5961 | Kern County (District 9) | Kern |
| 6000 | San Francisco Bay Area Rapid Transit District | Bay Area Rt | 
| 6002 | Alameda - Contra Costa Transit District | Ala-Con Costa T | 
| 6065 | Los Angeles County Metropolitan Transportation... | La Co M T A |
| 6081 | Department of Parks and Recreation | Parks And Rec | 
| 6264 | Santa Clara Valley Transportation Authority | Vta |
| 6343 | Marin County Transit District | Mctd | 
| 6365 | San Francisco Bay Area Water Transit Authority | Wta |

## Agencies to find
  
| Locodes  | Agency in List | Agencies to find |
| ------------- | ------------- | ------------- |
| 7500  | NaN | Banning, Fowler, Lancaster, Los Angeles, Palmdale, Richmond, San Luis Obispo, San Mateo, Sgvc, Stockton, Sutter, Ventura  |
| 40A0  | NaN | Mendocino, San Bernardino, Santa Cruz  |
| NBIL  | NaN | La Quinta, Yucaipa                     |