# Creating Data Crosswalks 
**3 categories:**
1. Exisiting Agencies 
    * For example, misspelled names
2. Null Agencies
3. Unmatched Agencies

In [1]:
import pandas as pd
from siuba import *

import numpy as np

from datetime import date
from IPython.display import Markdown, HTML, display_html

from calitp import *

#### Reading in the Obligated & Waiting Data

In [2]:
df = pd.read_csv('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/clean_obligated_waiting.csv', low_memory=False).drop('Unnamed: 0', axis=1)



In [3]:
df.head()

Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit__to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,...,project_location,type_of_work,seq,date_request_initiated,date_completed_request,mpo,warning,projectID,projectNO,compare_id_locode
0,Obligated,BPMPL,5904(121),Humboldt County,2018-12-18,2018-12-18,2018-12-18,2018-12-18,2018-12-27,0.0,...,14 Bridges In Humboldt County,Bridge Preventive Maintenance - Deck Joints,3,,,NON-MPO,,5904,121,True
1,Obligated,ER,32D0(008),Mendocino County,2018-12-17,2018-12-19,2018-12-20,2018-12-20,2018-12-27,11508.0,...,"Comptche Ukiah Road, Cr 223 Pm 17.25",Permanent Restoration,3,2018-12-17,2018-12-18,NON-MPO,,32D0,8,False
2,Obligated,ER,4820(004),Humboldt County,2018-12-07,2018-12-21,2018-12-21,2018-12-21,2018-12-27,45499.64,...,Mattole Rd Pm 43.17,Permanent Restoration,5,2018-12-06,2018-12-07,NON-MPO,,4820,4,False
3,Obligated,CML,5924(244),Sacramento County,2018-12-11,2018-12-11,2018-12-21,2018-12-27,2018-12-27,207002.0,...,Fair Oaks Blvd. Between Howe Ave And Munroe St,Create A Smart Growth Corridor With Barrier Se...,1,2018-12-07,2018-12-07,SACOG,,5924,244,True
4,Obligated,CML,5924(214),Sacramento County,2018-12-05,2018-12-11,2018-12-21,2018-12-27,2018-12-27,0.0,...,Florin Rd Between Power Inn Rd. And Florin Per...,Streetscape (tc),3,2018-11-28,2018-12-04,SACOG,,5924,214,True


In [4]:
def get_num(x):
    try:
        return int(x)
    except Exception:
        try:
            return float(x)
        except Exception:
            return x

In [5]:
df['locode'] = df['locode'].apply(get_num)

In [6]:
df['projectID'] = df['projectID'].apply(get_num)

#### Reading in the Official Locode List

In [7]:
l_df = pd.concat(pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx', sheet_name=None), ignore_index=True)

In [8]:
l_df = to_snakecase(l_df)

In [9]:
l_df['agency_locode'] = l_df['agency_locode'].apply(get_num)

In [10]:
l_df.head()

Unnamed: 0,agency_locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_
0,6302,Humboldt Bay Harbor Recreation & Conservation ...,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
1,6330,Willow Creek Community Services District,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
2,5036,Trinidad,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
3,5049,Ukiah,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,Yes
4,5082,Willits,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,


## Creating Crosswalk 1: Locode Agency Relationships

In [11]:
df>>count(_.agency, _.locode)

Unnamed: 0,agency,locode,n
0,Access Services,6312,17
1,Agoura Hills,5435,2
2,Ala-Con Costa T,6002,1
3,Alameda,5014,21
4,Alameda - Contra Costa Transit District,6002,6
...,...,...,...
675,Yuba County,5916,141
676,Yucaipa,5457,20
677,Yucaipa,5954,1
678,Yucaipa,NBIL,1


In [12]:
df_doc1 = (df>>count(_.agency, _.locode))

In [13]:
df_doc1 = df_doc1.drop('n', 1)

  df_doc1 = df_doc1.drop('n', 1)


In [14]:
df_doc1 = df_doc1.rename(columns={'locode': 'agency_locode', 'agency':'agency_name'})

In [15]:
df_doc1

Unnamed: 0,agency_name,agency_locode
0,Access Services,6312
1,Agoura Hills,5435
2,Ala-Con Costa T,6002
3,Alameda,5014
4,Alameda - Contra Costa Transit District,6002
...,...,...
675,Yuba County,5916
676,Yucaipa,5457
677,Yucaipa,5954
678,Yucaipa,NBIL


In [16]:
df_doc2 = l_df[['agency_name','agency_locode']].copy()

In [17]:
df_doc2 = df_doc2.rename(columns={'agency_name': 'primary_agency_name','agency_locode':'primary_agency_locode'})

In [18]:
df_doc2

Unnamed: 0,primary_agency_name,primary_agency_locode
0,Humboldt Bay Harbor Recreation & Conservation ...,6302
1,Willow Creek Community Services District,6330
2,Trinidad,5036
3,Ukiah,5049
4,Willits,5082
...,...,...
1036,Leave Blank,5465
1037,U.S. Fish and Wildlife Service,6250
1038,Tidewater Southern Railway Company,6031
1039,Tri-Counties Regional Park Group,6176


### Merging Official Locode List and unique agencies in Obligated List

In [19]:
data_doc = pd.merge(df_doc1, df_doc2, how='left', left_on='agency_locode', right_on='primary_agency_locode')

In [20]:
data_doc

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
0,Access Services,6312,Access Services,6312.0
1,Agoura Hills,5435,Agoura Hills,5435.0
2,Ala-Con Costa T,6002,Alameda - Contra Costa Transit District,6002.0
3,Alameda,5014,Alameda,5014.0
4,Alameda - Contra Costa Transit District,6002,Alameda - Contra Costa Transit District,6002.0
...,...,...,...,...
675,Yuba County,5916,Yuba County,5916.0
676,Yucaipa,5457,Yucaipa,5457.0
677,Yucaipa,5954,San Bernardino County,5954.0
678,Yucaipa,NBIL,,


### Exceptions in Data

In [21]:
d = {'agency_name': ['Santa Barbara County', 'Calaveras','Los Angeles County', 'Palm Springs',
                'Marin County','Paradise','Trinity County','Humboldt','Alpine County','Monterey County',
                'Nevada County','Los Angeles County','Tehama County','Yucaipa','Cathedral City','Monterey County',
                'Solano County Transit','Tuolumne County'],
     'agency_locode': [5912, 5463, 5930, 5275, 5953, 5953, 5953, 5940, 5903, 
                5903, 5903, 5351, 5954, 5954, 5957, 5936, 6503, 5916],
     'primary_agency_name': ['Butte County', 'Calabasas','Calaveras County', 'Indio','Los Angeles County',
                             'Los Angeles County','Los Angeles County','Mariposa County','Modoc County','Modoc County',
                             'Modoc County','Pico Rivera','San Bernardino County','San Bernardino County',
                             'San Diego County','Santa Cruz County',
                             'Transbay Joint Powers Authority','Yuba County'],
     'primary_agency_locode': [5912, 5463, 5930, 5275, 5953, 5953, 5953, 5940, 5903, 5903,
                               5903, 5351, 5954, 5954, 5957, 5936, 6503, 5916]}

In [22]:
exceptions = pd.DataFrame(data=d)

In [23]:
exceptions

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
0,Santa Barbara County,5912,Butte County,5912
1,Calaveras,5463,Calabasas,5463
2,Los Angeles County,5930,Calaveras County,5930
3,Palm Springs,5275,Indio,5275
4,Marin County,5953,Los Angeles County,5953
5,Paradise,5953,Los Angeles County,5953
6,Trinity County,5953,Los Angeles County,5953
7,Humboldt,5940,Mariposa County,5940
8,Alpine County,5903,Modoc County,5903
9,Monterey County,5903,Modoc County,5903


### Removing entries that have NaN or contain wrong locodes/agencies for later 

In [24]:
#saving as df for later use
unmatched1 = data_doc>>filter(_.primary_agency_name.isnull())

In [25]:
unmatched1

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
40,Banning,7500,,
88,Caltrans,7504,,
190,Fowler,7500,,
204,Gold Coast Transit District,7505,,
210,Grass Valley,32L0,,
266,La Quinta,NBIL,,
277,Lancaster,7500,,
299,Los Angeles,7500,,
300,Los Angeles,38Y0,,
325,Mendocino,40A0,,


In [26]:
data_doc = data_doc[data_doc['primary_agency_name'].notna()]

In [27]:
data_doc>>filter(_.primary_agency_name.isnull())

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode


### Merging Exceptions and Data Doc to remove exceptions

In [28]:
# code help: https://stackoverflow.com/questions/28901683/pandas-get-rows-which-are-not-in-other-dataframe

In [29]:
df_all = data_doc.merge(exceptions.drop_duplicates(), on=['agency_name','agency_locode','primary_agency_name','primary_agency_locode'], 
                   how='left', indicator=True)
df_all

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode,_merge
0,Access Services,6312,Access Services,6312.0,left_only
1,Agoura Hills,5435,Agoura Hills,5435.0,left_only
2,Ala-Con Costa T,6002,Alameda - Contra Costa Transit District,6002.0,left_only
3,Alameda,5014,Alameda,5014.0,left_only
4,Alameda - Contra Costa Transit District,6002,Alameda - Contra Costa Transit District,6002.0,left_only
...,...,...,...,...,...
653,Yuba City,5163,Yuba City,5163.0,left_only
654,Yuba County,5916,Yuba County,5916.0,left_only
655,Yucaipa,5457,Yucaipa,5457.0,left_only
656,Yucaipa,5954,San Bernardino County,5954.0,both


In [30]:
df_all = (df_all>>filter(_._merge=='left_only'))

In [31]:
df_all.drop('_merge', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all.drop('_merge', axis=1, inplace=True)


In [32]:
df_all

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
0,Access Services,6312,Access Services,6312.0
1,Agoura Hills,5435,Agoura Hills,5435.0
2,Ala-Con Costa T,6002,Alameda - Contra Costa Transit District,6002.0
3,Alameda,5014,Alameda,5014.0
4,Alameda - Contra Costa Transit District,6002,Alameda - Contra Costa Transit District,6002.0
...,...,...,...,...
652,Yreka City,5020,Yreka City,5020.0
653,Yuba City,5163,Yuba City,5163.0
654,Yuba County,5916,Yuba County,5916.0
655,Yucaipa,5457,Yucaipa,5457.0


In [33]:
#df_all['primary_agency_locode'] = df_all['primary_agency_locode'].apply(get_num)
#df_all['agency_locode'] = df_all['agency_locode'].apply(get_num)

### Compare agency names

In [34]:
compare_names = np.where(df_all["agency_name"] == df_all["primary_agency_name"], True, False)
df_all["compare_names"] = compare_names

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all["compare_names"] = compare_names


In [35]:
df_all

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode,compare_names
0,Access Services,6312,Access Services,6312.0,True
1,Agoura Hills,5435,Agoura Hills,5435.0,True
2,Ala-Con Costa T,6002,Alameda - Contra Costa Transit District,6002.0,False
3,Alameda,5014,Alameda,5014.0,True
4,Alameda - Contra Costa Transit District,6002,Alameda - Contra Costa Transit District,6002.0,True
...,...,...,...,...,...
652,Yreka City,5020,Yreka City,5020.0,True
653,Yuba City,5163,Yuba City,5163.0,True
654,Yuba County,5916,Yuba County,5916.0,True
655,Yucaipa,5457,Yucaipa,5457.0,True


In [36]:
df_all.compare_names.value_counts()

True     585
False     55
Name: compare_names, dtype: int64

In [37]:
df_all>>filter(_.compare_names==False)>>select(_.agency_name, _.primary_agency_name, _.primary_agency_locode)

Unnamed: 0,agency_name,primary_agency_name,primary_agency_locode
2,Ala-Con Costa T,Alameda - Contra Costa Transit District,6002.0
42,Bay Area Rt,San Francisco Bay Area Rapid Transit District,6000.0
62,Butte County Association Of Governments,Butte County Association of Governments,6092.0
65,Calaveras Council Of Governments,Calaveras Council of Governments,6128.0
106,"City & County Of San Francisco, Mta/Parking & ...","City & County of San Francisco, MTA/Parking & ...",6328.0
107,City/County Association Of Governments Of San ...,City/County Association of Governments of San ...,6419.0
115,Coachella Valley Association Of Governments,Coachella Valley Association of Governments,6164.0
133,Council Of Fresno County Governments,Council of Fresno County Governments,6086.0
134,Council Of San Benito County Governments,Council of San Benito County Governments,6060.0
148,Department Of Parks And Recreation,Department of Parks and Recreation,6081.0


In [38]:
df_all >> group_by(_.primary_agency_name) >> summarize(n=_.agency_name.nunique()) >> arrange(-_.n) >>filter(_.n>1)


Unnamed: 0,primary_agency_name,n
557,"U.S. Forest Service, Pacific Southwest Region",3
3,Alameda - Contra Costa Transit District,2
131,Department of Parks and Recreation,2
211,Imperial County,2
225,Kern County (District 9),2
270,Los Angeles County,2
271,Los Angeles County Metropolitan Transportation...,2
283,Marin County Transit District,2
285,Mariposa County,2
294,Merced County,2


In [39]:
#running through these matches and checking to make sure they are already documented in `issues_dla_data_locode.xlsx`
df_all >> filter(_.primary_agency_name=='Yreka City') >> arrange(_.agency_name)


Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode,compare_names
547,Sonoma County,5020,Yreka City,5020.0,False
652,Yreka City,5020,Yreka City,5020.0,True


In [40]:
#looking at the locode count too
df_all>>count(_.agency_locode)>>arrange(-_.n)>>filter(_.n>=2)

Unnamed: 0,agency_locode,n
538,6190,3
17,5020,2
341,5391,2
440,5916,2
445,5921,2
463,5939,2
464,5940,2
477,5953,2
478,5954,2
480,5956,2


In [41]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 640 entries, 0 to 657
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   agency_name            640 non-null    object 
 1   agency_locode          640 non-null    object 
 2   primary_agency_name    640 non-null    object 
 3   primary_agency_locode  640 non-null    float64
 4   compare_names          640 non-null    bool   
dtypes: bool(1), float64(1), object(3)
memory usage: 25.6+ KB


In [42]:
df_all

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode,compare_names
0,Access Services,6312,Access Services,6312.0,True
1,Agoura Hills,5435,Agoura Hills,5435.0,True
2,Ala-Con Costa T,6002,Alameda - Contra Costa Transit District,6002.0,False
3,Alameda,5014,Alameda,5014.0,True
4,Alameda - Contra Costa Transit District,6002,Alameda - Contra Costa Transit District,6002.0,True
...,...,...,...,...,...
652,Yreka City,5020,Yreka City,5020.0,True
653,Yuba City,5163,Yuba City,5163.0,True
654,Yuba County,5916,Yuba County,5916.0,True
655,Yucaipa,5457,Yucaipa,5457.0,True


#### `df_all` to CSV

In [43]:
#df_all.to_csv('agencylocode_primary_crosswalk_6.csv',index=False)

## Crosswalk 2: Agencies with Null Locodes

### Extract the obliations with the null lococde values 

In [44]:
null = df>>filter(_.locode.isnull())

In [45]:
len(null)

1499

In [46]:
null.agency.nunique()

205

In [47]:
null_doc1 = (null>> count(_.agency, _.projectID)>>arrange(-_.n))

In [48]:
null_doc1 = null_doc1.drop('n', 1)

  null_doc1 = null_doc1.drop('n', 1)


In [49]:
null_doc1.rename(columns={'agency': 'agency_name', 'projectID': 'agency_locode_id'}, inplace=True)

In [50]:
null_doc1

Unnamed: 0,agency_name,agency_locode_id
163,San Joaquin,5929
103,Los Angeles,5006
187,Stockton,5008
186,Stanislaus,5938
115,Modesto,5059
...,...,...
193,Torrance,5249
197,Twentynine Palm,NBIL
198,Union Pacific,6020
208,Westmorland,5278


In [51]:
df_doc2

Unnamed: 0,primary_agency_name,primary_agency_locode
0,Humboldt Bay Harbor Recreation & Conservation ...,6302
1,Willow Creek Community Services District,6330
2,Trinidad,5036
3,Ukiah,5049
4,Willits,5082
...,...,...
1036,Leave Blank,5465
1037,U.S. Fish and Wildlife Service,6250
1038,Tidewater Southern Railway Company,6031
1039,Tri-Counties Regional Park Group,6176


### Merge with Official Locode List

In [52]:
null_doc_full = pd.merge(null_doc1, df_doc2, how='left', left_on='agency_locode_id', right_on='primary_agency_locode')

In [53]:
null_doc_full

Unnamed: 0,agency_name,agency_locode_id,primary_agency_name,primary_agency_locode
0,San Joaquin,5929,San Joaquin County,5929.0
1,Los Angeles,5006,Los Angeles,5006.0
2,Stockton,5008,Stockton,5008.0
3,Stanislaus,5938,Stanislaus County,5938.0
4,Modesto,5059,Modesto,5059.0
...,...,...,...,...
210,Torrance,5249,Torrance,5249.0
211,Twentynine Palm,NBIL,,
212,Union Pacific,6020,Union Pacific Railroad,6020.0
213,Westmorland,5278,Westmorland,5278.0


In [54]:
unmatched2 = null_doc_full>>filter(_.primary_agency_name.isnull())

In [55]:
#keeping for next part
unmatched2

Unnamed: 0,agency_name,agency_locode_id,primary_agency_name,primary_agency_locode
27,Inyo,24M0,,
163,Yucaipa,NBIL,,
166,Anaheim,11CA,,
184,La Jolla Band Of Luiseno Indians,X075,,
205,Sanbag,499,,
211,Twentynine Palm,NBIL,,


In [56]:
null_doc_full = null_doc_full[null_doc_full['primary_agency_name'].notna()]

In [57]:
null_doc_full

Unnamed: 0,agency_name,agency_locode_id,primary_agency_name,primary_agency_locode
0,San Joaquin,5929,San Joaquin County,5929.0
1,Los Angeles,5006,Los Angeles,5006.0
2,Stockton,5008,Stockton,5008.0
3,Stanislaus,5938,Stanislaus County,5938.0
4,Modesto,5059,Modesto,5059.0
...,...,...,...,...
209,Temecula,5459,Temecula,5459.0
210,Torrance,5249,Torrance,5249.0
212,Union Pacific,6020,Union Pacific Railroad,6020.0
213,Westmorland,5278,Westmorland,5278.0


In [58]:
#null_doc_full['primary_agency_locode'] = null_doc_full['primary_agency_locode'].apply(get_num)
#null_doc_full['agency_locode_id'] = null_doc_full['agency_locode_id'].apply(get_num)

In [59]:
null_doc_full

Unnamed: 0,agency_name,agency_locode_id,primary_agency_name,primary_agency_locode
0,San Joaquin,5929,San Joaquin County,5929.0
1,Los Angeles,5006,Los Angeles,5006.0
2,Stockton,5008,Stockton,5008.0
3,Stanislaus,5938,Stanislaus County,5938.0
4,Modesto,5059,Modesto,5059.0
...,...,...,...,...
209,Temecula,5459,Temecula,5459.0
210,Torrance,5249,Torrance,5249.0
212,Union Pacific,6020,Union Pacific Railroad,6020.0
213,Westmorland,5278,Westmorland,5278.0


### Comparing the columns 

In [60]:
compare_names2 = np.where(null_doc_full["agency_name"] == null_doc_full["primary_agency_name"], True, False)
null_doc_full["compare_names"] = compare_names2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_doc_full["compare_names"] = compare_names2


In [61]:
null_doc_full.compare_names.value_counts()

True     159
False     50
Name: compare_names, dtype: int64

In [62]:
null_doc_full>>filter(_.compare_names==False)

Unnamed: 0,agency_name,agency_locode_id,primary_agency_name,primary_agency_locode,compare_names
0,San Joaquin,5929,San Joaquin County,5929.0,False
3,Stanislaus,5938,Stanislaus County,5938.0,False
5,Los Angeles,5953,Los Angeles County,5953.0,False
7,Merced,5939,Merced County,5939.0,False
9,Sanbag,6053,San Bernardino Associated Governments,6053.0,False
11,San Bernardino,5954,San Bernardino County,5954.0,False
14,Riverside,5956,Riverside County,5956.0,False
15,Sd Assoc Gov'T,6066,San Diego Association of Governments,6066.0,False
16,Tuolumne,5932,Tuolumne County,5932.0,False
18,Calaveras,5930,Calaveras County,5930.0,False


In [63]:
# good with these false values- none are new issues, but rather contain acronyms or correct spelling. 

In [64]:
null_doc_full>>filter(_.compare_names==True)

Unnamed: 0,agency_name,agency_locode_id,primary_agency_name,primary_agency_locode,compare_names
1,Los Angeles,5006,Los Angeles,5006.0,True
2,Stockton,5008,Stockton,5008.0,True
4,Modesto,5059,Modesto,5059.0,True
6,Turlock,5165,Turlock,5165.0,True
8,Merced,5085,Merced,5085.0,True
...,...,...,...,...,...
207,Santa Paula,5121,Santa Paula,5121.0,True
209,Temecula,5459,Temecula,5459.0,True
210,Torrance,5249,Torrance,5249.0,True
213,Westmorland,5278,Westmorland,5278.0,True


#### `null_doc_full` to CSV

In [65]:
#drop `compare_names` column first

In [66]:
null_doc_full = null_doc_full.drop('compare_names', 1)

  null_doc_full = null_doc_full.drop('compare_names', 1)


In [67]:
#null_doc_full.to_csv('null_doc_full1.csv',index=False)

## Crosswalk 3: Unmatched

In [68]:
unmatched1

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
40,Banning,7500,,
88,Caltrans,7504,,
190,Fowler,7500,,
204,Gold Coast Transit District,7505,,
210,Grass Valley,32L0,,
266,La Quinta,NBIL,,
277,Lancaster,7500,,
299,Los Angeles,7500,,
300,Los Angeles,38Y0,,
325,Mendocino,40A0,,


In [69]:
unmatched2 = unmatched2.rename(columns={'agency_locode_id': 'agency_locode'})

In [70]:
unmatched2

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
27,Inyo,24M0,,
163,Yucaipa,NBIL,,
166,Anaheim,11CA,,
184,La Jolla Band Of Luiseno Indians,X075,,
205,Sanbag,499,,
211,Twentynine Palm,NBIL,,


In [71]:
unmatched_total = pd.concat([unmatched1, unmatched2], ignore_index=True)

In [72]:
unmatched_total>>arrange(_.agency_name)

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
24,Anaheim,11CA,,
0,Banning,7500,,
1,Caltrans,7504,,
2,Fowler,7500,,
3,Gold Coast Transit District,7505,,
4,Grass Valley,32L0,,
22,Inyo,24M0,,
25,La Jolla Band Of Luiseno Indians,X075,,
5,La Quinta,NBIL,,
6,Lancaster,7500,,


In [73]:
unmatched_total= unmatched_total.drop('primary_agency_name', 1)
unmatched_total= unmatched_total.drop('primary_agency_locode', 1)

  unmatched_total= unmatched_total.drop('primary_agency_name', 1)
  unmatched_total= unmatched_total.drop('primary_agency_locode', 1)


### Merge onto Official List on Agency Name
* Assuming that the agency name are the same

In [74]:
unmatched_est = pd.merge(unmatched_total, df_doc2, how='left', left_on='agency_name', right_on='primary_agency_name')

unmatched_est

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
0,Banning,7500,Banning,5214.0
1,Caltrans,7504,Caltrans,6201.0
2,Caltrans,7504,Caltrans,6202.0
3,Caltrans,7504,Caltrans,6203.0
4,Caltrans,7504,Caltrans,6204.0
5,Caltrans,7504,Caltrans,6205.0
6,Caltrans,7504,Caltrans,6206.0
7,Caltrans,7504,Caltrans,6207.0
8,Caltrans,7504,Caltrans,6208.0
9,Caltrans,7504,Caltrans,6209.0


In [75]:
#unmatched_est['primary_agency_locode'] = unmatched_est['primary_agency_locode'].apply(get_num)

In [76]:
unmatched_est

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
0,Banning,7500,Banning,5214.0
1,Caltrans,7504,Caltrans,6201.0
2,Caltrans,7504,Caltrans,6202.0
3,Caltrans,7504,Caltrans,6203.0
4,Caltrans,7504,Caltrans,6204.0
5,Caltrans,7504,Caltrans,6205.0
6,Caltrans,7504,Caltrans,6206.0
7,Caltrans,7504,Caltrans,6207.0
8,Caltrans,7504,Caltrans,6208.0
9,Caltrans,7504,Caltrans,6209.0


In [77]:
#some of these are located in other crosswalks. 
## Ex. "Sgvc" and "Sanbag" are already documented. 
## This crosswalk addressed the agencies that have locodes other than those listed in `agency_locode`. 

In [78]:
no_official = unmatched_est>>filter(_.primary_agency_name.isnull())

In [79]:
no_official

Unnamed: 0,agency_name,agency_locode,primary_agency_name,primary_agency_locode
15,Gold Coast Transit District,7505,,
21,Mendocino,40A0,,
29,Sgvc,7500,,
31,Sutter,7500,,
32,Ventura,7500,,
34,Inyo,24M0,,
37,La Jolla Band Of Luiseno Indians,X075,,
38,Sanbag,499,,
39,Twentynine Palm,NBIL,,


#### Exporting Unmatched NaN values for manipulation in Excel

In [80]:
# will correct these manually in excel 
## exporting to do that

#no_official.to_csv('unmatched_no_officials.csv',index=False)

#### `unmatched_est` to CSV

In [81]:
unmatched_est = unmatched_est[unmatched_est['primary_agency_name'].notna()]

In [82]:
#unmatched_est.to_csv('unmatched_estimate.csv',index=False)