# Data Prep for ATP 

**Duplicate notebook using concat method**

* manual cleaning needed outside of notebook for funding data

In [1]:
import intake
import numpy as np
import pandas as pd
from calitp import to_snakecase
from dla_utils import _dla_utils
from IPython.display import HTML, Markdown
from siuba import *

import altair as alt

import data_cleaning



In [2]:
pd.set_option("display.max_columns",500)

* some `a1_imp_agcy_fed_ma_num` are not present 
*  merge on `project_app_id`


* need function for reading in funding data and which projects get selected for funding
* sheets of Master_AllData that we need:
    * Master_Yes
    * Statewide SUR Details (merge with SUR Funding
    * Statewide SUR Funding (merge with SUR Details) 
* using a copy of the data to account for multiple headers

In [5]:
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/dla/atp/'


## Read in Master Data

In [20]:
# reading the clean data (from atp script)
#df = data_cleaning.read_clean_data()

In [21]:
#df.sample()

In [73]:
def read_app_data():
    """
    Function for reading in the application data. Can then merge with funded data
    """
    # identify information columns that we need to drop
    columns_to_drop = ['a1_imp_agcy_contact','a1_imp_agcy_email','a1_imp_agcy_phone',
                      'a1_proj_partner_contact', 'a1_proj_partner_email', 'a1_proj_partner_phone']
    
    #read in data
    df = to_snakecase(pd.read_excel(f'{GCS_FILE_PATH}Master_AllData_Cycle5_Field_Mapping_COPY.xls',
              sheet_name='Master_Yes',
                                        header=[2]))
    df = df.drop(columns = columns_to_drop)
    
    #drop columns that will contain funding data (this df has the columns but no information for them)
    #we know funding data starts with columns `original_prog__amt___pa_ed_`
    df.drop(df.iloc[:,(df.columns.get_loc('original_prog__amt___pa_ed_')):], inplace=True, axis=1)  
    
    #drop identifier columns that are fully null (these columns are populated in funding data
    df = df.drop(columns={'#', 'atp_id', 'ppno', 'ppno_1'})
    
    return df

In [74]:
master_data = read_app_data()

In [75]:
master_data.head()

Unnamed: 0,awarded,project_cycle,a2_ct_dist,a3_proj_type,project_app_id,a2_info_proj_name,a2_county,a1_locode,a1_imp_agcy_name,a1_imp_agcy_street,a1_imp_agcy_city,a1_imp_agcy_zip,a1_imp_agcy_title,a1_imp_agcy_ma,a1_imp_agcy_state_ma_num,a1_imp_agcy_fed_ma_num,a1_proj_partner_exists,a1_proj_partner_agcy,a1_proj_partner_title,assembly_district,a2_assem_dist_a,a2_assem_dist_b,a2_assem_dist_c,congressional_district,a2_congress_dist_a,a2_congress_dist_b,a2_congress_dist_c,senate_district,a2_senate_dist_a,a2_senate_dist_b,a2_senatedistc,a2_info_proj_descr,a2_info_proj_loc,a2_mop_uza_population,a2_mpo,a2_past_proj,a2_past_proj_qty,a2_proj_lat,a2_proj_long,a2_proj_scope_summary,a2_project_location_map,a2_rtpa,a3_plan_active_trans,a3_plan_active_trans_exists,a3_plan_bicycle,a3_plan_bicycle_exists,a3_plan_ped,a3_plan_ped_exists,a3_plan_srts,a3_plan_srts_exists,a3_st_bicycle_applies,a3_st_bicycle_pct,a3_st_num_schools,a3_st_ped_applies,a3_st_ped_pct,a3_st_srts,a3_trail_elig_cost,a3_trail_fed_funding,a3_trail_trans_pct,a3_current_plan,a3_trails,a3_plan_none,a3_plan_other,a3_plan_other_desc,a2_output_outcome,b_sig_inter_new_bike_boxes,b_class_1,b_class_2,b_class_3,b_class_4,a4_bike_gap_pct,b_light_intersection,b_mid_block_new_rrfb_signal,b_mid_block_surf_improv,b_bsp_new_bikes,b_bike_new_secured_lockers,b_bike_new_racks,b_bsp_new_station,b_other_bike_improv_1,b_other_bike_improv_qty_1,b_other_bike_improv_2,b_other_bike_improv_qty_2,b_light_rdwy_seg,b_sig_inter_timing_improv,b_un_sig_new_rrfb_signal,b_un_sig_cross_surf_improv,a4_easement_support,m_cls_1_trails_widen_recon_exist,m_cls_1_trails_new__less_than_9,m_cls_1_trails_new_over_9,m_non_cls_trails_new,m_other_trail_imprv_1,m_other_trail_improv_qty_1,m_other_trail_imprv_2,m_other_trail_improv_qty_2,m_non_cls_widen_recon_exist,p_amenities_bench,a4_ped_gap_pct,p_mid_block_cross_new_rrfb_signal,p_light_intersection,p_lighting_rdwy_seg,p_mid_block_cross_surf_improv,p_new_ada_ramp,p_sidewlks_new_barrier_protect,p_sidewlks_new_4_to_8,p_sidewlks_new_over_8,p_other_ped_imprv_1,p_other_ped_qty_1,p_other_ped_imprv_2,p_other_ped_qty_2,p_reconstruct_ramp_to_ada_stand,p_sidewlks_reconstruct_enhance_exist,p_sig_inter_enhance_exist_crosswlk,p_sig_inter_new_crosswlk,p_sig_inter_ped_heads,p_sig_inter_shorten_cross,p_sig_inter_timing_improv,p_amenities_trash_can,p_amenities_shade_tree,p_amenities_shade_tree_type,p_un_sig_inter_new_traff_sig,p_un_sig_inter_new_roundabout,p_un_sig_inter_new_rrfb_sig,p_un_sig_inter_shorten_cross,p_un_sig_inter_cross_surface_improv,p_sidewlks_widen_existing,a4_row_100,a4_row_gov_ease,a4_row_private_ease,v_other_traffic_calming_imprv_1,v_speed_feedback_signs,v_other_traffic_calming_qty_1,v_other_traffic_calming_imprv_2,v_other_traffic_calming_qty_2,v_remove_right_turn_pocket,v_remove_travel_ln,v_sig_inter_new_roundabout,v_sig_inter_timing_improv,v_un_sig_inter_new_traf_sig,v_un_sig_inter_new_roundabout,a4_reg_init,a4_reg_init_pct,a4_com_init,a4_com_init_pct,a4_safe_route,a4_safe_route_pct,a4_fl_mile,a4_fl_mile_pct,a4_emp_based,a4_emp_based_pct,a4_other_ni,a4_other_ni_descr,a4_other_ni_pct,a4_wb_audits,a4_bike_classes,a4_ped_classes,a4_demo_events,a4_com_enc,a4_le_methods,a4_com_meetings,a4_classrooms,a4_school_assem,a4_after_school,a4_bike_rodeos,a4_mock_cities,a4_walk_bus,a4_bike_train,a4_com_challenges,a4_srts_enc,a4_srts_le,a4_srts_training,a4_act_other_1,a4_act_other_1_descr,a4_act_other_2,a4_act_other_2_decr,a4_comm_trad_media,a4_comm_large_media,a4_comm_print,a4_comm_social,a4_comm_web,a4_comm_other,a4_comm_other_descr,a4_comm_language,a4_collab_pub_health,a4_collab_le,a4_collab_non_profit,a4_collab_schools,a4_collab_pub_works,a4_collab_other,a4_colab_other_descr,a4_plan_ped,a4_plan_bike,a4_plan_atp,a4_plan_school_routes,a4_row_open_street_demo
0,N,5,10,Infrastructure - Small,10-Merced County-1,Planada Sidewalk Infill Project,MER,5939,Merced County,345 west 7th street,Merced,95340,Deputy Director,Yes,00033S,10-5939R,No,,,21,21,,,16,16,,,12,12,,,"PA&ED, PS&E, and CON funding for construction ...",1) South side of Haskell Ave from Cody ave to ...,Project is located outside one of the ten larg...,MCAG,No,0,37.29,120.31,The Planada Sidewalk Infill Project is located...,,,,No,,Yes,,Yes,,No,Yes,20,1,Yes,80,Yes,0,,0,No,No,No,No,,Sidewalk infill along portions of Haskell aven...,0,0,0,1500,0,,0,0,0,0,0,0,0,,0,,0,0,0,0,0,,0,0,0,0,,0,,0,0,0,0,0,0,0,0,6,0,1500,0,,0,,0,5,0,4,3,0,0,0,0,0,,0,0,0,0,0,0,No,No,Yes,,0,0,,0,0,0,0,0,0,0,N,0,N,0,N,0,N,0,,0,N,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,N,N,N,N,N,N,,,N,N,N,N,N,N,,N,N,N,N,No
1,N,5,12,Infrastructure - Medium,"12-Santa Ana, City of-4",Bishop Street Bicycle Boulevard Project,ORA,5063,"Santa Ana, City of","20 Civic Center Plaza, M-43",Santa Ana,92702,Senior Civil Engineer,Yes,00289S,12-5063,No,,,69,69,,,46,46,,,34,34,,,Bishop Street Class 3 Bicycle Boulevard with T...,Bishop Street from Flower Street to Standard A...,Project is located within one of the ten large...,SCAG,Yes,2,33.74,117.86,This project will implement a Class 3 bicycle ...,,,,Yes,,Yes,,No,,Yes,Yes,50,0,Yes,50,No,0,,0,Yes,No,No,No,,"Install 1.15 mile bike boulevard, construction...",0,0,0,6336,0,,0,0,0,0,0,0,0,,0,,0,0,2,0,0,,0,0,0,0,,0,,0,0,0,100,0,0,0,0,0,0,0,0,,0,,0,38,0,15,16,0,18,3,0,0,,1,6,0,18,0,0,Yes,No,No,,0,0,,0,0,8800,0,0,0,0,N,0,N,0,N,0,N,0,,0,N,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,N,N,N,N,N,N,,,N,N,N,N,N,N,,N,N,N,N,No
2,N,5,4,Infrastructure - Small,"4-Pacifica, City of-1",Palmetto Ave - Esplanade Ave Bicycle & Pedestr...,SM,5350,"Pacifica, City of",151 Milagra Drive,City of Pacifica,94044,Associate Civil Engineer,Yes,,04-5350-F15,No,,,22,22,,,14,14,,,13,13,,,CON funding for installing bicycling facilitie...,On Palmetto Ave between Paloma Ave and West Av...,Project is located outside one of the ten larg...,MTC,No,0,37.65,-122.49,The project will install a combination of Clas...,,,,No,,Yes,,Yes,,No,Yes,50,2,Yes,50,No,0,,0,Yes,No,0,0,,Bicycling and pedestrian amenities will be ins...,0,0,13752,5748,0,,0,0,0,0,0,0,0,,0,,0,0,0,1,0,,0,0,0,0,,0,,0,0,0,40,2,0,0,0,20,0,0,0,,0,,0,9,0,0,0,0,0,0,0,0,,0,0,0,0,0,0,Yes,No,No,,0,0,,0,0,0,0,0,0,0,N,0,N,0,N,0,N,0,,0,N,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,N,N,N,N,N,N,,,N,N,N,N,N,N,,N,N,N,N,No
3,N,5,12,Infrastructure - Large,"12-Santa Ana, City of-13",Jefferson ES_Thorpe Fundamental_McFadden Int_G...,ORA,5063,"Santa Ana, City of","20 Civic Center Plaza, M-43",Santa Ana,92702,Senior Civil Engineer,Yes,00289S,12-5063,No,,,69,69,,,46,46,,,34,34,,,Pedestrian traffic safety improvements for Jef...,"In the City of Santa Ana, the safe routes to s...",Project is located within one of the ten large...,SCAG,Yes,2,33.71,117.89,"This project will be repairing, replacing and ...",,,,Yes,,Yes,,No,,Yes,No,0,5,Yes,100,Yes,0,,0,Yes,No,No,No,,"Construct curb extensions at 8 intersections, ...",0,0,0,0,0,,0,0,0,0,0,0,0,,0,,0,0,0,0,0,,0,0,0,0,,0,,0,0,0,50,0,0,0,0,60,0,0,0,Left Turn Arrow,3,Enhanced Crosswalk Unsignalized,3,218,1000,7,0,0,1,0,0,0,,0,0,0,7,0,0,Yes,No,No,,0,0,,0,0,0,0,0,0,0,N,0,N,0,N,0,N,0,,0,N,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,N,N,N,N,N,N,,,N,N,N,N,N,N,,N,N,N,N,No
4,N,5,12,Infrastructure - Large,"12-Santa Ana, City of-14",Lathrop Intermediate_Lowell ES_Martin ES_Pio P...,ORA,5063,"Santa Ana, City of","20 Civic Center Plaza, M-43",Santa Ana,92702,Senior Civil Engineer,Yes,00289S,12-5063,No,,,69,69,,,46,46,,,34,34,,,Pedestrian traffic safety improvements for La...,"In the City of Santa Ana, the safe routes to s...",Project is located within one of the ten large...,SCAG,Yes,4,33.73,117.87,"This project will be repairing, replacing and ...",,,,Yes,,Yes,,No,,Yes,No,0,5,Yes,100,Yes,0,,0,Yes,No,No,No,,"Construct curb extensions at 6 intersections, ...",0,0,0,0,0,,0,0,0,0,0,0,0,,0,,0,0,0,0,0,,0,0,0,0,,0,,0,0,0,50,0,0,0,0,43,0,0,0,Enhance crosswalk (unsignalized),7,Raised Crosswalk,2,189,3455,5,0,0,1,0,0,0,,0,0,2,5,0,0,Yes,No,No,,0,0,,0,0,0,0,0,0,0,N,0,N,0,N,0,N,0,,0,N,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,N,N,N,N,N,N,,,N,N,N,N,N,N,,N,N,N,N,No


## Function to read SUR funding data

In [16]:
def read_SUR_funding_data():
    """
    Function to read in ATP funding data. Function will need to change for future data.
    Notes:
    * `atp_id` columns appear the same but the sur_details has an extra zero in the middle of the string so it would not match
    * `a3_project_type` also is entered differently however, details has more details than the funding sheet. Has information on size of project. can add to new column
    * `a1_imp_agcy_name_x` has manual errors so selecting `a1_imp_agcy_name_y`
    """
    # identify information columns that we need to drop
    columns_to_drop = ['a1_imp_agcy_contact','a1_imp_agcy_email','a1_imp_agcy_phone',
                      'a1_proj_partner_contact', 'a1_proj_partner_email', 'a1_proj_partner_phone']
    #read in SUR details and SUR funding data
    sur_details = to_snakecase(pd.read_excel(f'{GCS_FILE_PATH}Master_AllData_Cycle5_Field_Mapping_COPY.xls',
              sheet_name='Statewide SUR Details'))
    sur_details = sur_details.drop(columns = columns_to_drop)
    
    sur_funding = to_snakecase(pd.read_excel(f'{GCS_FILE_PATH}Master_AllData_Cycle5_Field_Mapping_COPY.xls',
              sheet_name='Statewide SUR Funding'))
    
    #drop the last few columns of SUR Details that have no funding data entered, but have columns
    sur_details.drop(sur_details.iloc[:,199:], inplace=True, axis=1)
    
    #remove rows with all null values
    cols_to_check = sur_funding.columns
    sur_funding['is_na'] = sur_funding[cols_to_check].isnull().apply(lambda x: all(x), axis=1) 
    sur_funding = sur_funding>>filter(_.is_na==False)
    sur_funding = sur_funding.drop(columns={'is_na'})

    #delete rows identified that are not part of the data (informational cells) or a sum total for all entries
    delete_row = sur_funding[sur_funding["project_cycle"]== 'Added Field not from App'].index
    sur_funding = sur_funding.drop(delete_row)
    
    delete_row = sur_funding[sur_funding["total_project_cost"]== '370,984,000.00'].index
    sur_funding = sur_funding.drop(delete_row)
    
    #merge sur_funding and sur_details
    merge_on = ['project_app_id', 'project_cycle', 'a2_ct_dist', 'a1_locode']
    df = (pd.merge(sur_details, sur_funding, how="outer", on = merge_on, indicator=True))
    
    #keep entries that merge. Right_only rows are misentered and more informational columns  
    df = df>>filter(_._merge=='both')
    
    # filling the null values for some of the duplicate columns
    # manually checking that values are the same as of now- will add function to check when we get the data links
    df['awarded_x'] = df['awarded_x'].fillna(df['awarded_y'])
    df['ppno_y'] = df['ppno_y'].fillna(df['ppno_x'])
    
    #renaming and dropping duplicate columns 
    ## a1_imp_agcy_name_x has manual errors so selecting a1_imp_agcy_name_y
    df = df.rename(columns={'awarded_x':'awarded',
                                'ppno_y':'ppno',
                                'a1_imp_agcy_name_y':'a1_imp_agcy_name',
                                'a2_info_proj_name_y':'a2_info_proj_name'
                               })
    df = df.drop(columns={'awarded_y', 'a1_imp_agcy_name_x', 'a2_info_proj_name_x','ppno_x', '_merge'})
    
    return df

In [17]:
funded = read_SUR_funding_data()

In [18]:
len(funded)

49

In [51]:
funded.head()

Unnamed: 0,awarded,project_cycle,a2_ct_dist,#,atp_id_x,ppno_1,a3_proj_type_x,project_app_id,a2_county,a1_locode,a1_imp_agcy_street,a1_imp_agcy_city,a1_imp_agcy_zip,a1_imp_agcy_title,a1_imp_agcy_ma,a1_imp_agcy_state_ma_num,a1_imp_agcy_fed_ma_num,a1_proj_partner_exists,a1_proj_partner_agcy,a1_proj_partner_title,assembly_district,a2_assem_dist_a,a2_assem_dist_b,a2_assem_dist_c,congressional_district,a2_congress_dist_a,a2_congress_dist_b,a2_congress_dist_c,senate_district,a2_senate_dist_a,a2_senate_dist_b,a2_senatedistc,a2_info_proj_descr,a2_info_proj_loc,a2_mop_uza_population,a2_mpo,a2_past_proj,a2_past_proj_qty,a2_proj_lat,a2_proj_long,a2_proj_scope_summary,a2_project_location_map,a2_rtpa,a3_plan_active_trans,a3_plan_active_trans_exists,a3_plan_bicycle,a3_plan_bicycle_exists,a3_plan_ped,a3_plan_ped_exists,a3_plan_srts,a3_plan_srts_exists,a3_st_bicycle_applies,a3_st_bicycle_pct,a3_st_num_schools,a3_st_ped_applies,a3_st_ped_pct,a3_st_srts,a3_trail_elig_cost,a3_trail_fed_funding,a3_trail_trans_pct,a3_current_plan,a3_trails,a3_plan_none,a3_plan_other,a3_plan_other_desc,a2_output_outcome,b_sig_inter_new_bike_boxes,b_class_1,b_class_2,b_class_3,b_class_4,a4_bike_gap_pct,b_light_intersection,b_mid_block_new_rrfb_signal,b_mid_block_surf_improv,b_bsp_new_bikes,b_bike_new_secured_lockers,b_bike_new_racks,b_bsp_new_station,b_other_bike_improv_1,b_other_bike_improv_qty_1,b_other_bike_improv_2,b_other_bike_improv_qty_2,b_light_rdwy_seg,b_sig_inter_timing_improv,b_un_sig_new_rrfb_signal,b_un_sig_cross_surf_improv,a4_easement_support,m_cls_1_trails_widen_recon_exist,m_cls_1_trails_new__less_than_9,m_cls_1_trails_new_over_9,m_non_cls_trails_new,m_other_trail_imprv_1,m_other_trail_improv_qty_1,m_other_trail_imprv_2,m_other_trail_improv_qty_2,m_non_cls_widen_recon_exist,p_amenities_bench,a4_ped_gap_pct,p_mid_block_cross_new_rrfb_signal,p_light_intersection,p_lighting_rdwy_seg,p_mid_block_cross_surf_improv,p_new_ada_ramp,p_sidewlks_new_barrier_protect,p_sidewlks_new_4_to_8,p_sidewlks_new_over_8,p_other_ped_imprv_1,p_other_ped_qty_1,p_other_ped_imprv_2,p_other_ped_qty_2,p_reconstruct_ramp_to_ada_stand,p_sidewlks_reconstruct_enhance_exist,p_sig_inter_enhance_exist_crosswlk,p_sig_inter_new_crosswlk,p_sig_inter_ped_heads,p_sig_inter_shorten_cross,p_sig_inter_timing_improv,p_amenities_trash_can,p_amenities_shade_tree,p_amenities_shade_tree_type,p_un_sig_inter_new_traff_sig,p_un_sig_inter_new_roundabout,p_un_sig_inter_new_rrfb_sig,p_un_sig_inter_shorten_cross,p_un_sig_inter_cross_surface_improv,p_sidewlks_widen_existing,agency_fully_own_r_w,"require_r_w_easement,_from_gov",_require_rw_easement_from_private,v_other_traffic_calming_imprv_1,v_speed_feedback_signs,v_other_traffic_calming_qty_1,v_other_traffic_calming_imprv_2,v_other_traffic_calming_qty_2,v_remove_right_turn_pocket,v_remove_travel_ln,v_sig_inter_new_roundabout,v_sig_inter_timing_improv,v_un_sig_inter_new_traf_sig,v_un_sig_inter_new_roundabout,a4_reg_init,a4_reg_init_pct,a4_com_init,a4_com_init_pct,a4_safe_route,a4_safe_route_pct,a4_fl_mile,a4_fl_mile_pct,a4_emp_based,a4_emp_based_pct,a4_other_ni,a4_other_ni_descr,a4_other_ni_pct,a4_wb_audits,a4_bike_classes,a4_ped_classes,a4_demo_events,a4_com_enc,a4_le_methods,a4_com_meetings,a4_classrooms,a4_school_assem,a4_after_school,a4_bike_rodeos,a4_mock_cities,a4_walk_bus,a4_bike_train,a4_com_challenges,a4_srts_enc,a4_srts_le,a4_srts_training,a4_act_other_1,a4_act_other_1_descr,a4_act_other_2,a4_act_other_2_decr,a4_comm_trad_media,a4_comm_large_media,a4_comm_print,a4_comm_social,a4_comm_web,a4_comm_other,a4_comm_other_descr,a4_comm_language,a4_collab_pub_health,a4_collab_le,a4_collab_non_profit,a4_collab_schools,a4_collab_pub_works,a4_collab_other,a4_colab_other_descr,a4_plan_ped,a4_plan_bike,a4_plan_atp,a4_plan_school_routes,a4_row_open_street_demo,project_status,solicitation_abv,solicitation,soliciting_agency,atp_id_y,ppno,ppno1,oversight_id,a3_proj_type_y,project_size,total_project_cost,a1_imp_agcy_name,a2_info_proj_name,original_prog__amt___pa_ed_,orig__prog__year__pa_ed_,original_prog__amt___ps_e_,orig__prog__year__ps_e_,original_prog__amt___rw_,orig__prog__year__rw_,orignal_prog__amt___con_,orig__prog__year__con_,original_prog__amt___con_ni_,orig__prog__year__con_ni_,total,_2122_pa_ed,_2122_ps_e,_2122_rw,_2122_con,_2122_con_ni,_2122_total,fund_year_1,_2223_pa_ed,_2223_ps_e,_2223_rw,_2223_con,_2223_con_ni,_2223_total,fund_year_2,_2324_pa_ed,_2324_ps_e,_2324_rw,_2324_con,_2324_con_ni,_2324_total,fund_year_3,_2425_pa_ed,_2425_ps_e,_2425_rw,_2425_con,_2425_con_ni,_2425_total,fund_year_4,total_atp_$,match?,unnamed:_59,paed,ps_e_,rw,con,con_ni,total_atp_,unnamed:_66,year__pa_ed_,year__ps_e_,year__rw_,year__con_,year__con_ni_
0,Y,5.0,5.0,,ATP5-05-001R,,Infrastructure - Small,"5-Santa Barbara, City of-3",SB,5007,630 Garden Street,Santa Barbara,93101.0,Principal Transportation Engineer,Yes,00167S,05-5007F15,No,,,37,0.0,3.0,7.0,24,0.0,2.0,4.0,19,0.0,1.0,9.0,Design and construct buffered bike lanes on De...,On De La Vina Street from Alamar Avenue to Pad...,Project is located outside one of the ten larg...,SBCAG,No,0.0,34.43,-119.72,-Existing Conditions-\r\rDe La Vina Street had...,,,,No,,Yes,,Yes,,No,Yes,50.0,3.0,Yes,50.0,No,0.0,,,Yes,,0,0,,Install .65 miles of new Class II buffered bik...,,,3475.0,4965.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,210.0,,,,,,21.0,,,,,,,,,,,,,6.0,,,X,,,,,,,,,,,,,,N,0.0,N,0.0,N,0.0,N,0.0,,0.0,N,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,N,N,N,N,N,N,,,N,N,N,N,N,N,,N,N,N,N,No,Active,R,SMALL URBAN & RURAL SOLICITATION-CYCLE 5,CALTRANS,ATP5-5-001R,3057,,Standard,Infrastructure,Small,1998000.0,"Santa Barbara, City of",Upper De La Vina Street Gap Closure and Safe C...,290000.0,21/22,29000.0,23/24,8000.0,23/24,1671000.0,24/25,,,1998000.0,290000.0,,,,,290000.0,21/22,,,,,,0.0,22/23,,29000.0,8000.0,,,37000.0,23/24,,,,1671000.0,,1671000.0,24/25,1998000.0,YES,YES,290.0,29.0,8.0,1671.0,0.0,1998.0,1998000.0,21/22,23/24,23/24,24/25,
1,Y,5.0,7.0,,ATP5-07-002S,,Infrastructure - Small,"7-South El Monte, City of-1",LA,5352,1415 Santa Anita Avenue,South El Monte,91733.0,Project Manager,Yes,00054S,07-5352,No,,,57,57.0,,,38,38.0,,,22,22.0,,,This project focuses on school and pedestrian ...,The project is fully in the City of South El M...,Project is located within one of the ten large...,SCAG,No,0.0,34.05,118.05,This project focuses on school and pedestrian ...,,,,No,,No,,No,,Yes,No,0.0,7.0,Yes,100.0,Yes,0.0,,,Yes,,0,0,,Installation of pedestrian safety upgrades at ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100.0,1.0,,,2.0,,,,,LED Blinding Stop Signs,13.0,Speed Radar Feedback Signs,5.0,,,9.0,,8.0,,,,,,,,8.0,,32.0,,X,,,,,,,,,,,,,,N,0.0,N,0.0,N,0.0,N,0.0,,0.0,N,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,N,N,N,N,N,N,,,N,N,N,N,N,N,,N,N,N,N,No,Active,S,STATEWIDE SOLICITATION-CYCLE 5,CALTRANS,ATP5-7-002S,5858,,Standard,Infrastructure,Small,1637000.0,"South El Monte, City of",South El Monte Safe Routes to School Pedestria...,10000.0,21/22,130000.0,22/23,,,1497000.0,23/24,,,1637000.0,10000.0,,,,,10000.0,21/22,,130000.0,,,,130000.0,22/23,,,,1497000.0,,1497000.0,23/24,,,,,,0.0,24/25,1637000.0,YES,YES,10.0,130.0,0.0,1497.0,0.0,1637.0,1637000.0,21/22,22/23,,23/24,
2,Y,5.0,4.0,,ATP5-04-003S,2343B,Infrastructure + NI - Large,"4-Fairfield, City of-1",SOL,5132,1000 Webster Street,Fairfield,94533.0,"Asst Director of PW, City Engineer",Yes,,04-5132R,No,,,11,11.0,,,3,3.0,,,3,3.0,,,Construct Class IV separated bikeways with Cla...,West Texas Street between Beck Avenue and Penn...,Project is located within one of the ten large...,MTC,No,0.0,38.25,-122.06,"Located adjacent to downtown Fairfield, this p...",,,,Yes,,Yes,,Yes,,Yes,Yes,38.0,2.0,Yes,62.0,Yes,0.0,,,Yes,,0,0,Heart of Fairfield Specific Plan,Road diet replacing travel lanes with Class IV...,12.0,,3120.0,,5720.0,,3.0,,,,,,,,,,,4090.0,5.0,,,,,,,,,,,,,,,,,,,3.0,4020.0,,,,,,,27.0,2100.0,14.0,2.0,32.0,2.0,5.0,,,,1.0,,1.0,2.0,4.0,6140.0,,,X,,,,,,,7164.0,,5.0,1.0,,N,0.0,N,0.0,Y,100.0,N,0.0,,0.0,N,,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,4.0,Mobile Bike Repair Events,8.0,Evaluation Time Periods - 2 each hand tallies ...,Y,N,Y,Y,N,N,,,N,N,N,N,N,Y,Solano Transportation Authority (STA) is the C...,N,N,N,N,No,Active,S,STATEWIDE SOLICITATION-CYCLE 5,CALTRANS,ATP5-4-003S,2343A,2343B,Standard,Combined (IF and NI),Large,16922000.0,"Fairfield, City of",West Texas Street Complete Streets Project,,,838000.0,22/23,,,9948000.0,23/24,117000.0,22/23,10903000.0,,,,,,0.0,21/22,,838000.0,,,117000.0,955000.0,22/23,,,,9948000.0,,9948000.0,23/24,,,,,,0.0,24/25,10903000.0,YES,YES,0.0,838.0,0.0,9948.0,117.0,10903.0,10903000.0,,22/23,,23/24,22/23
3,Y,5.0,5.0,,ATP5-05-004S,3058B,Infrastructure + NI - Large,"5-Santa Cruz, City of-2",SCR,5025,809 Center St,Santa Cruz,95060.0,Senior Engineer,Yes,00244S,05-5025R,No,,,29,29.0,,,"18, 20",18.0,20.0,,17,17.0,,,Construction of .8 miles of Segment 7 of the R...,Adjacent to the Santa Cruz Branch Rail Line be...,Project is located outside one of the ten larg...,AMBAG,Yes,2.0,36.96,-122.03,The project will close a .8 mile gap in the Ra...,,SCCRTC,,Yes,,No,,No,,Yes,Yes,50.0,1.0,Yes,50.0,Yes,0.0,Yes,100.0,Yes,X,No,No,,Construction of .8 miles of Segment 7 of the R...,,,,,,,,,,,,,,,,,,,,,,,,,4172.0,,Wayfinding signage,40.0,Lighting,45.0,,,100.0,,,,,,,,,,,,,,,,,,,,,38.0,Willow and heritage,,,,,,,,,X,,,,,,,,,,,,N,0.0,N,30.0,N,70.0,N,0.0,,0.0,N,,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,24.0,4.0,0.0,24.0,12.0,0.0,0.0,0.0,144.0,0.0,6.0,20.0,Parent education classes on bike and pedestria...,8.0,Group family rides,Y,N,Y,Y,Y,N,,Spanish,Y,Y,Y,Y,Y,N,,N,N,N,N,No,Active,S,STATEWIDE SOLICITATION-CYCLE 5,CALTRANS,ATP5-5-004S,3058A,,Standard,Combined (IF and NI),Large,12030000.0,"Santa Cruz, City of",Santa Cruz Rail Trail Segment 7 Phase 2 Constr...,,,,,,,8634000.0,21/22,550000.0,21/22,9184000.0,,,,8634000.0,550000.0,9184000.0,21/22,,,,,,0.0,22/23,,,,,,0.0,23/24,,,,,,0.0,24/25,9184000.0,YES,YES,0.0,0.0,0.0,8634.0,550.0,9184.0,9184000.0,,,,21/22,21/22
4,Y,5.0,11.0,,ATP5-11-005S,,Infrastructure + NI - Small,"11-Oceanside, City of-1",SD,5079,300 N Coast Highway,Oceanside,92024.0,Active Transportation and Micromobility Coordi...,Yes,00369S,11-5079R,No,,,76,76.0,,,49,49.0,,,36,36.0,,,The Laurel Elementary SRTS includes infrastruc...,The project is located in the Eastside communi...,Project is located within one of the ten large...,SANDAG,No,0.0,33.21,-117.37,Laurel Elementary Safe Routes to School (SRTS)...,,,,No,,Yes,,Yes,,No,Yes,10.0,1.0,Yes,90.0,Yes,0.0,,,No,,No,No,,Infrastructure improvements of a mini roundabo...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,25.0,2.0,,,1.0,29.0,,868.0,,traffic circle and pavement markings,1.0,,,,,,,,,,,,,,1.0,,1.0,12.0,,X,,,traffic calming median on San Diego,,1.0,,,,,,,,,N,0.0,N,0.0,Y,100.0,N,0.0,,0.0,N,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0,0.0,2.0,0.0,0.0,12.0,Stakeholder Meetings,2.0,Crossing Guard Trainings,Y,N,Y,Y,Y,N,,Spanish,Y,Y,Y,Y,Y,N,,N,N,N,N,No,Active,S,STATEWIDE SOLICITATION-CYCLE 5,CALTRANS,ATP5-11-005S,1442,,Standard,Combined (IF and NI),Small,1535000.0,"Oceanside, City of",Laurel Elementary Safe Routes to School,160000.0,21/22,160000.0,22/23,,,1075000.0,23/24,127000.0,21/22,1522000.0,160000.0,,,,127000.0,287000.0,21/22,,160000.0,,,,160000.0,22/23,,,,1075000.0,,1075000.0,23/24,,,,,,0.0,24/25,1522000.0,YES,YES,160.0,160.0,0.0,1075.0,127.0,1522.0,1522000.0,21/22,22/23,,23/24,21/22


In [54]:
import re

In [57]:
dcolumns1 = [col for col in funded.columns if isinstance(col, str) and re.match('.*_x', col)]

In [58]:
dcolumns2 = [col for col in funded.columns if isinstance(col, str) and re.match('.*_y', col)]

In [59]:
dcolumns1

['atp_id_x', 'a3_proj_type_x']

In [60]:
dcolumns2

['atp_id_y',
 'a3_proj_type_y',
 'orig__prog__year__pa_ed_',
 'orig__prog__year__ps_e_',
 'orig__prog__year__rw_',
 'orig__prog__year__con_',
 'orig__prog__year__con_ni_',
 'fund_year_1',
 'fund_year_2',
 'fund_year_3',
 'fund_year_4']

In [61]:
funded>>select(_.atp_id_x, _.atp_id_y,
               _.a3_proj_type_x, _.a3_proj_type_y)

Unnamed: 0,atp_id_x,atp_id_y,a3_proj_type_x,a3_proj_type_y
0,ATP5-05-001R,ATP5-5-001R,Infrastructure - Small,Infrastructure
1,ATP5-07-002S,ATP5-7-002S,Infrastructure - Small,Infrastructure
2,ATP5-04-003S,ATP5-4-003S,Infrastructure + NI - Large,Combined (IF and NI)
3,ATP5-05-004S,ATP5-5-004S,Infrastructure + NI - Large,Combined (IF and NI)
4,ATP5-11-005S,ATP5-11-005S,Infrastructure + NI - Small,Combined (IF and NI)
5,ATP5-06-006S,ATP5-6-006S,Infrastructure + NI - Small,Combined (IF and NI)
6,ATP5-10-007S,ATP5-10-007S,Infrastructure - Small,Infrastructure
7,ATP5-05-008S,ATP5-5-008S,Infrastructure - Large,Infrastructure
8,ATP5-02-009R,ATP5-2-009R,Infrastructure - Large,Infrastructure
9,ATP5-06-010S,ATP5-6-010S,Infrastructure + NI - Small,Combined (IF and NI)


## Merging with Master_Data

In [77]:
master_data.sample()

Unnamed: 0,awarded,project_cycle,a2_ct_dist,a3_proj_type,project_app_id,a2_info_proj_name,a2_county,a1_locode,a1_imp_agcy_name,a1_imp_agcy_street,a1_imp_agcy_city,a1_imp_agcy_zip,a1_imp_agcy_title,a1_imp_agcy_ma,a1_imp_agcy_state_ma_num,a1_imp_agcy_fed_ma_num,a1_proj_partner_exists,a1_proj_partner_agcy,a1_proj_partner_title,assembly_district,a2_assem_dist_a,a2_assem_dist_b,a2_assem_dist_c,congressional_district,a2_congress_dist_a,a2_congress_dist_b,a2_congress_dist_c,senate_district,a2_senate_dist_a,a2_senate_dist_b,a2_senatedistc,a2_info_proj_descr,a2_info_proj_loc,a2_mop_uza_population,a2_mpo,a2_past_proj,a2_past_proj_qty,a2_proj_lat,a2_proj_long,a2_proj_scope_summary,a2_project_location_map,a2_rtpa,a3_plan_active_trans,a3_plan_active_trans_exists,a3_plan_bicycle,a3_plan_bicycle_exists,a3_plan_ped,a3_plan_ped_exists,a3_plan_srts,a3_plan_srts_exists,a3_st_bicycle_applies,a3_st_bicycle_pct,a3_st_num_schools,a3_st_ped_applies,a3_st_ped_pct,a3_st_srts,a3_trail_elig_cost,a3_trail_fed_funding,a3_trail_trans_pct,a3_current_plan,a3_trails,a3_plan_none,a3_plan_other,a3_plan_other_desc,a2_output_outcome,b_sig_inter_new_bike_boxes,b_class_1,b_class_2,b_class_3,b_class_4,a4_bike_gap_pct,b_light_intersection,b_mid_block_new_rrfb_signal,b_mid_block_surf_improv,b_bsp_new_bikes,b_bike_new_secured_lockers,b_bike_new_racks,b_bsp_new_station,b_other_bike_improv_1,b_other_bike_improv_qty_1,b_other_bike_improv_2,b_other_bike_improv_qty_2,b_light_rdwy_seg,b_sig_inter_timing_improv,b_un_sig_new_rrfb_signal,b_un_sig_cross_surf_improv,a4_easement_support,m_cls_1_trails_widen_recon_exist,m_cls_1_trails_new__less_than_9,m_cls_1_trails_new_over_9,m_non_cls_trails_new,m_other_trail_imprv_1,m_other_trail_improv_qty_1,m_other_trail_imprv_2,m_other_trail_improv_qty_2,m_non_cls_widen_recon_exist,p_amenities_bench,a4_ped_gap_pct,p_mid_block_cross_new_rrfb_signal,p_light_intersection,p_lighting_rdwy_seg,p_mid_block_cross_surf_improv,p_new_ada_ramp,p_sidewlks_new_barrier_protect,p_sidewlks_new_4_to_8,p_sidewlks_new_over_8,p_other_ped_imprv_1,p_other_ped_qty_1,p_other_ped_imprv_2,p_other_ped_qty_2,p_reconstruct_ramp_to_ada_stand,p_sidewlks_reconstruct_enhance_exist,p_sig_inter_enhance_exist_crosswlk,p_sig_inter_new_crosswlk,p_sig_inter_ped_heads,p_sig_inter_shorten_cross,p_sig_inter_timing_improv,p_amenities_trash_can,p_amenities_shade_tree,p_amenities_shade_tree_type,p_un_sig_inter_new_traff_sig,p_un_sig_inter_new_roundabout,p_un_sig_inter_new_rrfb_sig,p_un_sig_inter_shorten_cross,p_un_sig_inter_cross_surface_improv,p_sidewlks_widen_existing,a4_row_100,a4_row_gov_ease,a4_row_private_ease,v_other_traffic_calming_imprv_1,v_speed_feedback_signs,v_other_traffic_calming_qty_1,v_other_traffic_calming_imprv_2,v_other_traffic_calming_qty_2,v_remove_right_turn_pocket,v_remove_travel_ln,v_sig_inter_new_roundabout,v_sig_inter_timing_improv,v_un_sig_inter_new_traf_sig,v_un_sig_inter_new_roundabout,a4_reg_init,a4_reg_init_pct,a4_com_init,a4_com_init_pct,a4_safe_route,a4_safe_route_pct,a4_fl_mile,a4_fl_mile_pct,a4_emp_based,a4_emp_based_pct,a4_other_ni,a4_other_ni_descr,a4_other_ni_pct,a4_wb_audits,a4_bike_classes,a4_ped_classes,a4_demo_events,a4_com_enc,a4_le_methods,a4_com_meetings,a4_classrooms,a4_school_assem,a4_after_school,a4_bike_rodeos,a4_mock_cities,a4_walk_bus,a4_bike_train,a4_com_challenges,a4_srts_enc,a4_srts_le,a4_srts_training,a4_act_other_1,a4_act_other_1_descr,a4_act_other_2,a4_act_other_2_decr,a4_comm_trad_media,a4_comm_large_media,a4_comm_print,a4_comm_social,a4_comm_web,a4_comm_other,a4_comm_other_descr,a4_comm_language,a4_collab_pub_health,a4_collab_le,a4_collab_non_profit,a4_collab_schools,a4_collab_pub_works,a4_collab_other,a4_colab_other_descr,a4_plan_ped,a4_plan_bike,a4_plan_atp,a4_plan_school_routes,a4_row_open_street_demo
303,N,5,1,Infrastructure - Medium,"1-Eureka, City of-1",Bay to Zoo Trail,HUM,5017,"Eureka, City of",531 K Street,Eureka,95501,Traffic Project Manager,Yes,00298S,,No,,,2,2,,,2,2,,,2,2,,,Construct approximately 2 miles of Class 1 bik...,The Bay to Zoo Trail is located in the City of...,Project is located outside one of the ten larg...,Caltrans,Yes,1,40.8,-124.14,The Bay to Zoo Trail creates approximately two...,,Humboldt CAG,,No,,No,,No,,No,Yes,90,1,Yes,10,Yes,0,No,0,Yes,Yes,0,0,CIP / HCOAG - Vroom / Transportation Safety Ac...,Construction of approximately two miles of Cla...,0,9550,0,4580,0,,3,3,1,0,0,0,0,,0,,0,0,0,0,2,,0,0,0,0,,0,,0,0,5,25,0,0,0,0,3,0,1380,0,,0,,0,11,264,0,0,0,0,0,0,0,,0,0,0,6,0,4300,No,No,Yes,,0,0,,0,0,0,0,0,0,0,N,0,N,0,N,0,N,0,,0,N,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,N,N,N,N,N,N,,,N,N,N,N,N,N,,N,N,N,N,No


In [78]:
funded.sample()

Unnamed: 0,awarded,project_cycle,a2_ct_dist,#,atp_id_x,ppno_1,a3_proj_type_x,project_app_id,a2_county,a1_locode,a1_imp_agcy_street,a1_imp_agcy_city,a1_imp_agcy_zip,a1_imp_agcy_title,a1_imp_agcy_ma,a1_imp_agcy_state_ma_num,a1_imp_agcy_fed_ma_num,a1_proj_partner_exists,a1_proj_partner_agcy,a1_proj_partner_title,assembly_district,a2_assem_dist_a,a2_assem_dist_b,a2_assem_dist_c,congressional_district,a2_congress_dist_a,a2_congress_dist_b,a2_congress_dist_c,senate_district,a2_senate_dist_a,a2_senate_dist_b,a2_senatedistc,a2_info_proj_descr,a2_info_proj_loc,a2_mop_uza_population,a2_mpo,a2_past_proj,a2_past_proj_qty,a2_proj_lat,a2_proj_long,a2_proj_scope_summary,a2_project_location_map,a2_rtpa,a3_plan_active_trans,a3_plan_active_trans_exists,a3_plan_bicycle,a3_plan_bicycle_exists,a3_plan_ped,a3_plan_ped_exists,a3_plan_srts,a3_plan_srts_exists,a3_st_bicycle_applies,a3_st_bicycle_pct,a3_st_num_schools,a3_st_ped_applies,a3_st_ped_pct,a3_st_srts,a3_trail_elig_cost,a3_trail_fed_funding,a3_trail_trans_pct,a3_current_plan,a3_trails,a3_plan_none,a3_plan_other,a3_plan_other_desc,a2_output_outcome,b_sig_inter_new_bike_boxes,b_class_1,b_class_2,b_class_3,b_class_4,a4_bike_gap_pct,b_light_intersection,b_mid_block_new_rrfb_signal,b_mid_block_surf_improv,b_bsp_new_bikes,b_bike_new_secured_lockers,b_bike_new_racks,b_bsp_new_station,b_other_bike_improv_1,b_other_bike_improv_qty_1,b_other_bike_improv_2,b_other_bike_improv_qty_2,b_light_rdwy_seg,b_sig_inter_timing_improv,b_un_sig_new_rrfb_signal,b_un_sig_cross_surf_improv,a4_easement_support,m_cls_1_trails_widen_recon_exist,m_cls_1_trails_new__less_than_9,m_cls_1_trails_new_over_9,m_non_cls_trails_new,m_other_trail_imprv_1,m_other_trail_improv_qty_1,m_other_trail_imprv_2,m_other_trail_improv_qty_2,m_non_cls_widen_recon_exist,p_amenities_bench,a4_ped_gap_pct,p_mid_block_cross_new_rrfb_signal,p_light_intersection,p_lighting_rdwy_seg,p_mid_block_cross_surf_improv,p_new_ada_ramp,p_sidewlks_new_barrier_protect,p_sidewlks_new_4_to_8,p_sidewlks_new_over_8,p_other_ped_imprv_1,p_other_ped_qty_1,p_other_ped_imprv_2,p_other_ped_qty_2,p_reconstruct_ramp_to_ada_stand,p_sidewlks_reconstruct_enhance_exist,p_sig_inter_enhance_exist_crosswlk,p_sig_inter_new_crosswlk,p_sig_inter_ped_heads,p_sig_inter_shorten_cross,p_sig_inter_timing_improv,p_amenities_trash_can,p_amenities_shade_tree,p_amenities_shade_tree_type,p_un_sig_inter_new_traff_sig,p_un_sig_inter_new_roundabout,p_un_sig_inter_new_rrfb_sig,p_un_sig_inter_shorten_cross,p_un_sig_inter_cross_surface_improv,p_sidewlks_widen_existing,agency_fully_own_r_w,"require_r_w_easement,_from_gov",_require_rw_easement_from_private,v_other_traffic_calming_imprv_1,v_speed_feedback_signs,v_other_traffic_calming_qty_1,v_other_traffic_calming_imprv_2,v_other_traffic_calming_qty_2,v_remove_right_turn_pocket,v_remove_travel_ln,v_sig_inter_new_roundabout,v_sig_inter_timing_improv,v_un_sig_inter_new_traf_sig,v_un_sig_inter_new_roundabout,a4_reg_init,a4_reg_init_pct,a4_com_init,a4_com_init_pct,a4_safe_route,a4_safe_route_pct,a4_fl_mile,a4_fl_mile_pct,a4_emp_based,a4_emp_based_pct,a4_other_ni,a4_other_ni_descr,a4_other_ni_pct,a4_wb_audits,a4_bike_classes,a4_ped_classes,a4_demo_events,a4_com_enc,a4_le_methods,a4_com_meetings,a4_classrooms,a4_school_assem,a4_after_school,a4_bike_rodeos,a4_mock_cities,a4_walk_bus,a4_bike_train,a4_com_challenges,a4_srts_enc,a4_srts_le,a4_srts_training,a4_act_other_1,a4_act_other_1_descr,a4_act_other_2,a4_act_other_2_decr,a4_comm_trad_media,a4_comm_large_media,a4_comm_print,a4_comm_social,a4_comm_web,a4_comm_other,a4_comm_other_descr,a4_comm_language,a4_collab_pub_health,a4_collab_le,a4_collab_non_profit,a4_collab_schools,a4_collab_pub_works,a4_collab_other,a4_colab_other_descr,a4_plan_ped,a4_plan_bike,a4_plan_atp,a4_plan_school_routes,a4_row_open_street_demo,project_status,solicitation_abv,solicitation,soliciting_agency,atp_id_y,ppno,ppno1,oversight_id,a3_proj_type_y,project_size,total_project_cost,a1_imp_agcy_name,a2_info_proj_name,original_prog__amt___pa_ed_,orig__prog__year__pa_ed_,original_prog__amt___ps_e_,orig__prog__year__ps_e_,original_prog__amt___rw_,orig__prog__year__rw_,orignal_prog__amt___con_,orig__prog__year__con_,original_prog__amt___con_ni_,orig__prog__year__con_ni_,total,_2122_pa_ed,_2122_ps_e,_2122_rw,_2122_con,_2122_con_ni,_2122_total,fund_year_1,_2223_pa_ed,_2223_ps_e,_2223_rw,_2223_con,_2223_con_ni,_2223_total,fund_year_2,_2324_pa_ed,_2324_ps_e,_2324_rw,_2324_con,_2324_con_ni,_2324_total,fund_year_3,_2425_pa_ed,_2425_ps_e,_2425_rw,_2425_con,_2425_con_ni,_2425_total,fund_year_4,total_atp_$,match?,unnamed:_59,paed,ps_e_,rw,con,con_ni,total_atp_,unnamed:_66,year__pa_ed_,year__ps_e_,year__rw_,year__con_,year__con_ni_
18,Y,5.0,1.0,,ATP5-01-019S,,Infrastructure - Medium,"1-Arcata, City of-1",HUM,5021,736 F Street,Arcata,95521.0,City Engineer,Yes,00024S,01-5021R,Yes,County of Humboldt,Deputy Director Public Works,2,2.0,,,2,2.0,,,2,2.0,,,"PA&ED, PS&E, ROW and CON of 3.3 mile Class I t...",Project is located in northern Arcata along NC...,Project is located outside one of the ten larg...,Caltrans,Yes,3.0,40.9,-124.08,The Arcata Annie & Mary Trail Connectivity Pro...,,Humboldt CAG,,No,,Yes,,Yes,,No,Yes,50.0,0.0,Yes,50.0,No,0.0,Yes,96.0,Yes,X,0,0,Humboldt County Regional Trails Master Plan,"Creation of 3.3 miles of Class I trail, walkwa...",,,2200.0,,,,,,,,,3.0,,,,,,,,,,,,,17424.0,,,,,,,5.0,100.0,,,,,24.0,,,17424.0,,,,,2.0,,,,,,,,,,,,3.0,,1.0,,,X,X,,,,,,,,,,,,N,0.0,N,0.0,N,0.0,N,0.0,,0.0,N,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,N,N,N,N,N,N,,,N,N,N,N,N,N,,N,N,N,N,No,Active,S,STATEWIDE SOLICITATION-CYCLE 5,CALTRANS,ATP5-1-019S,2548,,Standard,Infrastructure,Medium,5286000.0,"Arcata, City of",Arcata Annie & Mary Trail Connectivity Project,67000.0,21/22,240000.0,22/23,255000.0,22/23,3658000.0,23/24,,,4220000.0,67000.0,,,,,67000.0,21/22,,240000.0,255000.0,,,495000.0,22/23,,,,3658000.0,,3658000.0,23/24,,,,,,0.0,24/25,4220000.0,YES,YES,67.0,240.0,255.0,3658.0,0.0,4220.0,4220000.0,21/22,22/23,22/23,23/24,


In [85]:
subset = master_data>>select(_.project_app_id, _.project_cycle, _.a2_ct_dist, _.a1_locode)

In [84]:
merge_on = ['project_app_id', 'project_cycle', 'a2_ct_dist', 'a1_locode']

In [86]:
# will have suplicates for atp_id_x a3_proj_type_x
(pd.merge(subset, funded, how='outer', on= merge_on, indicator='full_merge')).head()

Unnamed: 0,project_app_id,project_cycle,a2_ct_dist,a1_locode,awarded,#,atp_id_x,ppno_1,a3_proj_type_x,a2_county,a1_imp_agcy_street,a1_imp_agcy_city,a1_imp_agcy_zip,a1_imp_agcy_title,a1_imp_agcy_ma,a1_imp_agcy_state_ma_num,a1_imp_agcy_fed_ma_num,a1_proj_partner_exists,a1_proj_partner_agcy,a1_proj_partner_title,assembly_district,a2_assem_dist_a,a2_assem_dist_b,a2_assem_dist_c,congressional_district,a2_congress_dist_a,a2_congress_dist_b,a2_congress_dist_c,senate_district,a2_senate_dist_a,a2_senate_dist_b,a2_senatedistc,a2_info_proj_descr,a2_info_proj_loc,a2_mop_uza_population,a2_mpo,a2_past_proj,a2_past_proj_qty,a2_proj_lat,a2_proj_long,a2_proj_scope_summary,a2_project_location_map,a2_rtpa,a3_plan_active_trans,a3_plan_active_trans_exists,a3_plan_bicycle,a3_plan_bicycle_exists,a3_plan_ped,a3_plan_ped_exists,a3_plan_srts,a3_plan_srts_exists,a3_st_bicycle_applies,a3_st_bicycle_pct,a3_st_num_schools,a3_st_ped_applies,a3_st_ped_pct,a3_st_srts,a3_trail_elig_cost,a3_trail_fed_funding,a3_trail_trans_pct,a3_current_plan,a3_trails,a3_plan_none,a3_plan_other,a3_plan_other_desc,a2_output_outcome,b_sig_inter_new_bike_boxes,b_class_1,b_class_2,b_class_3,b_class_4,a4_bike_gap_pct,b_light_intersection,b_mid_block_new_rrfb_signal,b_mid_block_surf_improv,b_bsp_new_bikes,b_bike_new_secured_lockers,b_bike_new_racks,b_bsp_new_station,b_other_bike_improv_1,b_other_bike_improv_qty_1,b_other_bike_improv_2,b_other_bike_improv_qty_2,b_light_rdwy_seg,b_sig_inter_timing_improv,b_un_sig_new_rrfb_signal,b_un_sig_cross_surf_improv,a4_easement_support,m_cls_1_trails_widen_recon_exist,m_cls_1_trails_new__less_than_9,m_cls_1_trails_new_over_9,m_non_cls_trails_new,m_other_trail_imprv_1,m_other_trail_improv_qty_1,m_other_trail_imprv_2,m_other_trail_improv_qty_2,m_non_cls_widen_recon_exist,p_amenities_bench,a4_ped_gap_pct,p_mid_block_cross_new_rrfb_signal,p_light_intersection,p_lighting_rdwy_seg,p_mid_block_cross_surf_improv,p_new_ada_ramp,p_sidewlks_new_barrier_protect,p_sidewlks_new_4_to_8,p_sidewlks_new_over_8,p_other_ped_imprv_1,p_other_ped_qty_1,p_other_ped_imprv_2,p_other_ped_qty_2,p_reconstruct_ramp_to_ada_stand,p_sidewlks_reconstruct_enhance_exist,p_sig_inter_enhance_exist_crosswlk,p_sig_inter_new_crosswlk,p_sig_inter_ped_heads,p_sig_inter_shorten_cross,p_sig_inter_timing_improv,p_amenities_trash_can,p_amenities_shade_tree,p_amenities_shade_tree_type,p_un_sig_inter_new_traff_sig,p_un_sig_inter_new_roundabout,p_un_sig_inter_new_rrfb_sig,p_un_sig_inter_shorten_cross,p_un_sig_inter_cross_surface_improv,p_sidewlks_widen_existing,agency_fully_own_r_w,"require_r_w_easement,_from_gov",_require_rw_easement_from_private,v_other_traffic_calming_imprv_1,v_speed_feedback_signs,v_other_traffic_calming_qty_1,v_other_traffic_calming_imprv_2,v_other_traffic_calming_qty_2,v_remove_right_turn_pocket,v_remove_travel_ln,v_sig_inter_new_roundabout,v_sig_inter_timing_improv,v_un_sig_inter_new_traf_sig,v_un_sig_inter_new_roundabout,a4_reg_init,a4_reg_init_pct,a4_com_init,a4_com_init_pct,a4_safe_route,a4_safe_route_pct,a4_fl_mile,a4_fl_mile_pct,a4_emp_based,a4_emp_based_pct,a4_other_ni,a4_other_ni_descr,a4_other_ni_pct,a4_wb_audits,a4_bike_classes,a4_ped_classes,a4_demo_events,a4_com_enc,a4_le_methods,a4_com_meetings,a4_classrooms,a4_school_assem,a4_after_school,a4_bike_rodeos,a4_mock_cities,a4_walk_bus,a4_bike_train,a4_com_challenges,a4_srts_enc,a4_srts_le,a4_srts_training,a4_act_other_1,a4_act_other_1_descr,a4_act_other_2,a4_act_other_2_decr,a4_comm_trad_media,a4_comm_large_media,a4_comm_print,a4_comm_social,a4_comm_web,a4_comm_other,a4_comm_other_descr,a4_comm_language,a4_collab_pub_health,a4_collab_le,a4_collab_non_profit,a4_collab_schools,a4_collab_pub_works,a4_collab_other,a4_colab_other_descr,a4_plan_ped,a4_plan_bike,a4_plan_atp,a4_plan_school_routes,a4_row_open_street_demo,project_status,solicitation_abv,solicitation,soliciting_agency,atp_id_y,ppno,ppno1,oversight_id,a3_proj_type_y,project_size,total_project_cost,a1_imp_agcy_name,a2_info_proj_name,original_prog__amt___pa_ed_,orig__prog__year__pa_ed_,original_prog__amt___ps_e_,orig__prog__year__ps_e_,original_prog__amt___rw_,orig__prog__year__rw_,orignal_prog__amt___con_,orig__prog__year__con_,original_prog__amt___con_ni_,orig__prog__year__con_ni_,total,_2122_pa_ed,_2122_ps_e,_2122_rw,_2122_con,_2122_con_ni,_2122_total,fund_year_1,_2223_pa_ed,_2223_ps_e,_2223_rw,_2223_con,_2223_con_ni,_2223_total,fund_year_2,_2324_pa_ed,_2324_ps_e,_2324_rw,_2324_con,_2324_con_ni,_2324_total,fund_year_3,_2425_pa_ed,_2425_ps_e,_2425_rw,_2425_con,_2425_con_ni,_2425_total,fund_year_4,total_atp_$,match?,unnamed:_59,paed,ps_e_,rw,con,con_ni,total_atp_,unnamed:_66,year__pa_ed_,year__ps_e_,year__rw_,year__con_,year__con_ni_,full_merge
0,10-Merced County-1,5,10,5939,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
1,"12-Santa Ana, City of-4",5,12,5063,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
2,"4-Pacifica, City of-1",5,4,5350,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
3,"12-Santa Ana, City of-13",5,12,5063,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
4,"12-Santa Ana, City of-14",5,12,5063,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only


In [87]:
(pd.merge(subset, funded, how='outer', on= merge_on, indicator='full_merge')).full_merge.value_counts()

left_only     405
both           49
right_only      0
Name: full_merge, dtype: int64

In [88]:
(pd.merge(subset, funded, how='outer', on= merge_on, indicator='full_merge')).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454 entries, 0 to 453
Columns: 264 entries, project_app_id to full_merge
dtypes: category(1), float64(154), int64(1), object(108)
memory usage: 936.9+ KB


In [81]:
# subset data in app data (will concat data later) and keep full funded columns

In [83]:
def merge_funding_and_app_data(df_funding, df_app):
    
    #subset the app data:
    subset_app = (df_app>>select(_.project_app_id, _.project_cycle, _.a2_ct_dist, _.a1_locode))
    
    #identified identifier columns to merge on!
    merge_on = ['project_app_id', 'project_cycle', 'a2_ct_dist', 'a1_locode']
    
    
    # subset data in app data (will concat data later) and keep full funded columns
    
    return df

In [89]:
#(pd.merge(master_data, funded, how='outer', on= merge_on, indicator='full_merge')).head()

In [90]:
#(pd.merge(master_data, funded, how='outer', on=merge_on2, indicator='full_merge')).full_merge.value_counts()

In [124]:
master_data.columns.intersection(funded.columns)

Index(['awarded', 'project_cycle', 'a2_ct_dist', '#', 'ppno', 'ppno_1',
       'project_app_id', 'a2_info_proj_name', 'a2_county', 'a1_locode',
       ...
       'a4_collab_non_profit', 'a4_collab_schools', 'a4_collab_pub_works',
       'a4_collab_other', 'a4_colab_other_descr', 'a4_plan_ped',
       'a4_plan_bike', 'a4_plan_atp', 'a4_plan_school_routes',
       'a4_row_open_street_demo'],
      dtype='object', length=194)

In [125]:
master_data.columns.difference(funded.columns)

Index(['a3_proj_type', 'a4_row_100', 'a4_row_gov_ease', 'a4_row_private_ease',
       'atp_id'],
      dtype='object')

In [127]:
funded.columns.difference(master_data.columns)

Index(['_2122_con', '_2122_con_ni', '_2122_pa_ed', '_2122_ps_e', '_2122_rw',
       '_2122_total', '_2223_con', '_2223_con_ni', '_2223_pa_ed', '_2223_ps_e',
       '_2223_rw', '_2223_total', '_2324_con', '_2324_con_ni', '_2324_pa_ed',
       '_2324_ps_e', '_2324_rw', '_2324_total', '_2425_con', '_2425_con_ni',
       '_2425_pa_ed', '_2425_ps_e', '_2425_rw', '_2425_total', '_merge',
       '_require_rw_easement_from_private', 'a3_proj_type_x', 'a3_proj_type_y',
       'agency_fully_own_r_w', 'atp_id_x', 'atp_id_y', 'con', 'con_ni',
       'fund_year_1', 'fund_year_2', 'fund_year_3', 'fund_year_4', 'match?',
       'orig__prog__year__con_', 'orig__prog__year__con_ni_',
       'orig__prog__year__pa_ed_', 'orig__prog__year__ps_e_',
       'orig__prog__year__rw_', 'original_prog__amt___con_ni_',
       'original_prog__amt___pa_ed_', 'original_prog__amt___ps_e_',
       'original_prog__amt___rw_', 'orignal_prog__amt___con_', 'oversight_id',
       'paed', 'ppno1', 'project_size', 'project_st

In [96]:
# # merging on all common columns does not work
# merge_on3 = list(master_data.columns.intersection(funded.columns))

* merging on [`project_app_id`, `project_cycle`, `a2_ct_dist`, `a1_locode`] 

In [97]:
merge_on3 = ['project_app_id', 'project_cycle', 'a2_ct_dist', 'a1_locode', 'a2_county', 'a2_info_proj_name',
             # 'a1_imp_agcy_name', 
              'a1_imp_agcy_street', 'a1_imp_agcy_city',
 'a1_imp_agcy_zip', 'a1_imp_agcy_title', 'a1_imp_agcy_ma', 'a1_imp_agcy_state_ma_num',
 'a1_imp_agcy_fed_ma_num', 'a1_proj_partner_exists', 'a1_proj_partner_agcy', 'a1_proj_partner_title',
 'assembly_district', 'a2_assem_dist_a', 'a2_assem_dist_b', 'a2_assem_dist_c',
 'congressional_district', 'a2_congress_dist_a', 'a2_congress_dist_b', 'a2_congress_dist_c', 'senate_district',
 'a2_senate_dist_a', 'a2_senate_dist_b', 'a2_senatedistc', 'a2_info_proj_loc',
  'a2_mop_uza_population', 'a2_mpo', 'a2_past_proj', 'a2_past_proj_qty',
  'a2_proj_lat', 'a2_proj_long', 'a2_proj_scope_summary',
  'a2_project_location_map', 'a2_rtpa', 'a3_plan_active_trans', 'a3_plan_active_trans_exists',
  'a3_plan_bicycle', 'a3_plan_bicycle_exists', 'a3_plan_ped', 'a3_plan_ped_exists',
  'a3_plan_srts', 'a3_plan_srts_exists', 'a3_st_bicycle_applies',
  'a3_st_bicycle_pct', 'a3_st_num_schools', 'a3_st_ped_applies',
   'a3_st_ped_pct', 'a3_st_srts', 'a3_trail_elig_cost', 'a3_trail_fed_funding',
]

In [98]:
dfall = (pd.merge(master_data, funded, how='outer', on=merge_on3, indicator='full_merge'))



In [99]:
dfall.full_merge.value_counts()

left_only     405
both           49
right_only      0
Name: full_merge, dtype: int64

In [100]:
compare_entries = np.where(dfall["a1_imp_agcy_name_x"] == dfall["a1_imp_agcy_name_y"], True, False)
dfall["compare_desc"] = compare_entries

In [101]:
dfall.compare_desc.value_counts()

False    406
True      48
Name: compare_desc, dtype: int64

In [102]:
dfall>>filter(_.compare_desc==False)>>select(_.a1_imp_agcy_name_x,  _.a1_imp_agcy_name_y)>>arrange(_.a1_imp_agcy_name_y)

Unnamed: 0,a1_imp_agcy_name_x,a1_imp_agcy_name_y
33,"South El Monte, City of-4","South El Monte, City of"
0,Merced County,
1,"Santa Ana, City of",
2,"Pacifica, City of",
3,"Santa Ana, City of",
...,...,...
449,"San Rafael, City of",
450,"Stockton, City of",
451,"Long Beach, City of",
452,"Napa, City of",


In [103]:
dfall>>filter(_.full_merge=='right_only')

Unnamed: 0,awarded_x,project_cycle,a2_ct_dist,#_x,atp_id,ppno_x,ppno_1_x,a3_proj_type,project_app_id,a2_info_proj_name,a2_county,a1_locode,a1_imp_agcy_name_x,a1_imp_agcy_street,a1_imp_agcy_city,a1_imp_agcy_zip,a1_imp_agcy_title,a1_imp_agcy_ma,a1_imp_agcy_state_ma_num,a1_imp_agcy_fed_ma_num,a1_proj_partner_exists,a1_proj_partner_agcy,a1_proj_partner_title,assembly_district,a2_assem_dist_a,a2_assem_dist_b,a2_assem_dist_c,congressional_district,a2_congress_dist_a,a2_congress_dist_b,a2_congress_dist_c,senate_district,a2_senate_dist_a,a2_senate_dist_b,a2_senatedistc,a2_info_proj_descr_x,a2_info_proj_loc,a2_mop_uza_population,a2_mpo,a2_past_proj,a2_past_proj_qty,a2_proj_lat,a2_proj_long,a2_proj_scope_summary,a2_project_location_map,a2_rtpa,a3_plan_active_trans,a3_plan_active_trans_exists,a3_plan_bicycle,a3_plan_bicycle_exists,a3_plan_ped,a3_plan_ped_exists,a3_plan_srts,a3_plan_srts_exists,a3_st_bicycle_applies,a3_st_bicycle_pct,a3_st_num_schools,a3_st_ped_applies,a3_st_ped_pct,a3_st_srts,a3_trail_elig_cost,a3_trail_fed_funding,a3_trail_trans_pct_x,a3_current_plan_x,a3_trails_x,a3_plan_none_x,a3_plan_other_x,a3_plan_other_desc_x,a2_output_outcome_x,b_sig_inter_new_bike_boxes_x,b_class_1_x,b_class_2_x,b_class_3_x,b_class_4_x,a4_bike_gap_pct_x,b_light_intersection_x,b_mid_block_new_rrfb_signal_x,b_mid_block_surf_improv_x,b_bsp_new_bikes_x,b_bike_new_secured_lockers_x,b_bike_new_racks_x,b_bsp_new_station_x,b_other_bike_improv_1_x,b_other_bike_improv_qty_1_x,b_other_bike_improv_2_x,b_other_bike_improv_qty_2_x,b_light_rdwy_seg_x,b_sig_inter_timing_improv_x,b_un_sig_new_rrfb_signal_x,b_un_sig_cross_surf_improv_x,a4_easement_support_x,m_cls_1_trails_widen_recon_exist_x,m_cls_1_trails_new__less_than_9_x,m_cls_1_trails_new_over_9_x,m_non_cls_trails_new_x,m_other_trail_imprv_1_x,m_other_trail_improv_qty_1_x,m_other_trail_imprv_2_x,m_other_trail_improv_qty_2_x,m_non_cls_widen_recon_exist_x,p_amenities_bench_x,a4_ped_gap_pct_x,p_mid_block_cross_new_rrfb_signal_x,p_light_intersection_x,p_lighting_rdwy_seg_x,p_mid_block_cross_surf_improv_x,p_new_ada_ramp_x,p_sidewlks_new_barrier_protect_x,p_sidewlks_new_4_to_8_x,p_sidewlks_new_over_8_x,p_other_ped_imprv_1_x,p_other_ped_qty_1_x,p_other_ped_imprv_2_x,p_other_ped_qty_2_x,p_reconstruct_ramp_to_ada_stand_x,p_sidewlks_reconstruct_enhance_exist_x,p_sig_inter_enhance_exist_crosswlk_x,p_sig_inter_new_crosswlk_x,p_sig_inter_ped_heads_x,p_sig_inter_shorten_cross_x,p_sig_inter_timing_improv_x,p_amenities_trash_can_x,p_amenities_shade_tree_x,p_amenities_shade_tree_type_x,p_un_sig_inter_new_traff_sig_x,p_un_sig_inter_new_roundabout_x,p_un_sig_inter_new_rrfb_sig_x,p_un_sig_inter_shorten_cross_x,p_un_sig_inter_cross_surface_improv_x,p_sidewlks_widen_existing_x,a4_row_100,a4_row_gov_ease,a4_row_private_ease,v_other_traffic_calming_imprv_1_x,v_speed_feedback_signs_x,v_other_traffic_calming_qty_1_x,v_other_traffic_calming_imprv_2_x,v_other_traffic_calming_qty_2_x,v_remove_right_turn_pocket_x,v_remove_travel_ln_x,v_sig_inter_new_roundabout_x,v_sig_inter_timing_improv_x,v_un_sig_inter_new_traf_sig_x,v_un_sig_inter_new_roundabout_x,a4_reg_init_x,a4_reg_init_pct_x,a4_com_init_x,a4_com_init_pct_x,a4_safe_route_x,a4_safe_route_pct_x,a4_fl_mile_x,a4_fl_mile_pct_x,a4_emp_based_x,a4_emp_based_pct_x,a4_other_ni_x,a4_other_ni_descr_x,a4_other_ni_pct_x,a4_wb_audits_x,a4_bike_classes_x,a4_ped_classes_x,a4_demo_events_x,a4_com_enc_x,a4_le_methods_x,a4_com_meetings_x,a4_classrooms_x,a4_school_assem_x,a4_after_school_x,a4_bike_rodeos_x,a4_mock_cities_x,a4_walk_bus_x,a4_bike_train_x,a4_com_challenges_x,a4_srts_enc_x,a4_srts_le_x,a4_srts_training_x,a4_act_other_1_x,a4_act_other_1_descr_x,a4_act_other_2_x,a4_act_other_2_decr_x,a4_comm_trad_media_x,a4_comm_large_media_x,a4_comm_print_x,a4_comm_social_x,a4_comm_web_x,a4_comm_other_x,a4_comm_other_descr_x,a4_comm_language_x,a4_collab_pub_health_x,a4_collab_le_x,a4_collab_non_profit_x,a4_collab_schools_x,a4_collab_pub_works_x,a4_collab_other_x,a4_colab_other_descr_x,a4_plan_ped_x,a4_plan_bike_x,a4_plan_atp_x,a4_plan_school_routes_x,a4_row_open_street_demo_x,awarded_y,#_y,atp_id_x,ppno_1_y,a3_proj_type_x,a2_info_proj_descr_y,a3_trail_trans_pct_y,a3_current_plan_y,a3_trails_y,a3_plan_none_y,a3_plan_other_y,a3_plan_other_desc_y,a2_output_outcome_y,b_sig_inter_new_bike_boxes_y,b_class_1_y,b_class_2_y,b_class_3_y,b_class_4_y,a4_bike_gap_pct_y,b_light_intersection_y,b_mid_block_new_rrfb_signal_y,b_mid_block_surf_improv_y,b_bsp_new_bikes_y,b_bike_new_secured_lockers_y,b_bike_new_racks_y,b_bsp_new_station_y,b_other_bike_improv_1_y,b_other_bike_improv_qty_1_y,b_other_bike_improv_2_y,b_other_bike_improv_qty_2_y,b_light_rdwy_seg_y,b_sig_inter_timing_improv_y,b_un_sig_new_rrfb_signal_y,b_un_sig_cross_surf_improv_y,a4_easement_support_y,m_cls_1_trails_widen_recon_exist_y,m_cls_1_trails_new__less_than_9_y,m_cls_1_trails_new_over_9_y,m_non_cls_trails_new_y,m_other_trail_imprv_1_y,m_other_trail_improv_qty_1_y,m_other_trail_imprv_2_y,m_other_trail_improv_qty_2_y,m_non_cls_widen_recon_exist_y,p_amenities_bench_y,a4_ped_gap_pct_y,p_mid_block_cross_new_rrfb_signal_y,p_light_intersection_y,p_lighting_rdwy_seg_y,p_mid_block_cross_surf_improv_y,p_new_ada_ramp_y,p_sidewlks_new_barrier_protect_y,p_sidewlks_new_4_to_8_y,p_sidewlks_new_over_8_y,p_other_ped_imprv_1_y,p_other_ped_qty_1_y,p_other_ped_imprv_2_y,p_other_ped_qty_2_y,p_reconstruct_ramp_to_ada_stand_y,p_sidewlks_reconstruct_enhance_exist_y,p_sig_inter_enhance_exist_crosswlk_y,p_sig_inter_new_crosswlk_y,p_sig_inter_ped_heads_y,p_sig_inter_shorten_cross_y,p_sig_inter_timing_improv_y,p_amenities_trash_can_y,p_amenities_shade_tree_y,p_amenities_shade_tree_type_y,p_un_sig_inter_new_traff_sig_y,p_un_sig_inter_new_roundabout_y,p_un_sig_inter_new_rrfb_sig_y,p_un_sig_inter_shorten_cross_y,p_un_sig_inter_cross_surface_improv_y,p_sidewlks_widen_existing_y,agency_fully_own_r_w,"require_r_w_easement,_from_gov",_require_rw_easement_from_private,v_other_traffic_calming_imprv_1_y,v_speed_feedback_signs_y,v_other_traffic_calming_qty_1_y,v_other_traffic_calming_imprv_2_y,v_other_traffic_calming_qty_2_y,v_remove_right_turn_pocket_y,v_remove_travel_ln_y,v_sig_inter_new_roundabout_y,v_sig_inter_timing_improv_y,v_un_sig_inter_new_traf_sig_y,v_un_sig_inter_new_roundabout_y,a4_reg_init_y,a4_reg_init_pct_y,a4_com_init_y,a4_com_init_pct_y,a4_safe_route_y,a4_safe_route_pct_y,a4_fl_mile_y,a4_fl_mile_pct_y,a4_emp_based_y,a4_emp_based_pct_y,a4_other_ni_y,a4_other_ni_descr_y,a4_other_ni_pct_y,a4_wb_audits_y,a4_bike_classes_y,a4_ped_classes_y,a4_demo_events_y,a4_com_enc_y,a4_le_methods_y,a4_com_meetings_y,a4_classrooms_y,a4_school_assem_y,a4_after_school_y,a4_bike_rodeos_y,a4_mock_cities_y,a4_walk_bus_y,a4_bike_train_y,a4_com_challenges_y,a4_srts_enc_y,a4_srts_le_y,a4_srts_training_y,a4_act_other_1_y,a4_act_other_1_descr_y,a4_act_other_2_y,a4_act_other_2_decr_y,a4_comm_trad_media_y,a4_comm_large_media_y,a4_comm_print_y,a4_comm_social_y,a4_comm_web_y,a4_comm_other_y,a4_comm_other_descr_y,a4_comm_language_y,a4_collab_pub_health_y,a4_collab_le_y,a4_collab_non_profit_y,a4_collab_schools_y,a4_collab_pub_works_y,a4_collab_other_y,a4_colab_other_descr_y,a4_plan_ped_y,a4_plan_bike_y,a4_plan_atp_y,a4_plan_school_routes_y,a4_row_open_street_demo_y,project_status,solicitation_abv,solicitation,soliciting_agency,atp_id_y,ppno_y,ppno1,oversight_id,a3_proj_type_y,project_size,total_project_cost,a1_imp_agcy_name_y,original_prog__amt___pa_ed_,orig__prog__year__pa_ed_,original_prog__amt___ps_e_,orig__prog__year__ps_e_,original_prog__amt___rw_,orig__prog__year__rw_,orignal_prog__amt___con_,orig__prog__year__con_,original_prog__amt___con_ni_,orig__prog__year__con_ni_,total,_2122_pa_ed,_2122_ps_e,_2122_rw,_2122_con,_2122_con_ni,_2122_total,fund_year_1,_2223_pa_ed,_2223_ps_e,_2223_rw,_2223_con,_2223_con_ni,_2223_total,fund_year_2,_2324_pa_ed,_2324_ps_e,_2324_rw,_2324_con,_2324_con_ni,_2324_total,fund_year_3,_2425_pa_ed,_2425_ps_e,_2425_rw,_2425_con,_2425_con_ni,_2425_total,fund_year_4,total_atp_$,match?,unnamed:_59,paed,ps_e_,rw,con,con_ni,total_atp_,unnamed:_66,year__pa_ed_,year__ps_e_,year__rw_,year__con_,year__con_ni_,_merge,full_merge,compare_desc


In [104]:
dfall.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454 entries, 0 to 453
Columns: 411 entries, awarded_x to compare_desc
dtypes: bool(1), category(2), float64(152), int64(98), object(158)
memory usage: 1.4+ MB


In [109]:
# columns in common with the dfs we merged
list(master_data.columns.intersection(funded.columns))

['awarded',
 'project_cycle',
 'a2_ct_dist',
 '#',
 'ppno',
 'ppno_1',
 'project_app_id',
 'a2_info_proj_name',
 'a2_county',
 'a1_locode',
 'a1_imp_agcy_name',
 'a1_imp_agcy_street',
 'a1_imp_agcy_city',
 'a1_imp_agcy_zip',
 'a1_imp_agcy_title',
 'a1_imp_agcy_ma',
 'a1_imp_agcy_state_ma_num',
 'a1_imp_agcy_fed_ma_num',
 'a1_proj_partner_exists',
 'a1_proj_partner_agcy',
 'a1_proj_partner_title',
 'assembly_district',
 'a2_assem_dist_a',
 'a2_assem_dist_b',
 'a2_assem_dist_c',
 'congressional_district',
 'a2_congress_dist_a',
 'a2_congress_dist_b',
 'a2_congress_dist_c',
 'senate_district',
 'a2_senate_dist_a',
 'a2_senate_dist_b',
 'a2_senatedistc',
 'a2_info_proj_descr',
 'a2_info_proj_loc',
 'a2_mop_uza_population',
 'a2_mpo',
 'a2_past_proj',
 'a2_past_proj_qty',
 'a2_proj_lat',
 'a2_proj_long',
 'a2_proj_scope_summary',
 'a2_project_location_map',
 'a2_rtpa',
 'a3_plan_active_trans',
 'a3_plan_active_trans_exists',
 'a3_plan_bicycle',
 'a3_plan_bicycle_exists',
 'a3_plan_ped',
 'a

In [110]:
# (dfall.iloc[:,199:230]).info()

In [111]:
dfall.sample()

Unnamed: 0,awarded_x,project_cycle,a2_ct_dist,#_x,atp_id,ppno_x,ppno_1_x,a3_proj_type,project_app_id,a2_info_proj_name,a2_county,a1_locode,a1_imp_agcy_name_x,a1_imp_agcy_street,a1_imp_agcy_city,a1_imp_agcy_zip,a1_imp_agcy_title,a1_imp_agcy_ma,a1_imp_agcy_state_ma_num,a1_imp_agcy_fed_ma_num,a1_proj_partner_exists,a1_proj_partner_agcy,a1_proj_partner_title,assembly_district,a2_assem_dist_a,a2_assem_dist_b,a2_assem_dist_c,congressional_district,a2_congress_dist_a,a2_congress_dist_b,a2_congress_dist_c,senate_district,a2_senate_dist_a,a2_senate_dist_b,a2_senatedistc,a2_info_proj_descr_x,a2_info_proj_loc,a2_mop_uza_population,a2_mpo,a2_past_proj,a2_past_proj_qty,a2_proj_lat,a2_proj_long,a2_proj_scope_summary,a2_project_location_map,a2_rtpa,a3_plan_active_trans,a3_plan_active_trans_exists,a3_plan_bicycle,a3_plan_bicycle_exists,a3_plan_ped,a3_plan_ped_exists,a3_plan_srts,a3_plan_srts_exists,a3_st_bicycle_applies,a3_st_bicycle_pct,a3_st_num_schools,a3_st_ped_applies,a3_st_ped_pct,a3_st_srts,a3_trail_elig_cost,a3_trail_fed_funding,a3_trail_trans_pct_x,a3_current_plan_x,a3_trails_x,a3_plan_none_x,a3_plan_other_x,a3_plan_other_desc_x,a2_output_outcome_x,b_sig_inter_new_bike_boxes_x,b_class_1_x,b_class_2_x,b_class_3_x,b_class_4_x,a4_bike_gap_pct_x,b_light_intersection_x,b_mid_block_new_rrfb_signal_x,b_mid_block_surf_improv_x,b_bsp_new_bikes_x,b_bike_new_secured_lockers_x,b_bike_new_racks_x,b_bsp_new_station_x,b_other_bike_improv_1_x,b_other_bike_improv_qty_1_x,b_other_bike_improv_2_x,b_other_bike_improv_qty_2_x,b_light_rdwy_seg_x,b_sig_inter_timing_improv_x,b_un_sig_new_rrfb_signal_x,b_un_sig_cross_surf_improv_x,a4_easement_support_x,m_cls_1_trails_widen_recon_exist_x,m_cls_1_trails_new__less_than_9_x,m_cls_1_trails_new_over_9_x,m_non_cls_trails_new_x,m_other_trail_imprv_1_x,m_other_trail_improv_qty_1_x,m_other_trail_imprv_2_x,m_other_trail_improv_qty_2_x,m_non_cls_widen_recon_exist_x,p_amenities_bench_x,a4_ped_gap_pct_x,p_mid_block_cross_new_rrfb_signal_x,p_light_intersection_x,p_lighting_rdwy_seg_x,p_mid_block_cross_surf_improv_x,p_new_ada_ramp_x,p_sidewlks_new_barrier_protect_x,p_sidewlks_new_4_to_8_x,p_sidewlks_new_over_8_x,p_other_ped_imprv_1_x,p_other_ped_qty_1_x,p_other_ped_imprv_2_x,p_other_ped_qty_2_x,p_reconstruct_ramp_to_ada_stand_x,p_sidewlks_reconstruct_enhance_exist_x,p_sig_inter_enhance_exist_crosswlk_x,p_sig_inter_new_crosswlk_x,p_sig_inter_ped_heads_x,p_sig_inter_shorten_cross_x,p_sig_inter_timing_improv_x,p_amenities_trash_can_x,p_amenities_shade_tree_x,p_amenities_shade_tree_type_x,p_un_sig_inter_new_traff_sig_x,p_un_sig_inter_new_roundabout_x,p_un_sig_inter_new_rrfb_sig_x,p_un_sig_inter_shorten_cross_x,p_un_sig_inter_cross_surface_improv_x,p_sidewlks_widen_existing_x,a4_row_100,a4_row_gov_ease,a4_row_private_ease,v_other_traffic_calming_imprv_1_x,v_speed_feedback_signs_x,v_other_traffic_calming_qty_1_x,v_other_traffic_calming_imprv_2_x,v_other_traffic_calming_qty_2_x,v_remove_right_turn_pocket_x,v_remove_travel_ln_x,v_sig_inter_new_roundabout_x,v_sig_inter_timing_improv_x,v_un_sig_inter_new_traf_sig_x,v_un_sig_inter_new_roundabout_x,a4_reg_init_x,a4_reg_init_pct_x,a4_com_init_x,a4_com_init_pct_x,a4_safe_route_x,a4_safe_route_pct_x,a4_fl_mile_x,a4_fl_mile_pct_x,a4_emp_based_x,a4_emp_based_pct_x,a4_other_ni_x,a4_other_ni_descr_x,a4_other_ni_pct_x,a4_wb_audits_x,a4_bike_classes_x,a4_ped_classes_x,a4_demo_events_x,a4_com_enc_x,a4_le_methods_x,a4_com_meetings_x,a4_classrooms_x,a4_school_assem_x,a4_after_school_x,a4_bike_rodeos_x,a4_mock_cities_x,a4_walk_bus_x,a4_bike_train_x,a4_com_challenges_x,a4_srts_enc_x,a4_srts_le_x,a4_srts_training_x,a4_act_other_1_x,a4_act_other_1_descr_x,a4_act_other_2_x,a4_act_other_2_decr_x,a4_comm_trad_media_x,a4_comm_large_media_x,a4_comm_print_x,a4_comm_social_x,a4_comm_web_x,a4_comm_other_x,a4_comm_other_descr_x,a4_comm_language_x,a4_collab_pub_health_x,a4_collab_le_x,a4_collab_non_profit_x,a4_collab_schools_x,a4_collab_pub_works_x,a4_collab_other_x,a4_colab_other_descr_x,a4_plan_ped_x,a4_plan_bike_x,a4_plan_atp_x,a4_plan_school_routes_x,a4_row_open_street_demo_x,awarded_y,#_y,atp_id_x,ppno_1_y,a3_proj_type_x,a2_info_proj_descr_y,a3_trail_trans_pct_y,a3_current_plan_y,a3_trails_y,a3_plan_none_y,a3_plan_other_y,a3_plan_other_desc_y,a2_output_outcome_y,b_sig_inter_new_bike_boxes_y,b_class_1_y,b_class_2_y,b_class_3_y,b_class_4_y,a4_bike_gap_pct_y,b_light_intersection_y,b_mid_block_new_rrfb_signal_y,b_mid_block_surf_improv_y,b_bsp_new_bikes_y,b_bike_new_secured_lockers_y,b_bike_new_racks_y,b_bsp_new_station_y,b_other_bike_improv_1_y,b_other_bike_improv_qty_1_y,b_other_bike_improv_2_y,b_other_bike_improv_qty_2_y,b_light_rdwy_seg_y,b_sig_inter_timing_improv_y,b_un_sig_new_rrfb_signal_y,b_un_sig_cross_surf_improv_y,a4_easement_support_y,m_cls_1_trails_widen_recon_exist_y,m_cls_1_trails_new__less_than_9_y,m_cls_1_trails_new_over_9_y,m_non_cls_trails_new_y,m_other_trail_imprv_1_y,m_other_trail_improv_qty_1_y,m_other_trail_imprv_2_y,m_other_trail_improv_qty_2_y,m_non_cls_widen_recon_exist_y,p_amenities_bench_y,a4_ped_gap_pct_y,p_mid_block_cross_new_rrfb_signal_y,p_light_intersection_y,p_lighting_rdwy_seg_y,p_mid_block_cross_surf_improv_y,p_new_ada_ramp_y,p_sidewlks_new_barrier_protect_y,p_sidewlks_new_4_to_8_y,p_sidewlks_new_over_8_y,p_other_ped_imprv_1_y,p_other_ped_qty_1_y,p_other_ped_imprv_2_y,p_other_ped_qty_2_y,p_reconstruct_ramp_to_ada_stand_y,p_sidewlks_reconstruct_enhance_exist_y,p_sig_inter_enhance_exist_crosswlk_y,p_sig_inter_new_crosswlk_y,p_sig_inter_ped_heads_y,p_sig_inter_shorten_cross_y,p_sig_inter_timing_improv_y,p_amenities_trash_can_y,p_amenities_shade_tree_y,p_amenities_shade_tree_type_y,p_un_sig_inter_new_traff_sig_y,p_un_sig_inter_new_roundabout_y,p_un_sig_inter_new_rrfb_sig_y,p_un_sig_inter_shorten_cross_y,p_un_sig_inter_cross_surface_improv_y,p_sidewlks_widen_existing_y,agency_fully_own_r_w,"require_r_w_easement,_from_gov",_require_rw_easement_from_private,v_other_traffic_calming_imprv_1_y,v_speed_feedback_signs_y,v_other_traffic_calming_qty_1_y,v_other_traffic_calming_imprv_2_y,v_other_traffic_calming_qty_2_y,v_remove_right_turn_pocket_y,v_remove_travel_ln_y,v_sig_inter_new_roundabout_y,v_sig_inter_timing_improv_y,v_un_sig_inter_new_traf_sig_y,v_un_sig_inter_new_roundabout_y,a4_reg_init_y,a4_reg_init_pct_y,a4_com_init_y,a4_com_init_pct_y,a4_safe_route_y,a4_safe_route_pct_y,a4_fl_mile_y,a4_fl_mile_pct_y,a4_emp_based_y,a4_emp_based_pct_y,a4_other_ni_y,a4_other_ni_descr_y,a4_other_ni_pct_y,a4_wb_audits_y,a4_bike_classes_y,a4_ped_classes_y,a4_demo_events_y,a4_com_enc_y,a4_le_methods_y,a4_com_meetings_y,a4_classrooms_y,a4_school_assem_y,a4_after_school_y,a4_bike_rodeos_y,a4_mock_cities_y,a4_walk_bus_y,a4_bike_train_y,a4_com_challenges_y,a4_srts_enc_y,a4_srts_le_y,a4_srts_training_y,a4_act_other_1_y,a4_act_other_1_descr_y,a4_act_other_2_y,a4_act_other_2_decr_y,a4_comm_trad_media_y,a4_comm_large_media_y,a4_comm_print_y,a4_comm_social_y,a4_comm_web_y,a4_comm_other_y,a4_comm_other_descr_y,a4_comm_language_y,a4_collab_pub_health_y,a4_collab_le_y,a4_collab_non_profit_y,a4_collab_schools_y,a4_collab_pub_works_y,a4_collab_other_y,a4_colab_other_descr_y,a4_plan_ped_y,a4_plan_bike_y,a4_plan_atp_y,a4_plan_school_routes_y,a4_row_open_street_demo_y,project_status,solicitation_abv,solicitation,soliciting_agency,atp_id_y,ppno_y,ppno1,oversight_id,a3_proj_type_y,project_size,total_project_cost,a1_imp_agcy_name_y,original_prog__amt___pa_ed_,orig__prog__year__pa_ed_,original_prog__amt___ps_e_,orig__prog__year__ps_e_,original_prog__amt___rw_,orig__prog__year__rw_,orignal_prog__amt___con_,orig__prog__year__con_,original_prog__amt___con_ni_,orig__prog__year__con_ni_,total,_2122_pa_ed,_2122_ps_e,_2122_rw,_2122_con,_2122_con_ni,_2122_total,fund_year_1,_2223_pa_ed,_2223_ps_e,_2223_rw,_2223_con,_2223_con_ni,_2223_total,fund_year_2,_2324_pa_ed,_2324_ps_e,_2324_rw,_2324_con,_2324_con_ni,_2324_total,fund_year_3,_2425_pa_ed,_2425_ps_e,_2425_rw,_2425_con,_2425_con_ni,_2425_total,fund_year_4,total_atp_$,match?,unnamed:_59,paed,ps_e_,rw,con,con_ni,total_atp_,unnamed:_66,year__pa_ed_,year__ps_e_,year__rw_,year__con_,year__con_ni_,_merge,full_merge,compare_desc
332,N,5,8,,,,,Infrastructure + NI - Medium,"8-Temecula, City of-1",Temecula Creek Southside Trail Project,RIV,5459,"Temecula, City of",41000 Main Street,Temecula,92590,Associate Engineer II,Yes,08-0457,08-5459,No,,,75,75,,,50,50,,,28,28,,,Temecula Creek Southside Trail Project: 2.4 mi...,Southern levee/berm of Temecula Creek between ...,Project is located within one of the ten large...,SCAG,No,0,33.48,-117.1,Existing Conditions: The proposed Temecula Cre...,,,,Yes,,Yes,,Yes,,No,Yes,50,1,Yes,50,Yes,0,,0,Yes,No,0,0,,Construction of a 2.4 mile Class I trail featu...,0,12672,0,0,0,,0,0,0,0,0,0,0,"Undercrossing, Redhawk Pkwy bridge",1,"Creek crossing, Ave de Missiones",1,0,0,0,0,,0,0,0,0,,0,,0,0,0,50,0,0,0,0,6,0,0,0,"Hand rail and cable rail, 772 LF",1,Signage,32,0,0,0,0,0,0,0,0,0,,0,0,0,0,0,0,No,Yes,No,,0,0,,0,0,0,0,0,0,0,N,0,Y,100,N,0,N,0,,0,N,,0,0,0,1,1,2,0,0,0,0,0,1,0,0,0,1,0,0,0,0,,0,,Y,N,Y,Y,Y,N,,,N,Y,Y,Y,Y,Y,Riverside County Flood Control and Water Conse...,N,N,N,N,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only,False


In [112]:
dfall>>select(_.awarded_x, _.awarded_y,
               _['#_x'], _['#_y'],
               _.ppno_x, _.ppno_y,
               _.ppno_1_x, _.ppno_1_y,
               _.a2_info_proj_descr_x, _.a2_info_proj_descr_y)>>filter(_.a2_info_proj_descr_y.notnull())

Unnamed: 0,awarded_x,awarded_y,#_x,#_y,ppno_x,ppno_y,ppno_1_x,ppno_1_y,a2_info_proj_descr_x,a2_info_proj_descr_y
8,Y,Y,,,,3057,,,Design and construct buffered bike lanes on De...,Design and construct buffered bike lanes on De...
33,Y,Y,,,,5858,,,This project focuses on school and pedestrian ...,This project focuses on school and pedestrian ...
35,Y,Y,,,,2343A,,2343B,Construct Class IV separated bikeways with Cla...,Construct Class IV separated bikeways with Cla...
60,Y,Y,,,,3058A,,3058B,Construction of .8 miles of Segment 7 of the R...,Construction of .8 miles of Segment 7 of the R...
66,Y,Y,,,,1442,,,The Laurel Elementary SRTS includes infrastruc...,The Laurel Elementary SRTS includes infrastruc...
77,Y,Y,,,,7074,,,Amador & Trinity and Church & Waldby intersect...,Amador & Trinity and Church & Waldby intersect...
81,Y,Y,,,,3513,,,"Construct new curb, gutter, sidewalks, ADA ram...","Construct new curb, gutter, sidewalks, ADA ram..."
83,Y,Y,,,,3060,,,"Completion of PS&E, ROW acquisition, and const...","Completion of PS&E, ROW acquisition, and const..."
91,Y,Y,,,,,,,"In Happy Camp on SR 96, install sidewalks, con...","In Happy Camp on SR 96, install sidewalks, con..."
96,Y,Y,,,,7075,,,"Construct 68 curb ramps, 87 crosswalks, advanc...","Construct 68 curb ramps, 87 crosswalks, advanc..."


In [113]:
compare_desc = np.where(dfall["a2_info_proj_descr_x"] == dfall["a2_info_proj_descr_y"], True, False)
dfall["compare_proj_desc"] = compare_desc

In [114]:
dfall.compare_proj_desc.value_counts()

False    405
True      49
Name: compare_proj_desc, dtype: int64

In [115]:
#check that there are no mismatched entries
dfall>>filter(_.compare_proj_desc==False)>>select(_.a2_info_proj_descr_x,  _.a2_info_proj_descr_y)>>arrange(_.a2_info_proj_descr_y)

Unnamed: 0,a2_info_proj_descr_x,a2_info_proj_descr_y
0,"PA&ED, PS&E, and CON funding for construction ...",
1,Bishop Street Class 3 Bicycle Boulevard with T...,
2,CON funding for installing bicycling facilitie...,
3,Pedestrian traffic safety improvements for Jef...,
4,Pedestrian traffic safety improvements for La...,
...,...,...
449,Ped/bike crossing of San Rafael Canal between ...,
450,"Install curb, gutter and sidewalk to close sid...",
451,Pine Avenue Bicycle Boulevard will construct t...,
452,"Design and construction of curb, gutter, sidew...",


### function to fix the df

In [90]:
## adding new function for all columns using for loop
def fix_col_merges(df):
    #left (x) master_data, right (y) funded,
    
    # filling the null values for some of the duplicate columns
    # manually checking that values are the same as of now- will add function to check when we get the data links
    df['awarded_y'] = df['awarded_x'].fillna(df['awarded_y'])
    df['ppno_y'] = df['ppno_x'].fillna(df['ppno_y'])
    df['ppno_1_y'] = df['ppno_1_x'].fillna(df['ppno_1_y'])
    df['a2_info_proj_descr_y'] = df['a2_info_proj_descr_x'].fillna(df['a2_info_proj_descr_y'])
    
    
    df = df.rename(columns={'awarded_y':'awarded',
                            '#_y':'#',
                            'ppno_y':'ppno',
                            'ppno_1_y':'ppno_1', 
                             'a1_imp_agcy_name_y':'a1_imp_agcy_name',
                             'a2_info_proj_name_y':'a2_info_proj_name',
                              'a2_info_proj_descr_y':'a2_info_proj_descr'
                               })
    df = df.drop(columns={'#_x', 'a2_info_proj_descr_x',
                          'ppno_x', 'compare_desc', 'compare_proj_desc'})
    
    return df

In [88]:
# dfall = fix_col_merges(dfall)

In [116]:
# dfall>>filter(_.full_merge=='both')>>select(_.awarded,
#                # _['#'],
#                _.ppno, 
#                _.ppno_1,
#                _.a2_info_proj_descr)>>filter(_.a2_info_proj_descr.notnull())

#### Check what columns are the duplicates

In [123]:
#https://stackoverflow.com/questions/61793094/find-column-whose-name-contains-a-specific-value-that-is-in-a-fixed-column

In [117]:
import re                                                               

In [120]:
dcolumns = [col for col in dfall.columns if isinstance(col, str) and re.match('.*_x', col)]

In [121]:
len(dcolumns)

142

In [122]:
columns

['awarded_x',
 '#_x',
 'ppno_x',
 'ppno_1_x',
 'a1_imp_agcy_name_x',
 'a2_info_proj_descr_x',
 'a3_trail_trans_pct_x',
 'a3_current_plan_x',
 'a3_trails_x',
 'a3_plan_none_x',
 'a3_plan_other_x',
 'a3_plan_other_desc_x',
 'a2_output_outcome_x',
 'b_sig_inter_new_bike_boxes_x',
 'b_class_1_x',
 'b_class_2_x',
 'b_class_3_x',
 'b_class_4_x',
 'a4_bike_gap_pct_x',
 'b_light_intersection_x',
 'b_mid_block_new_rrfb_signal_x',
 'b_mid_block_surf_improv_x',
 'b_bsp_new_bikes_x',
 'b_bike_new_secured_lockers_x',
 'b_bike_new_racks_x',
 'b_bsp_new_station_x',
 'b_other_bike_improv_1_x',
 'b_other_bike_improv_qty_1_x',
 'b_other_bike_improv_2_x',
 'b_other_bike_improv_qty_2_x',
 'b_light_rdwy_seg_x',
 'b_sig_inter_timing_improv_x',
 'b_un_sig_new_rrfb_signal_x',
 'b_un_sig_cross_surf_improv_x',
 'a4_easement_support_x',
 'm_cls_1_trails_widen_recon_exist_x',
 'm_cls_1_trails_new__less_than_9_x',
 'm_cls_1_trails_new_over_9_x',
 'm_non_cls_trails_new_x',
 'm_other_trail_imprv_1_x',
 'm_other_trai

In [None]:
def remove_duplicate_cols(df, col_list):
    for col_x in col_list:
        df[col_x] = df[col_x].fillna(df[col_y])
    

In [None]:
dfall

In [86]:
## list of all columns
#col_list = sorted(dfall.columns.to_list())

In [85]:
#col_list

In [71]:
#dfall.columns.get_loc("a3_current_plan_x")

In [72]:
# (dfall>>select(_.a3_proj_type, 
#                _.a3_proj_type_x,
#               _.a3_proj_type_y)).info()

remove cols: 
* a3_proj_type_x

In [73]:
(dfall>>select(_.a3_proj_type, 
               _.a3_proj_type_x,
              _.a3_proj_type_y))>>arrange(_.a3_proj_type_y)

Unnamed: 0,a3_proj_type,a3_proj_type_x,a3_proj_type_y
35,Infrastructure + NI - Large,Infrastructure + NI - Large,Combined (IF and NI)
60,Infrastructure + NI - Large,Infrastructure + NI - Large,Combined (IF and NI)
66,Infrastructure + NI - Small,Infrastructure + NI - Small,Combined (IF and NI)
77,Infrastructure + NI - Small,Infrastructure + NI - Small,Combined (IF and NI)
96,Infrastructure + NI - Small,Infrastructure + NI - Small,Combined (IF and NI)
...,...,...,...
449,Infrastructure - Large,,
450,Infrastructure - Medium,,
451,Infrastructure + NI - Medium,,
452,Infrastructure - Medium,,


In [87]:
#(dfall.iloc[:,64:70]).info()