In [143]:
import pandas as pd 
import sqlalchemy 
import sys 
import re
import oracledb 

In [144]:
# Register python-oracledb in sys.modules under the name "cx_Oracle", so any
# later `import cx_Oracle` (e.g. by the SQLAlchemy Oracle dialect) resolves to it.
# NOTE(review): the version attribute is overwritten, presumably to satisfy a
# minimum cx_Oracle version check in the dialect — confirm against the
# installed SQLAlchemy release.
oracledb.version = "8.3.0" 
sys.modules["cx_Oracle"] = oracledb 

In [145]:
# Notebook-wide display settings, all set through the same pandas API.
pd.set_option("display.max_columns", 100)               # show up to 100 columns
pd.set_option("display.float_format", "{:.2f}".format)  # two decimals for floats
pd.set_option("display.max_rows", None)                 # never truncate rows
pd.set_option("display.max_colwidth", None)             # never truncate cell text

In [146]:
DIALECT = 'oracle'  

In [148]:
# Use SB1_READONLY login
# NOTE(review): USERNAME, PASSWORD, HOST, PORT and SERVICE are not defined in any
# cell visible in this notebook; they must already exist in the kernel. Load them
# from environment variables or a secrets store — never hard-code them here.
#
# The URL is assembled with URL.create instead of an f-string so that special
# characters in the credentials (e.g. "@", "/" or ":") are escaped rather than
# corrupting the connection string. create_engine accepts the URL object as-is.
ENGINE_PATH_WIN_AUTH = sqlalchemy.engine.URL.create(
    drivername=DIALECT,
    username=USERNAME,
    password=PASSWORD,
    host=HOST,
    port=PORT,
    query={"service_name": SERVICE},
)

In [149]:
engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   

In [150]:
def to_snakecase(df):
    """Return ``df`` with snake_case column labels.

    Every column label is lower-cased and each space is replaced by an
    underscore (e.g. ``"General Fund"`` -> ``"general_fund"``). All other
    characters are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pd.DataFrame
        A new frame carrying the renamed columns. The frame passed in is not
        modified, so a cell that calls this helper can be re-run safely.
    """
    # regex=False: the space is a literal character, not a pattern, and the
    # result no longer depends on the pandas version's default for `regex`.
    snake_cols = df.columns.str.lower().str.replace(' ', '_', regex=False)
    return df.set_axis(snake_cols, axis=1)

## Projects
* Should I be filling in NA of comment_desc with other columns? Otherwise this information is going to be lost.

In [151]:
projects_df = pd.read_sql_query(""" 
SELECT 
projects.project_id,
projects.county_code,
projects.comment_desc,
projects.district_code, 
projects.est_total_prj_costs,
projects.location_name,
projects.project_label_name,
projects.original_post_mile_begin_id,
projects.original_post_mile_end_id,
projects.revised_post_mile_begin_ind,
projects.revised_post_mile_end_ind,
projects.route_name,
projects.state_hwy_ind,
projects.senate_district_code,
projects.project_category_type_code,
projects.work_type_code,
projects.update_date_time,
local_agencies.agency_name,
local_agencies.urban_area_code,
counties.county_name,
work_types.work_type_desc,
project_category_type_codes.category_desc
FROM projects 
LEFT JOIN local_agencies ON projects.agency_code = local_agencies.agency_code
LEFT JOIN counties ON projects.county_code = counties.county_code
LEFT JOIN work_types ON projects.work_type_code = work_types.work_type_code
LEFT JOIN project_category_type_codes ON projects.project_category_type_code = project_category_type_codes.category_code
WHERE projects.status_code = 'Active'
""", engine) 

In [152]:
projects_df.comment_desc = projects_df.comment_desc.fillna(projects_df.category_desc)

In [153]:
projects_df.comment_desc = projects_df.comment_desc.fillna(projects_df.work_type_desc)

In [154]:
projects_df = projects_df.drop(columns = ['work_type_code','project_category_type_code', 'county_code'])

In [155]:
projects_df['current_phase'] = 'single phase'

In [285]:
projects_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11768 entries, 0 to 11767
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   project_id                   11768 non-null  object        
 1   comment_desc                 10892 non-null  object        
 2   district_code                11767 non-null  object        
 3   est_total_prj_costs          1083 non-null   float64       
 4   location_name                11401 non-null  object        
 5   project_label_name           10906 non-null  object        
 6   original_post_mile_begin_id  750 non-null    float64       
 7   original_post_mile_end_id    576 non-null    float64       
 8   revised_post_mile_begin_ind  20 non-null     object        
 9   revised_post_mile_end_ind    15 non-null     object        
 10  route_name                   11447 non-null  object        
 11  state_hwy_ind                11405 non-nu

## EA Number
* Projects can have multiple EA numbers.
* Should we keep the most recent EA or all of them -> Ask Tony Hunt.

In [156]:
ea_df = pd.read_sql_query(""" 
SELECT 
project_id, 
district_code,
ea_assign_date, 
expense_authorization_id 
FROM expense_authorizations
""", engine) 

In [157]:
ea_df.shape, ea_df.project_id.nunique()

((49431, 4), 24130)

In [158]:
# Do an outer join to understand what's going on under the hood
# outer_join = pd.merge(ea_df, projects_df4, on = ['district_code','project_id'], how = "outer", indicator = True)

In [159]:
# outer_join[['_merge']].value_counts()

In [160]:
# Understand why there are more rows compared to project_ids that are unique
# outer_join.loc[outer_join._merge == "both"][['project_id']].nunique()

In [161]:
# Do an inner merge to get only projects we care about.
# Only the two join keys are taken from projects_df: every other projects
# column is discarded again when ea_df is cut back to its original columns,
# so there is no reason to drag them (including the long free-text
# comment_desc) through the join.
ea_df = pd.merge(
    projects_df[['district_code', 'project_id']],
    ea_df,
    on = ['district_code', 'project_id'],
    how = "inner",
)

In [162]:
ea_og_cols = ['district_code', 'expense_authorization_id', 'project_id',
      'ea_assign_date']

In [163]:
# Keep only original columns 
ea_df = ea_df[ea_og_cols]

In [164]:
len(ea_df)

3030

In [165]:
ea_df.project_id.value_counts().describe()

count   2944.00
mean       1.03
std        0.18
min        1.00
25%        1.00
50%        1.00
75%        1.00
max        3.00
Name: project_id, dtype: float64

In [166]:
ea_df.project_id.nunique()

2944

In [167]:
ea_df.project_id.value_counts().head()

5008(072)    3
5932(042)    3
5006(504)    3
5006(635)    3
5953(536)    3
Name: project_id, dtype: int64

In [168]:
ea_df.loc[ea_df.project_id == '5006(635)']

Unnamed: 0,district_code,expense_authorization_id,project_id,ea_assign_date
2381,7,4S6608,5006(635),2009-09-10 13:58:44
2382,7,933575,5006(635),2009-07-02 14:46:18
2383,7,4U4414,5006(635),2009-09-10 13:56:35


In [169]:
# Keep only the most recent EA number so only one EA number per project??
# (open question, see the notes at the top of this section)
# ea_df2 only keeps the most recent row per project_id / district_code.
# The sort is stable (mergesort) so the result is reproducible: when two EAs of
# the same project share an identical ea_assign_date, the row that appears
# first in ea_df is the one that is kept.
ea_df2 = (ea_df
          .sort_values(['ea_assign_date'], ascending = False, kind = 'mergesort')
          .drop_duplicates(subset=['project_id','district_code'])
          .drop(columns = ['ea_assign_date'])
          .reset_index(drop = True)
         )

In [170]:
ea_df2.project_id.nunique()

2944

In [171]:
ea_df2.head()

Unnamed: 0,district_code,expense_authorization_id,project_id
0,4,1Q7614,6204(135)
1,4,985981,6480(026)
2,4,985980,5933(171)
3,9,955175,6142(034)
4,4,985979,5178(016)


In [172]:
ea_df2.expense_authorization_id.nunique()

2873

#### The same EA number matches to multiple projects
* Understand why this is happening
* It seems like the same EA number matches multiple projects that have nothing in common.

In [286]:
ea_df.loc[ea_df.expense_authorization_id == "924969"]

Unnamed: 0,district_code,expense_authorization_id,project_id,ea_assign_date
127,1,924969,5904(114),2011-02-28 10:37:39
404,3,924969,5238(018),1998-06-04 00:00:00
1428,8,924969,NBIL(502),2006-06-23 16:18:52


In [287]:
ea_df.loc[ea_df.expense_authorization_id == "924360"]

Unnamed: 0,district_code,expense_authorization_id,project_id,ea_assign_date
424,8,924360,0061(025),1998-12-23 00:00:00
1248,4,924360,6003(030),2005-05-17 15:25:28


In [175]:
#ea_df2.project_id.nunique() == len(ea_df2)

In [176]:
#len(projects_df6) == len(projects_df)

* Shares the EA of 924360

In [177]:
#projects_df6.loc[projects_df6.project_id == '0061(025)'][preview_cols]

## EFIS_MV_BUD_STRU_94_LVL_3_VW
* Advantage information
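* LP2000 projects only use PEC codes in the 2030 family (the codes seen below, e.g. 2030010 and 2030720, all start with 2030). Note that the query filters with `LIKE '%2030%'`, which matches 2030 anywhere in the code, not only at the start.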
* 10/30: to do, combine the query with efis_join_df

In [178]:
efis_df = pd.read_sql_query(""" 
SELECT 
adv_project_id,
fund_code,
pec_code,
appropriation_category_code,
curr_bud_am,
cash_exp_am,
pect_task_code
FROM EFIS_MV_BUD_STRU_94_LVL_3_VW
WHERE pec_code LIKE '%2030%'
""", engine) 

In [179]:
efis_df.shape

(45227, 7)

In [180]:
efis_df.adv_project_id.nunique()

19552

### Efis Join
* In SQL, filtering out the 9 codes also eliminates rows whose status is null, which means newer projects or projects without a status would be dropped as well. The filter is therefore applied in Python at a later stage, after the nulls have been filled in.


In [181]:
efis_join_df = pd.read_sql_query(""" 
SELECT 
adv_project_id,
project_id,
project_status_code
FROM EFIS_MV_R_PROG_VW 
""", engine) 

* Exclude project status: it reflects only the financial (accounting) status of the project, not its construction or delivery status.

In [182]:
#project_status = pd.read_sql_query(""" 
#SELECT DISTINCT project_status_code, 
#project_status
#FROM ACCOUNTING_EXP_CWA_VW  
#""", engine) 

In [183]:
# project_status

In [184]:
# Have to fill in nans with no status
efis_join_df.project_status_code = efis_join_df.project_status_code.fillna('no status')

In [185]:
# Filter out all 9 codes because this means the project is closed.
# The test is a substring match: any status code containing the character '9'
# anywhere is removed. Rows filled with 'no status' above are kept.
# NOTE(review): confirm that no open status code contains a 9 in another position.
efis_join_df2 = efis_join_df.loc[~efis_join_df.project_status_code.str.contains('9')]

In [186]:
# Drop project status code. This is just about the project status from the 
# Accounting POV
efis_join_df2 = efis_join_df2.drop(columns = ['project_status_code'])

In [187]:
pd.merge(efis_df, efis_join_df2, on = ['adv_project_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

_merge    
left_only     34284
both          10958
right_only       85
dtype: int64

In [188]:
efis_m1 = pd.merge(efis_df, efis_join_df2, on = ['adv_project_id'], how = 'inner')

* 70 project ids are missing after the inner join (4,788 before vs. 4,718 after).

In [189]:
efis_join_df2.project_id.nunique()

4788

In [190]:
efis_m1.project_id.nunique()

4718

## Subset to only the relevant project_ids from `Projects`
* Before manipulating

In [191]:
projects_df_subset = projects_df[['project_id']].drop_duplicates().reset_index(drop = True)

In [192]:
# Make sure it's unique
projects_df_subset.project_id.nunique(), projects_df_subset.shape

(11768, (11768, 1))

In [193]:
projects_df_subset.shape

(11768, 1)

In [194]:
pd.merge(efis_m1, projects_df_subset, on = ['project_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

_merge    
both          10289
right_only     7463
left_only       669
dtype: int64

In [195]:
accounting_df = pd.merge(efis_m1, projects_df_subset, on = ['project_id'], how = 'inner')

In [196]:

accounting_df.project_id.nunique()

4305

In [197]:
accounting_df.project_id.value_counts().head()

6211(130)    32
5908(031)    28
6053(130)    27
6211(131)    27
5006(219)    23
Name: project_id, dtype: int64

## Bring in pect_description for `Projects` -> Double Check
* PEC codes that are not supposed to have a corresponding PECT code have one after my manipulation -> Correct this.
* Also need to keep the second duplicate because it's the more recent one
* From section 2: https://accounting.onramp.dot.ca.gov/manual/7-program-codes
* Double check with LP2000 team this is a correct way of thinking.

In [198]:
def load_pec(excel_file:str, fiscal_year:str = '23/24')-> pd.DataFrame:
    """Build the PEC / PECT lookup table from the accounting manual workbook.

    Parameters
    ----------
    excel_file : str
        Path of the workbook. After snake-casing it must provide the columns
        ``pec``, ``pect`` and ``description`` plus one marker column per fiscal
        year (``'19/20'``, ``'20/21'``, ...).
    fiscal_year : str, default '23/24'
        Name of the marker column to filter on; only rows carrying an ``'X'``
        in that column are kept.

    Returns
    -------
    pd.DataFrame
        Columns ``pec`` (dots removed), ``pect`` (int, 0 where the sheet has no
        PECT), ``pect_description`` and ``program``.

    Raises
    ------
    KeyError
        If ``fiscal_year`` (or one of the required columns) is not in the sheet.
    """
    df = to_snakecase(pd.read_excel(excel_file))

    # Drop rows that are all nan
    df = df.dropna(how='all').reset_index(drop =  True)

    # Keep ONLY rows that have "X" under the requested fiscal year.
    # That means they are still relevant
    df2 = df.loc[df[fiscal_year] == 'X'].reset_index(drop = True)

    # Drop every fiscal-year marker column (labels shaped like '23/24') rather
    # than a hard-coded list, so next year's workbook loads without edits.
    fiscal_year_cols = list(df2.filter(regex = r'^\d{2}/\d{2}$').columns)
    df2 = df2.drop(columns = fiscal_year_cols)

    # Find program ONLY rows: the first row seen for each pec supplies the
    # program name.
    # NOTE(review): this assumes the program-level row is listed before its
    # PECT rows in the sheet — confirm against the workbook.
    program_only = (df2
             .drop_duplicates(subset = ['pec'])
             .dropna(how='all')
             .reset_index(drop = True)
             .drop(columns = ['pect'])
             .rename(columns = {'description':'program'})
            )

    # Merge to get program plus pect
    m1 = pd.merge(df2, program_only, how = "left", on = ['pec'])
    m1 = (m1
          # Highest pect first, so that drop_duplicates keeps that row when a
          # pec/description/program combination is listed more than once.
          .sort_values(['pec','pect'], ascending = [True, False])
          .drop_duplicates(subset=['pec', 'description', 'program'])
          .rename(columns = {'description':'pect_description'})
          .sort_values(['pec','pect'])
          .reset_index(drop = True)
         )

    # regex=False: '.' must be removed as a literal dot. As a regular
    # expression it would match every character.
    m1['pec'] = m1['pec'].str.replace('.', '', regex = False)
    # Rows without a PECT get 0 so the column can be an integer join key.
    m1['pect'] = m1['pect'].fillna(0).astype(int)
    return m1

In [199]:
final_pect = load_pec('section2_pect_2023_2024_FY.xlsx')



In [200]:
final_pect.shape

(799, 4)

In [201]:
final_pect.sample(3)

Unnamed: 0,pec,pect,pect_description,program
720,4050203,845,Dumbarton Bridge RM1,Toll Bridge Program
476,2080385,851,Reimbursement from BATA - Antioch,Reimbursement of Toll Bridge Maintenance and Toll Collection Costs from the Bay Area Toll Authority ( BATA)
486,2080410,0,Lighting,Lighting


### Turn this part to script once finalized

In [202]:
accounting_df.pect_task_code = accounting_df.pect_task_code.fillna(0).astype(int)

In [203]:
pect_df = pd.merge(accounting_df, final_pect, left_on = ['pec_code', 'pect_task_code'], right_on = ['pec', 'pect'], how = 'left')

In [204]:
accounting_df.head(1)

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id
0,1449,42,2030010,809,0.0,-38.99,535,6200(024)


In [205]:
pect_df.sample(3)

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id,pec,pect,pect_description,program
7498,813000007,890,2030010,1112,1645967.06,1645967.06,300,5954(108),2030010,300.0,Highway Bridge,Local Assistance
4227,517000187,890,2030010,2122,82876.0,0.0,650,32L0(084),2030010,650.0,Emergency Relief (ER),Local Assistance
2899,416000110,890,2030010,1516,126000.0,126000.0,300,5094(065),2030010,300.0,Highway Bridge,Local Assistance


In [206]:
len(pect_df), pect_df.project_id.nunique()

(10289, 4305)

In [207]:
# Subset 
pect_df2 = pect_df[['pect_description','curr_bud_am', 'project_id']]

In [208]:
# Need to fill in NA so it'll appear in the pivot properly.
# Fill per column instead of writing 'Unknown' everywhere:
#   * a missing pect_description becomes the 'Unknown' category;
#   * a missing curr_bud_am becomes 0, so the column stays numeric for the
#     sum in the pivot (a string in it would break or corrupt that sum) and the
#     project/program pair is still flagged, exactly like an explicit 0 budget.
pect_df2 = pect_df2.fillna({'pect_description': 'Unknown', 'curr_bud_am': 0})

In [209]:
# Drop duplicates because we only need one PECT description & project_id combo
len(pect_df2.drop_duplicates())

10096

In [210]:
pect_df2 = pect_df2.drop_duplicates().reset_index(drop = True)

In [211]:
# Pivot to one row per project_id with one column per pect_description.
# The summed curr_bud_am values are only placeholders: a later cell replaces
# every numeric cell with 'Yes' and every empty cell with 'No'.
pect_df3 = pect_df2.pivot_table(index=['project_id'], columns='pect_description', 
                    values=['curr_bud_am'], aggfunc='sum')

In [212]:
pect_df3.columns = pect_df3.columns.droplevel()

In [213]:
pect_df3 = pect_df3.reset_index()

In [214]:
pect_df3 = pect_df3.fillna('No')

In [215]:
pect_df3.project_id.nunique(), pect_df2.project_id.nunique(), pect_df.project_id.nunique()

(4305, 4305, 4305)

In [216]:
pect_df3 = to_snakecase(pect_df3)

In [217]:
#pect_df3projects = set(pect_df3.project_id.unique().tolist())
#pect_df2projects = set(pect_df2.project_id.unique().tolist())
#pect_df2projects - pect_df3projects

In [218]:
# pect_df2[pect_df2.project_id == '6115(006)']

In [219]:
# Change numeric cells to 'Yes' (cells already holding 'No' are left alone).
numeric_cells = pect_df3.apply(lambda x : pd.to_numeric(x,errors='coerce')).notnull()
# Never treat the identifier as an amount: a project_id made up only of digits
# would otherwise be overwritten with 'Yes'.
numeric_cells['project_id'] = False
pect_df3 = pect_df3.mask(numeric_cells, 'Yes')

In [220]:
pd.merge(pect_df3, projects_df, on = ['project_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

_merge    
right_only    7463
both          4305
left_only        0
dtype: int64

In [221]:
# Remove project_id to fill in unknowns later on
pect_code_cols = list(pect_df3.columns)
pect_code_cols.remove('project_id')

In [222]:
# Update projects
project_df = pd.merge(projects_df, pect_df3, on = ['project_id'], how = 'left')

In [223]:
project_df.project_id.nunique()

11768

In [224]:
# Fill in unknown
project_df[pect_code_cols] = project_df[pect_code_cols].fillna('Unknown')

### Double check

In [225]:
pect_df2.loc[pect_df2.project_id == '5918(101)']

Unnamed: 0,pect_description,curr_bud_am,project_id
1610,Highway Bridge,690839.49,5918(101)
1611,"Earmarks Projects (HPP, DEMO CPFCDS, etc.)",238679.79,5918(101)
1612,Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP),0.0,5918(101)
1613,Highway Bridge,472887.51,5918(101)


In [226]:
project_df.loc[project_df.project_id == '5918(101)'].style.where(lambda val: 'Yes' in str(val), 'color: red')

Unnamed: 0,project_id,comment_desc,district_code,est_total_prj_costs,location_name,project_label_name,original_post_mile_begin_id,original_post_mile_end_id,revised_post_mile_begin_ind,revised_post_mile_end_ind,route_name,state_hwy_ind,senate_district_code,update_date_time,agency_name,urban_area_code,county_name,work_type_desc,category_desc,current_phase,active_transportation_program_(atp),bridge_inspection_&_scour_evaluation,covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation,carbon_reduction_program_(crp),congestion_mitigation_&_air_quality_improvement_program_(cmaq),coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds,corridor_mobility_improvement_account_(cmia)_program,county_exchange_funds,county_state_match_program,"earmarks_projects_(hpp,_demo_cpfcds,_etc.)",emergency_relief_(er),ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program,"funds_for_planning,_programming_and_monitoring_-_rip",general_funded_designated_programs,hazard_elimination_safety_(hes),high_risk_rural_roads_program_(hr3),highway_bridge_,highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund,highway_safety_improvement_program_(hsip)_(non-infrastructure),highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund,local_partnership_program_(lpp_–_competitive)_,local_roads,local_roads_rehabilitation,railroad_grade_crossing_protection,railroad_grade_separations,"rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_",regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system),regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip),regional_transportation_planning_agency_(rtpa)_stp_match_exchange,sb1_funded_freeway_service_patrol,shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds,safe_routes_t
o_school_(sr2s_and_srts),set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act,solutions_for_congested_corridors_program_(sccp),special_programs,state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic),structures_seismic_retrofit_,trade_corridor_enhancement_account_(tcea)_programs_–_local_share,trade_corridor_enhancement_account_(tcea)_programs_–_state_share,trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads,traffic_congestion_relief_program_(_tcrp_),unknown
1413,5918(101),"4-26-2023: told Neal Hay to do a BAR request and that he cannot ask for more than what was lapsed - JC 1/10/22: TCT JWalton adv of CWA expring and funds lapsing. need invoice by Apr 1, 2022. ab 8/2/17: email SRiddle re: inactive status. ab County will seek to replace (SR= 53.6)",3,,"On Howsley Road, 1.02 Mile East of State Route 99, Br",Bridge Replacement,,,,,0-CR,N,,2023-04-26 15:16:25,Sutter County,,Sutter County,Bridge Replacement - No Added Capacity,Bridge Replacement,single phase,No,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,No,No,No


## Phase_Funding Table

### Bring in fund_code
* These fund codes were shared by Brian via his contacts in Accounting. Have to request this each fiscal year?

In [227]:
def load_fund_codes(excel_file:str)->pd.DataFrame:
    """Load the fund-code lookup used to label accounting rows.

    Parameters
    ----------
    excel_file : str
        Path of the workbook to read. It must contain the columns ``'0001'``
        (fund codes) and ``'General Fund'`` (fund names).

    Returns
    -------
    pd.DataFrame
        The sheet with ``'0001'`` converted to zero-padded 4-character strings
        and ``'General Fund'`` renamed to ``general_fund``.
    """
    # Read the workbook the caller asked for. The path used to be hard-coded
    # here, which silently ignored the ``excel_file`` argument.
    # NOTE(review): the column labels '0001' / 'General Fund' look like the
    # sheet's first data row being read as the header, in which case fund 0001
    # is missing from this lookup — confirm against the workbook.
    df = pd.read_excel(excel_file)

    # Pad codes to four characters. Going through str also accepts codes that
    # Excel stored as text.
    df['0001'] = df['0001'].astype(str).str.zfill(4)
    df = df.rename(columns = {'General Fund':'general_fund'})

    return df

In [228]:
fund_codes = load_fund_codes("lp2000_2023_fund_codes.xlsx")

In [229]:
fund_codes.sample(3)

Unnamed: 0,0001,general_fund
3,183,Environmental Enhanc & Mitigat Prgm Fd
10,3291,"Trade Corridor Enhancement Account, STF"
1,45,Bicycle Transportation Account


In [230]:
fund_phase_df = pd.merge(accounting_df, fund_codes, left_on = ['fund_code'], right_on = ['0001'], how = 'left')
fund_phase_df = fund_phase_df.drop(columns = ['0001'])

In [231]:
fund_phase_df.project_id.nunique()

4305

In [232]:
fund_phase_df.general_fund = fund_phase_df.general_fund.fillna('Unknown')

In [233]:
fund_phase_df.general_fund.value_counts()

Federal Trust Fund                                   7647
State Highway Account                                1575
Road Maintenance & Rehabilitation Account, STF        433
Unknown                                               402
Local Bridge Seismic Retrofit Acct                     73
Environmental Enhanc & Mitigat Prgm Fd                 56
Transportation Investment Fund                         29
Transportation Deferred Investment Fund                19
Trade Corridor Enhancement Account, STF                18
Trade Corridors Improvement Fund                       13
Traffic Congestion Relief Fund                          8
State-Local Partnership Program Acct                    7
Highway Safety,Rehabilitation,& Preservation Acct       7
Corridor Mobility Improvement Account                   1
Transportation Faciilities Account                      1
Name: general_fund, dtype: int64

In [288]:
# Total budget and cash expenditure per project and fund.
fund_phase_df_pivot1 = (
    fund_phase_df
    .groupby(['project_id', 'general_fund'], as_index = False)
    .agg(
        single_phase_cost = ('curr_bud_am', 'sum'),
        single_phase_expenditure = ('cash_exp_am', 'sum'),
    )
)

In [235]:
fund_phase_df_pivot1.head()

Unnamed: 0,project_id,general_fund,single_phase_cost,single_phase_expenditure
0,0001(002),Federal Trust Fund,20743617.04,20743617.04
1,0014(005),Federal Trust Fund,879983.23,879983.23
2,0027(012),Federal Trust Fund,12830458.87,12830458.87
3,0061(025),Federal Trust Fund,2595722.0,2595722.0
4,15A5(001),Federal Trust Fund,849820.3,700737.01


In [236]:
# Project-level totals only (no per-fund breakdown).
fund_phase_df_pivot2 = (
    fund_phase_df
    .groupby(['project_id'], as_index = False)
    .agg(
        single_phase_cost = ('curr_bud_am', 'sum'),
        single_phase_expenditure_amt = ('cash_exp_am', 'sum'),
    )
)

In [237]:
fund_phase_df_pivot2.head()

Unnamed: 0,project_id,single_phase_cost,single_phase_expenditure_amt
0,0001(002),20743617.04,20743617.04
1,0014(005),879983.23,879983.23
2,0027(012),12830458.87,12830458.87
3,0061(025),2595722.0,2595722.0
4,15A5(001),849820.3,700737.01


In [238]:
fund_phase_df_pivot2.project_id.nunique()

4305

In [239]:
# Pivot so general_fund will be the column names
# https://stackoverflow.com/questions/22798934/pandas-long-to-wide-reshape-by-two-variables
fund_phase_df_pivot1 = fund_phase_df_pivot1.pivot_table(index=['project_id'], columns='general_fund', 
                    values=['single_phase_cost'], aggfunc='sum')

In [240]:
fund_phase_df_pivot1.columns = fund_phase_df_pivot1.columns.droplevel()

In [241]:
fund_phase_df_pivot1 = fund_phase_df_pivot1.reset_index()

In [242]:
fund_phase_df_pivot1 = to_snakecase(fund_phase_df_pivot1)

In [243]:
# Find state fund only columns
state_only_columns = list((fund_phase_df_pivot1.filter(regex='account|fd|acct|fund').columns))

In [244]:
state_only_columns

['corridor_mobility_improvement_account',
 'environmental_enhanc_&_mitigat_prgm_fd',
 'federal_trust_fund',
 'highway_safety,rehabilitation,&_preservation_acct',
 'local_bridge_seismic_retrofit_acct',
 'road_maintenance_&_rehabilitation_account,_stf',
 'state_highway_account',
 'state-local_partnership_program_acct',
 'trade_corridor_enhancement_account,_stf',
 'trade_corridors_improvement_fund',
 'traffic_congestion_relief_fund',
 'transportation_deferred_investment_fund',
 'transportation_faciilities_account',
 'transportation_investment_fund']

In [245]:

# The federal trust fund is not state money, so take it out of the list.
# Written as a filter rather than list.remove so the cell can be re-run
# without raising ValueError once the entry is already gone.
state_only_columns = [col for col in state_only_columns if col != 'federal_trust_fund']

In [246]:
# Sum up all the state only funds
fund_phase_df_pivot1['total_state_funds'] = fund_phase_df_pivot1[state_only_columns].sum(axis = 1).fillna(0)

In [247]:
# Mask integers with bool
fund_phase_df_pivot_bool = fund_phase_df_pivot1.fillna('No')

In [248]:
fund_phase_df_pivot_bool = fund_phase_df_pivot_bool.mask(fund_phase_df_pivot_bool.apply(lambda x : pd.to_numeric(x,errors='coerce')).notnull(),'Yes')

In [249]:
# Merge again so projects will have total budgeted amount
# for the single phase and expenditure
final_fund_phase_df = pd.merge(fund_phase_df_pivot1, fund_phase_df_pivot2, on = ['project_id'])

In [250]:
# Make it clear about total federal funds
final_fund_phase_df['total_federal_funds'] = final_fund_phase_df.federal_trust_fund

In [251]:
final_fund_phase_df.shape

(4305, 20)

In [252]:
final_fund_phase_df.project_id.nunique()

4305

In [253]:
# Tag whether something is funded by state/federal/both
def is_state_funds(row):
    """Return "Yes" when ``row.total_state_funds`` is greater than zero, otherwise "No".

    Zero, negative and NaN totals all give "No" (NaN never compares greater
    than 0).
    """
    return "Yes" if row.total_state_funds > 0 else "No"

In [254]:
def is_fed_funds(row):
    """Return "Yes" when ``row.total_federal_funds`` is greater than zero, otherwise "No".

    Zero, negative and NaN totals all give "No" (NaN never compares greater
    than 0).
    """
    return "Yes" if row.total_federal_funds > 0 else "No"

In [255]:
final_fund_phase_df["is_state"] = final_fund_phase_df.apply(is_state_funds, axis=1)

In [256]:
final_fund_phase_df["is_federal"] = final_fund_phase_df.apply(is_fed_funds, axis=1)

In [257]:
final_fund_phase_df = final_fund_phase_df.fillna(0)

### Double Checking
* Make sure the project flag is correct

In [258]:
fund_phase_df.project_id.value_counts().head()

6211(130)    32
5908(031)    28
6053(130)    27
6211(131)    27
5006(219)    23
Name: project_id, dtype: int64

In [259]:
final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].style.where(lambda val: 'Yes' in str(val), 'color: red')

Unnamed: 0,project_id,corridor_mobility_improvement_account,environmental_enhanc_&_mitigat_prgm_fd,federal_trust_fund,"highway_safety,rehabilitation,&_preservation_acct",local_bridge_seismic_retrofit_acct,"road_maintenance_&_rehabilitation_account,_stf",state_highway_account,state-local_partnership_program_acct,"trade_corridor_enhancement_account,_stf",trade_corridors_improvement_fund,traffic_congestion_relief_fund,transportation_deferred_investment_fund,transportation_faciilities_account,transportation_investment_fund,unknown,total_state_funds,single_phase_cost,single_phase_expenditure_amt,total_federal_funds,is_state,is_federal
3209,5944(068),0.0,0.0,5412383.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85000.0,0.0,85000.0,5497383.39,5497383.39,5412383.39,Yes,Yes


In [260]:
fund_phase_df.loc[fund_phase_df.project_id == '5944(068)']

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id,general_fund
3804,500000588,890,2030010,203,630485.13,630485.13,300,5944(068),Federal Trust Fund
3805,500000588,3008,2030600,506,85000.0,85000.0,620,5944(068),Transportation Investment Fund
3806,500000588,890,2030010,910,809514.72,809514.72,300,5944(068),Federal Trust Fund
3807,500000588,890,2030010,1213,1001729.0,1001729.0,300,5944(068),Federal Trust Fund
3808,500000588,890,2030010,1516,2970654.54,2970654.54,300,5944(068),Federal Trust Fund


In [261]:
fund_phase_df.loc[(fund_phase_df.project_id == '5944(068)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()

curr_bud_am   5412383.39
dtype: float64

In [262]:
fund_phase_df.loc[fund_phase_df.project_id == '5944(068)'][['curr_bud_am']].sum()

curr_bud_am   5497383.39
dtype: float64

In [263]:
final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].total_state_funds + final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].federal_trust_fund

3209   5497383.39
dtype: float64

In [264]:
final_fund_phase_df.loc[final_fund_phase_df.project_id == '5006(219)']

Unnamed: 0,project_id,corridor_mobility_improvement_account,environmental_enhanc_&_mitigat_prgm_fd,federal_trust_fund,"highway_safety,rehabilitation,&_preservation_acct",local_bridge_seismic_retrofit_acct,"road_maintenance_&_rehabilitation_account,_stf",state_highway_account,state-local_partnership_program_acct,"trade_corridor_enhancement_account,_stf",trade_corridors_improvement_fund,traffic_congestion_relief_fund,transportation_deferred_investment_fund,transportation_faciilities_account,transportation_investment_fund,unknown,total_state_funds,single_phase_cost,single_phase_expenditure_amt,total_federal_funds,is_state,is_federal
383,5006(219),0.0,0.0,32967253.86,0.0,229400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,229400.0,33196653.86,32534546.43,32967253.86,Yes,Yes


In [265]:
fund_phase_df.loc[(fund_phase_df.project_id == '5006(219)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()

curr_bud_am   32967253.86
dtype: float64

In [266]:
229400.00 + 32967253.86

33196653.86

In [267]:
fund_phase_df.loc[fund_phase_df.project_id ==  '5006(219)']

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id,general_fund
5734,700001158,890,2030010,506,1000000.0,1000000.0,810,5006(219),Federal Trust Fund
5735,700001158,890,2030010,1415,0.0,0.0,300,5006(219),Federal Trust Fund
5736,700001158,890,2030010,809,1691542.0,1691542.0,810,5006(219),Federal Trust Fund
5737,700001158,890,2030010,1011,25448.42,25448.42,810,5006(219),Federal Trust Fund
5738,700001158,890,2030010,1112,20206009.54,20206009.54,300,5006(219),Federal Trust Fund
5739,700001158,890,2030010,1314,3216979.12,3216979.12,300,5006(219),Federal Trust Fund
5740,700001158,890,2030010,1415,154672.27,154672.27,300,5006(219),Federal Trust Fund
5741,700001158,890,2030010,1617,608787.0,294068.82,300,5006(219),Federal Trust Fund
5742,700001158,890,2030010,1920,1876299.0,1653630.55,300,5006(219),Federal Trust Fund
5743,700001158,890,2030010,1819,106000.0,94331.53,300,5006(219),Federal Trust Fund


## Awards Table
* Appropriation code is the fiscal year of award


In [268]:
pect_df.sample()

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id,pec,pect,pect_description,program
858,214000121,890,2030010,1617,762938.0,435821.6,560,5905(099),2030010,560.0,High Risk Rural Roads Program (HR3),Local Assistance


In [269]:
accounting_df.sample()

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id
5928,700020294,890,2030010,1112,554663.0,554663.0,690,5953(650)


In [270]:
# One row per project and program, keeping the largest appropriation code
# (treated in this notebook as the most recent fiscal year of award).
# NOTE(review): max() compares the codes as they are stored; if they are text
# such as '9899' vs '2223', the "most recent" year could be wrong — confirm.
# NOTE(review): rows whose program is missing (no PEC/PECT match) fall out of
# the groupby — confirm that this is intended.
awards_df = (
    pect_df
    .groupby(['project_id', 'program'], as_index = False)
    .agg(state_fiscal_awarded_year = ('appropriation_category_code', 'max'))
    .rename(columns = {'program': 'grant_program'})
)

## Checks

In [271]:
awards_df.project_id.value_counts().head()

5182(058)    3
5288(046)    3
5475(038)    3
6066(140)    3
6090(059)    3
Name: project_id, dtype: int64

In [272]:
awards_df.loc[awards_df.project_id == "5182(058)"]

Unnamed: 0,project_id,grant_program,state_fiscal_awarded_year
1546,5182(058),Active Transportation Program (ATP),2223
1547,5182(058),Local Assistance,2223
1548,5182(058),"Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017",2122


In [273]:
# Check original df 
pect_df.loc[pect_df.project_id == "5182(058)"]

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id,pec,pect,pect_description,program
1412,312000145,890,2030720,2223,4318000.0,0.0,100,5182(058),2030720,100.0,Active Transportation Program (ATP),Active Transportation Program (ATP)
1413,312000145,42,2030210,2122,6239000.0,0.0,350,5182(058),2030210,350.0,Solutions for Congested Corridors Program (SCCP),"Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017"
1414,312000145,890,2030010,1011,456704.0,456704.0,820,5182(058),2030010,820.0,Congestion Mitigation & Air Quality Improvement Program (CMAQ),Local Assistance
1415,312000145,890,2030010,1112,0.0,0.0,820,5182(058),2030010,820.0,Congestion Mitigation & Air Quality Improvement Program (CMAQ),Local Assistance
1416,312000145,890,2030010,1213,54423.24,54423.24,820,5182(058),2030010,820.0,Congestion Mitigation & Air Quality Improvement Program (CMAQ),Local Assistance
1417,312000145,890,2030010,1920,50000.0,13000.0,820,5182(058),2030010,820.0,Congestion Mitigation & Air Quality Improvement Program (CMAQ),Local Assistance
1418,312000145,890,2030010,2223,333821.0,0.0,820,5182(058),2030010,820.0,Congestion Mitigation & Air Quality Improvement Program (CMAQ),Local Assistance
1419,312000145,890,2030010,1516,34991.76,34991.76,820,5182(058),2030010,820.0,Congestion Mitigation & Air Quality Improvement Program (CMAQ),Local Assistance


In [274]:
# Colour red every cell whose string form contains "Yes" for this project.
# `Styler.where` was deprecated in pandas 1.3 and removed in 2.0. Element-wise
# styling is `Styler.applymap`, renamed `Styler.map` in pandas 2.1, so use
# whichever of the two this pandas version provides.
flagged_styler = project_df[project_df.project_id == "5288(046)"].style
color_cells = getattr(flagged_styler, "map", None) or flagged_styler.applymap
color_cells(lambda val: 'color: red' if 'Yes' in str(val) else '')

Unnamed: 0,project_id,comment_desc,district_code,est_total_prj_costs,location_name,project_label_name,original_post_mile_begin_id,original_post_mile_end_id,revised_post_mile_begin_ind,revised_post_mile_end_ind,route_name,state_hwy_ind,senate_district_code,update_date_time,agency_name,urban_area_code,county_name,work_type_desc,category_desc,current_phase,active_transportation_program_(atp),bridge_inspection_&_scour_evaluation,covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation,carbon_reduction_program_(crp),congestion_mitigation_&_air_quality_improvement_program_(cmaq),coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds,corridor_mobility_improvement_account_(cmia)_program,county_exchange_funds,county_state_match_program,"earmarks_projects_(hpp,_demo_cpfcds,_etc.)",emergency_relief_(er),ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program,"funds_for_planning,_programming_and_monitoring_-_rip",general_funded_designated_programs,hazard_elimination_safety_(hes),high_risk_rural_roads_program_(hr3),highway_bridge_,highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund,highway_safety_improvement_program_(hsip)_(non-infrastructure),highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund,local_partnership_program_(lpp_–_competitive)_,local_roads,local_roads_rehabilitation,railroad_grade_crossing_protection,railroad_grade_separations,"rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_",regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system),regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip),regional_transportation_planning_agency_(rtpa)_stp_match_exchange,sb1_funded_freeway_service_patrol,shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds,safe_routes_t
o_school_(sr2s_and_srts),set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act,solutions_for_congested_corridors_program_(sccp),special_programs,state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic),structures_seismic_retrofit_,trade_corridor_enhancement_account_(tcea)_programs_–_local_share,trade_corridor_enhancement_account_(tcea)_programs_–_state_share,trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads,traffic_congestion_relief_program_(_tcrp_),unknown
7834,5288(046),"Data Migrated from CTIPS : The Project Planning Id are: 1785; The locations are :In Folsom on White Rock Road in the vicinity of the Scott Road Intersection. Widen 1 mile of 4-lane roadway and signalize 1 Intersection.; 2/13/2020: This project is the same as STPL-6498(003). Agency is determining whether CMGC negotiations will be viable via the JPA and if not, project will be turned over to City of Folsom to implement/construct. CR 2/13/2020: This project is the same as Project has $10,000 LPP and $15,000 RIP/STIP. 8/22/22: Cost adj to correct local funds to local AC $6,201,500. Erroneously entered as local funds in prior sequence.",3,25750000.0,"In City of Folsom, on White Rock Road from Prairie City Road to East Bidwell Street.",Construct 4 lane road with 8 foot shoulders,,,,,0-FOL,N,,2023-10-27 10:33:06,Folsom,3067,Sacramento County,,Roadway Widening,single phase,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,Yes,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,No,No,No


In [275]:
# Show every award row recorded for a second frequently occurring project ID.
awards_df[awards_df["project_id"] == "5475(038)"]

Unnamed: 0,project_id,grant_program,state_fiscal_awarded_year
2416,5475(038),Active Transportation Program (ATP),2223
2417,5475(038),Local Assistance,2223
2418,5475(038),"Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017",2122


In [276]:
# Cross-check against the rows for the same project in the original pect_df.
pect_df[pect_df["project_id"] == "5475(038)"]

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id,pec,pect,pect_description,program
1639,315000005,890,2030720,2223,1512000.0,0.0,100,5475(038),2030720,100.0,Active Transportation Program (ATP),Active Transportation Program (ATP)
1640,315000005,42,2030210,2122,2860000.0,0.0,350,5475(038),2030210,350.0,Solutions for Congested Corridors Program (SCCP),"Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017"
1641,315000005,890,2030010,1314,1061999.97,1061999.97,820,5475(038),2030010,820.0,Congestion Mitigation & Air Quality Improvement Program (CMAQ),Local Assistance
1642,315000005,890,2030010,1516,2898000.0,2898000.0,820,5475(038),2030010,820.0,Congestion Mitigation & Air Quality Improvement Program (CMAQ),Local Assistance
1643,315000005,890,2030010,2223,9552155.0,0.0,810,5475(038),2030010,810.0,Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP),Local Assistance


In [277]:
# Colour red every cell whose string form contains "Yes" for this project.
# `Styler.where` was deprecated in pandas 1.3 and removed in 2.0. Element-wise
# styling is `Styler.applymap`, renamed `Styler.map` in pandas 2.1, so use
# whichever of the two this pandas version provides.
flagged_styler = project_df[project_df.project_id == "5475(038)"].style
color_cells = getattr(flagged_styler, "map", None) or flagged_styler.applymap
color_cells(lambda val: 'color: red' if 'Yes' in str(val) else '')

Unnamed: 0,project_id,comment_desc,district_code,est_total_prj_costs,location_name,project_label_name,original_post_mile_begin_id,original_post_mile_end_id,revised_post_mile_begin_ind,revised_post_mile_end_ind,route_name,state_hwy_ind,senate_district_code,update_date_time,agency_name,urban_area_code,county_name,work_type_desc,category_desc,current_phase,active_transportation_program_(atp),bridge_inspection_&_scour_evaluation,covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation,carbon_reduction_program_(crp),congestion_mitigation_&_air_quality_improvement_program_(cmaq),coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds,corridor_mobility_improvement_account_(cmia)_program,county_exchange_funds,county_state_match_program,"earmarks_projects_(hpp,_demo_cpfcds,_etc.)",emergency_relief_(er),ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program,"funds_for_planning,_programming_and_monitoring_-_rip",general_funded_designated_programs,hazard_elimination_safety_(hes),high_risk_rural_roads_program_(hr3),highway_bridge_,highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund,highway_safety_improvement_program_(hsip)_(non-infrastructure),highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund,local_partnership_program_(lpp_–_competitive)_,local_roads,local_roads_rehabilitation,railroad_grade_crossing_protection,railroad_grade_separations,"rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_",regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system),regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip),regional_transportation_planning_agency_(rtpa)_stp_match_exchange,sb1_funded_freeway_service_patrol,shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds,safe_routes_t
o_school_(sr2s_and_srts),set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act,solutions_for_congested_corridors_program_(sccp),special_programs,state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic),structures_seismic_retrofit_,trade_corridor_enhancement_account_(tcea)_programs_–_local_share,trade_corridor_enhancement_account_(tcea)_programs_–_state_share,trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads,traffic_congestion_relief_program_(_tcrp_),unknown
2664,5475(038),"10/1/2020: Original AED date was 9/30/2020, new sequence being done to extend date to 9/30/2022. There will be a gap of time that is not reimbursable. CR CMAQ Emissions Benefit: .03 ROG, .02 NOx, .01 PM10 Project has EPSP approval for $2,646,524 of CMAQ for R/W to 15/16 FY. And EPSP for $291,476 of CMAQ for PE to 15/16 FY.",3,36291000.0,"Auburn Blvd. Complete Streets - Phase 2. On Auburn Blvd, in Citrus Heights from Rusch Park to Northern City Limits.",Pedestrian and Bike Path,,,,,0-CHts,N,,2023-10-13 13:34:21,Citrus Heights,3067,Sacramento County,,Pedestrian and Bike Path,single phase,Yes,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No


## Save to Excel/Final Touches

In [278]:
# Writing to Excel fails with IllegalCharacterError when a cell contains an
# ASCII control character, see
# https://stackoverflow.com/questions/28837057/pandas-writing-an-excel-file-containing-unicode-illegalcharactererror
# Remove only those characters. The earlier `unicode_escape` round trip also
# rewrote every non-ASCII character, newline and backslash as a literal escape
# sequence (an en dash became the six characters \u2013) and escaped the
# backslashes again each time the cell was re-run; this version is idempotent.
ILLEGAL_EXCEL_CHARS = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")


def strip_illegal_excel_chars(value):
    """Return `value` with Excel-illegal control characters removed.

    Non-string values (numbers, timestamps, NaN, ...) are returned unchanged.
    """
    return ILLEGAL_EXCEL_CHARS.sub("", value) if isinstance(value, str) else value


# Column-wise `Series.map` applies the function to every cell, like
# `DataFrame.applymap`, but is not affected by the deprecation of `applymap`
# in pandas 2.1.
project_df = project_df.apply(lambda column: column.map(strip_illegal_excel_chars))

In [279]:
# Row and column counts of the frame that is about to be exported.
row_count, column_count = project_df.shape
(row_count, column_count)

(11768, 62)

In [280]:
# Number of distinct project IDs; compare with the row count above to see
# whether each project occupies exactly one row.
project_df["project_id"].nunique()

11768

In [281]:
# Split the county information into its own table, keyed by project.
county_columns = ['project_id', 'project_label_name', 'county_name']
county_df = project_df.loc[:, county_columns]

In [282]:
# Split the district information into its own table, keyed by project.
district_columns = ['project_id', 'project_label_name', 'district_code']
district_df = project_df.loc[:, district_columns]

In [283]:
# County and district now live in their own tables; `unknown` is dropped from
# the project sheet as well.
split_off_columns = ['county_name', 'district_code', 'unknown']
project_df = project_df.drop(split_off_columns, axis="columns")

In [284]:

# Export every table to its own sheet of a single workbook; sheets are written
# in the order listed here.
sheets = {
    "project": project_df,
    "county": county_df,
    "district": district_df,
    "awards": awards_df,
    "phase_funding": final_fund_phase_df,
}
with pd.ExcelWriter("./LP2000.xlsx") as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)
