In [1]:
import pandas as pd 
import sqlalchemy 
import sys 
import re
import oracledb 
import _database_utils as _utils 

In [2]:
# Register python-oracledb under the module name "cx_Oracle", so any later
# `import cx_Oracle` (e.g. from a library that only knows the old driver name)
# resolves to oracledb instead.
# NOTE(review): the version string is overridden to "8.3.0", presumably to
# satisfy a minimum-version check on cx_Oracle made by SQLAlchemy — confirm
# against the installed SQLAlchemy release.
oracledb.version = "8.3.0" 
sys.modules["cx_Oracle"] = oracledb 

In [3]:
# Notebook-wide display settings: never truncate rows or cell text, show up to
# 100 columns, and render floats with two decimals.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

In [5]:
DIALECT = 'oracle'  

In [6]:
# Use SB1_READONLY login
# NOTE(review): USERNAME, PASSWORD, HOST, PORT and SERVICE are not defined in
# any cell shown in this notebook (execution counts jump from In [3] to
# In [5]), so they must already exist in the kernel before this cell runs.
# Consider reading them from environment variables so a fresh-kernel run works
# and no credentials are stored in the notebook.
# NOTE(review): despite the "WIN_AUTH" name, this URL carries an explicit
# username and password.
ENGINE_PATH_WIN_AUTH =  f"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}" 

In [7]:
engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   

## Projects
* Should I be filling in NA of comment_desc with other columns? Otherwise this information is going to be lost.

In [8]:
# Active projects only, with the agency, county, work-type and category
# lookup tables LEFT JOINed on so that projects without a match are kept.
projects_sql = """ 
SELECT 
projects.project_id,
projects.county_code,
projects.comment_desc,
projects.district_code, 
projects.est_total_prj_costs,
projects.location_name,
projects.project_label_name,
projects.original_post_mile_begin_id,
projects.original_post_mile_end_id,
projects.revised_post_mile_begin_ind,
projects.revised_post_mile_end_ind,
projects.route_name,
projects.state_hwy_ind,
projects.senate_district_code,
projects.project_category_type_code,
projects.work_type_code,
projects.update_date_time,
local_agencies.agency_name,
local_agencies.urban_area_code,
counties.county_name,
work_types.work_type_desc,
project_category_type_codes.category_desc
FROM projects 
LEFT JOIN local_agencies ON projects.agency_code = local_agencies.agency_code
LEFT JOIN counties ON projects.county_code = counties.county_code
LEFT JOIN work_types ON projects.work_type_code = work_types.work_type_code
LEFT JOIN project_category_type_codes ON projects.project_category_type_code = project_category_type_codes.category_code
WHERE projects.status_code = 'Active'
"""
projects_df = pd.read_sql_query(sql=projects_sql, con=engine)

In [9]:
projects_df.comment_desc = projects_df.comment_desc.fillna(projects_df.category_desc)

In [10]:
projects_df.comment_desc = projects_df.comment_desc.fillna(projects_df.work_type_desc)

In [11]:
projects_df = projects_df.drop(columns = ['work_type_code','project_category_type_code', 'county_code'])

In [12]:
projects_df['current_phase'] = 'single phase'

In [13]:
projects_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11272 entries, 0 to 11271
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   project_id                   11272 non-null  object        
 1   comment_desc                 10399 non-null  object        
 2   district_code                11271 non-null  object        
 3   est_total_prj_costs          1329 non-null   float64       
 4   location_name                10906 non-null  object        
 5   project_label_name           10414 non-null  object        
 6   original_post_mile_begin_id  734 non-null    float64       
 7   original_post_mile_end_id    570 non-null    float64       
 8   revised_post_mile_begin_ind  20 non-null     object        
 9   revised_post_mile_end_ind    15 non-null     object        
 10  route_name                   10950 non-null  object        
 11  state_hwy_ind                10909 non-nu

## EA Number
* Projects can have multiple EA numbers.
* Should we keep the most recent EA or all of them -> Ask Tony Hunt.

In [14]:
ea_df = pd.read_sql_query(""" 
SELECT 
project_id, 
district_code,
ea_assign_date, 
expense_authorization_id 
FROM expense_authorizations
""", engine) 

In [15]:
ea_df.shape, ea_df.project_id.nunique()

((49431, 4), 24130)

In [16]:
# Do an outer join to understand what's going on under the hood
# outer_join = pd.merge(ea_df, projects_df4, on = ['district_code','project_id'], how = "outer", indicator = True)

In [17]:
# outer_join[['_merge']].value_counts()

In [18]:
# Understand why there are more rows compared to project_ids that are unique
# outer_join.loc[outer_join._merge == "both"][['project_id']].nunique()

In [19]:
# Do an inner merge to get only projects we care about
ea_df = pd.merge(projects_df, ea_df, on = ['district_code','project_id'], how = "inner")

In [20]:
ea_og_cols = ['district_code', 'expense_authorization_id', 'project_id',
      'ea_assign_date']

In [21]:
# Keep only original columns 
ea_df = ea_df[ea_og_cols]

In [22]:
len(ea_df)

2961

In [23]:
ea_df.project_id.value_counts().describe()

count   2880.00
mean       1.03
std        0.17
min        1.00
25%        1.00
50%        1.00
75%        1.00
max        3.00
Name: project_id, dtype: float64

In [24]:
ea_df.project_id.nunique()

2880

In [25]:
ea_df.project_id.value_counts().head()

5008(072)    3
5953(536)    3
5932(042)    3
5006(504)    3
5435(010)    2
Name: project_id, dtype: int64

In [26]:
ea_df.loc[ea_df.project_id == '5006(635)']

Unnamed: 0,district_code,expense_authorization_id,project_id,ea_assign_date


In [27]:
# Open question: keep just one EA number per project?
# For now ea_df2 holds, for each (project_id, district_code) pair, only the
# row with the latest ea_assign_date; the date column is dropped afterwards.
ea_df2 = (
    ea_df
    .sort_values(by="ea_assign_date", ascending=False)
    .drop_duplicates(subset=["project_id", "district_code"], keep="first")
    .drop(columns="ea_assign_date")
    .reset_index(drop=True)
)

In [28]:
ea_df2.project_id.nunique()

2880

In [29]:
ea_df2.head()

Unnamed: 0,district_code,expense_authorization_id,project_id
0,4,1Q7614,6204(135)
1,4,985981,6480(026)
2,4,985980,5933(171)
3,9,955175,6142(034)
4,4,985979,5178(016)


In [30]:
ea_df2.expense_authorization_id.nunique()

2814

#### The same EA number matches to multiple projects
* Understand why this is happening
* It seems like the same EA number matches multiple projects that have nothing in common.

In [31]:
ea_df.loc[ea_df.expense_authorization_id == "924969"]

Unnamed: 0,district_code,expense_authorization_id,project_id,ea_assign_date
120,1,924969,5904(114),2011-02-28 10:37:39
390,3,924969,5238(018),1998-06-04 00:00:00
1396,8,924969,NBIL(502),2006-06-23 16:18:52


In [32]:
ea_df.loc[ea_df.expense_authorization_id == "924360"]

Unnamed: 0,district_code,expense_authorization_id,project_id,ea_assign_date
409,8,924360,0061(025),1998-12-23 00:00:00
1218,4,924360,6003(030),2005-05-17 15:25:28


In [33]:
#ea_df2.project_id.nunique() == len(ea_df2)

In [34]:
#len(projects_df6) == len(projects_df)

* Shares the EA of 924360

In [35]:
#projects_df6.loc[projects_df6.project_id == '0061(025)'][preview_cols]

## EFIS_MV_BUD_STRU_94_LVL_3_VW
* Advantage information
* LP2000 projects only use pec_code 2030
* 10/30: to do, combine the query with efis_join_df

In [36]:
# Pull budget / expenditure rows from the Advantage (EFIS) view, limited to
# PEC codes that contain "2030".
# NOTE(review): LIKE '%2030%' matches "2030" anywhere in pec_code, not only as
# a prefix. If only codes that start with 2030 are wanted, '2030%' would be
# stricter — confirm before changing.
efis_df = pd.read_sql_query(""" 
SELECT 
adv_project_id,
fund_code,
pec_code,
appropriation_category_code,
curr_bud_am,
cash_exp_am,
pect_task_code
FROM EFIS_MV_BUD_STRU_94_LVL_3_VW
WHERE pec_code LIKE '%2030%'
""", engine) 

In [37]:
efis_df.shape

(45666, 7)

In [38]:
efis_df.adv_project_id.nunique()

19821

### Efis Join
* Filtering out the 9 status codes in SQL would also eliminate rows whose status is NULL, which means newer projects or projects without a status would be dropped as well, so this filter is applied in Python at a later stage instead.


In [39]:
efis_join_df = pd.read_sql_query(""" 
SELECT 
adv_project_id,
project_id,
project_status_code
FROM EFIS_MV_R_PROG_VW 
""", engine) 

* Exclude the project status code: it only reflects the project's financial (accounting) status, not its construction status.

In [40]:
#project_status = pd.read_sql_query(""" 
#SELECT DISTINCT project_status_code, 
#project_status
#FROM ACCOUNTING_EXP_CWA_VW  
#""", engine) 

In [41]:
# project_status

In [42]:
# Have to fill in nans with no status
efis_join_df.project_status_code = efis_join_df.project_status_code.fillna('no status')

In [43]:
# Filter out all 9 codes because this means the project is closed
# NOTE(review): str.contains('9') drops every status code that has a "9"
# anywhere in it, not only codes that start with or equal 9 — confirm this
# matches the intended set of closed-project codes.
efis_join_df2 = efis_join_df.loc[~efis_join_df.project_status_code.str.contains('9')]

In [44]:
# Drop project status code. This is just about the project status from the 
# Accounting POV
efis_join_df2 = efis_join_df2.drop(columns = ['project_status_code'])

In [45]:
pd.merge(efis_df, efis_join_df2, on = ['adv_project_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

_merge    
left_only     34832
both          10849
right_only       83
dtype: int64

In [46]:
efis_m1 = pd.merge(efis_df, efis_join_df2, on = ['adv_project_id'], how = 'inner')

* 68 project ids are missing after the inner join (4,810 in `efis_join_df2` vs. 4,742 in `efis_m1`; see the counts below).

In [47]:
efis_join_df2.project_id.nunique()

4810

In [48]:
efis_m1.project_id.nunique()

4742

## Subset to only the relevant project_ids from `Projects`
* Before manipulating

In [49]:
projects_df_subset = projects_df[['project_id']].drop_duplicates().reset_index(drop = True)

In [50]:
# Make sure it's unique
projects_df_subset.project_id.nunique(), projects_df_subset.shape

(11272, (11272, 1))

In [51]:
projects_df_subset.shape

(11272, 1)

In [52]:
pd.merge(efis_m1, projects_df_subset, on = ['project_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

_merge    
both          10186
right_only     6942
left_only       663
dtype: int64

In [53]:
accounting_df = pd.merge(efis_m1, projects_df_subset, on = ['project_id'], how = 'inner')

In [54]:

accounting_df.project_id.nunique()

4330

In [55]:
accounting_df.project_id.value_counts().head()

6211(130)    32
5908(031)    28
6053(130)    27
6211(131)    27
5006(219)    23
Name: project_id, dtype: int64

## Bring in pect_description for `Projects` -> Double Check
* PEC codes that are not supposed to have a corresponding PECT code have one after my manipulation -> Correct this.
* Also need to keep the second duplicate because it's the more recent one
* From section 2: https://accounting.onramp.dot.ca.gov/manual/7-program-codes
* Double check with LP2000 team this is a correct way of thinking.

In [56]:
def load_pec(excel_file:str)-> pd.DataFrame:
    """
    Load the PEC / PECT lookup from an Excel workbook and tidy it up.

    Steps:
      1. Read the workbook and pass it through `_utils.to_snakecase`.
      2. Drop fully empty rows and keep only rows marked "X" under "23/24".
      3. Take the first row seen for each `pec` as that PEC's program name.
      4. Attach the program name to every row, then de-duplicate on
         (pec, description, program), keeping the row with the highest `pect`.

    Args:
        excel_file: path to the workbook. After snake-casing it must provide
            the columns pec, pect, description, 19/20, 20/21, 21/22, 22/23
            and 23/24.

    Returns:
        DataFrame with the columns pec, pect, pect_description and program,
        where `pec` has its periods removed and `pect` is an int
        (0 when it was missing).
    """
    df = _utils.to_snakecase(pd.read_excel(excel_file))
    
    # Drop rows that are all nan
    df = df.dropna(how='all').reset_index(drop =  True)
    
    # Keep ONLY rows that have "X" under 23/24
    # That means they are still relevant
    df2 = df.loc[df['23/24'] == 'X'].reset_index(drop = True)
    
    df2 = df2.drop(columns = ['19/20', '20/21', '21/22', '22/23', '23/24'])
    # Find program ONLY rows: the first row per pec supplies the program name
    program_only = (df2
             .drop_duplicates(subset = ['pec'])
             .dropna(how='all')
             .reset_index(drop = True)
             .drop(columns = ['pect'])
             .rename(columns = {'description':'program'})
            )
    
    # Merge to get program plus pect
    m1 = pd.merge(df2, program_only, how = "left", on = ['pec'])
    # Sorting pect descending before de-duplicating keeps the highest pect
    # for each (pec, description, program); missing pect values sort last.
    m1 = (m1
          .sort_values(['pec','pect'], ascending = [True, False])
          .drop_duplicates(subset=['pec', 'description', 'program'])
          .rename(columns = {'description':'pect_description'})
          .sort_values(['pec','pect'])
          .reset_index(drop = True)
         )

    # Strip the periods out of the PEC code. regex=False makes "." a literal
    # period on every pandas version; read as a regex it matches any character.
    m1.pec = m1.pec.str.replace('.', '', regex=False)
    m1.pect = m1.pect.fillna(0).astype(int)
    return m1

In [57]:
final_pect = load_pec('section2_pect_2023_2024_FY.xlsx')



In [58]:
final_pect.shape

(799, 4)

In [59]:
final_pect.sample(3)

Unnamed: 0,pec,pect,pect_description,program
682,4050201,151,Drainage System Restoration,State Hwy Operation & Protection Program (SHOPP)
494,2080437,0,TMS Electrical Material Procurement,Transportation Management System (TMS) Electrical Material Procurement
331,2030010,630,"Rebuilding American Infrastructure with Sustainability and Equity (RAISE) and Multimodal Project Discretionary Grant Programs (e.g., INFRA, MEGA, RSTG or RURAL)",Local Assistance


### Turn this part to script once finalized

In [60]:
accounting_df.pect_task_code = accounting_df.pect_task_code.fillna(0).astype(int)

In [61]:
pect_df = pd.merge(accounting_df, final_pect, left_on = ['pec_code', 'pect_task_code'], right_on = ['pec', 'pect'], how = 'left')

In [62]:
accounting_df.head(1)

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id
0,1449,42,2030010,809,0.0,-38.99,535,6200(024)


In [63]:
pect_df.sample(3)

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id,pec,pect,pect_description,program
1678,316000060,890,2030010,2122,200000.0,62549.18,820,6203(069),2030010,820.0,Congestion Mitigation & Air Quality Improvement Program (CMAQ),Local Assistance
6994,722000309,3290,2030720,2021,10000.0,3520.0,100,5352(023),2030720,100.0,Active Transportation Program (ATP),Active Transportation Program (ATP)
2823,415000111,890,2030010,1617,350000.0,350000.0,300,6003(052),2030010,300.0,Highway Bridge,Local Assistance


In [64]:
len(pect_df), pect_df.project_id.nunique()

(10186, 4330)

In [65]:
# Subset 
pect_df2 = pect_df[['pect_description','curr_bud_am', 'project_id']]

In [66]:
# Label rows without a PEC/PECT match "Unknown" so they still get a column in
# the pivot. Only the description column is filled: a blanket fillna would
# also write the string "Unknown" into any missing curr_bud_am value, turning
# that numeric column into mixed text before it is summed in the pivot.
pect_df2 = pect_df2.fillna({'pect_description': 'Unknown'})

In [67]:
# Drop duplicates because we only need one PECT description & project_id combo
len(pect_df2.drop_duplicates())

9999

In [68]:
pect_df2 = pect_df2.drop_duplicates().reset_index(drop = True)

In [69]:
# Pivot to one row per project_id with one column per pect_description.
# The summed curr_bud_am values are only placeholders: later cells replace
# every empty cell with "No" and every numeric cell with "Yes".
pect_df3 = pect_df2.pivot_table(index=['project_id'], columns='pect_description', 
                    values=['curr_bud_am'], aggfunc='sum')

In [70]:
pect_df3.columns = pect_df3.columns.droplevel()

In [71]:
pect_df3 = pect_df3.reset_index()

In [72]:
pect_df3 = pect_df3.fillna('No')

In [73]:
pect_df3.project_id.nunique(), pect_df2.project_id.nunique(), pect_df.project_id.nunique()

(4330, 4330, 4330)

In [74]:
pect_df3 = _utils.to_snakecase(pect_df3)

In [75]:
#pect_df3projects = set(pect_df3.project_id.unique().tolist())
#pect_df2projects = set(pect_df2.project_id.unique().tolist())
#pect_df2projects - pect_df3projects

In [76]:
# pect_df2[pect_df2.project_id == '6115(006)']

In [77]:
# Replace every numeric (summed budget) cell with "Yes".
# project_id is excluded from the numeric test so that an id which happens to
# parse as a number is never overwritten with "Yes".
is_numeric_cell = pect_df3.apply(lambda col: pd.to_numeric(col, errors='coerce')).notnull()
is_numeric_cell['project_id'] = False
pect_df3 = pect_df3.mask(is_numeric_cell, 'Yes')

In [78]:
pd.merge(pect_df3, projects_df, on = ['project_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

_merge    
right_only    6942
both          4330
left_only        0
dtype: int64

In [79]:
# Remove project_id to fill in unknowns later on
pect_code_cols = list(pect_df3.columns)
pect_code_cols.remove('project_id')

In [80]:
# Update projects
project_df = pd.merge(projects_df, pect_df3, on = ['project_id'], how = 'left')

In [81]:
project_df.project_id.nunique()

11272

In [82]:
# Fill in unknown
project_df[pect_code_cols] = project_df[pect_code_cols].fillna('Unknown')

### Double check

In [83]:
pect_df2.loc[pect_df2.project_id == '5918(101)']

Unnamed: 0,pect_description,curr_bud_am,project_id
1615,Highway Bridge,690839.49,5918(101)
1616,"Earmarks Projects (HPP, DEMO CPFCDS, etc.)",238679.79,5918(101)
1617,Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP),0.0,5918(101)
1618,Highway Bridge,472887.51,5918(101)


In [84]:
project_df.loc[project_df.project_id == '5918(101)'].style.where(lambda val: 'Yes' in str(val), 'color: red')

Unnamed: 0,project_id,comment_desc,district_code,est_total_prj_costs,location_name,project_label_name,original_post_mile_begin_id,original_post_mile_end_id,revised_post_mile_begin_ind,revised_post_mile_end_ind,route_name,state_hwy_ind,senate_district_code,update_date_time,agency_name,urban_area_code,county_name,work_type_desc,category_desc,current_phase,active_transportation_program_(atp),bridge_inspection_&_scour_evaluation,covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation,carbon_reduction_program_(crp),congestion_mitigation_&_air_quality_improvement_program_(cmaq),coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds,corridor_mobility_improvement_account_(cmia)_program,county_exchange_funds,county_state_match_program,"earmarks_projects_(hpp,_demo_cpfcds,_etc.)",emergency_relief_(er),ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program,"funds_for_planning,_programming_and_monitoring_-_rip",general_funded_designated_programs,hazard_elimination_safety_(hes),high_risk_rural_roads_program_(hr3),highway_bridge_,highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund,highway_safety_improvement_program_(hsip)_(non-infrastructure),highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund,local_partnership_program_(lpp_–_competitive)_,local_roads,local_roads_rehabilitation,railroad_grade_crossing_protection,railroad_grade_separations,"rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_",regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system),regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip),regional_transportation_planning_agency_(rtpa)_stp_match_exchange,sb1_funded_freeway_service_patrol,shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds,safe_routes_t
o_school_(sr2s_and_srts),set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act,solutions_for_congested_corridors_program_(sccp),special_programs,state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic),structures_seismic_retrofit_,trade_corridor_enhancement_account_(tcea)_programs_–_local_share,trade_corridor_enhancement_account_(tcea)_programs_–_state_share,trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads,traffic_congestion_relief_program_(_tcrp_),unknown
1277,5918(101),"4-26-2023: told Neal Hay to do a BAR request and that he cannot ask for more than what was lapsed - JC 1/10/22: TCT JWalton adv of CWA expring and funds lapsing. need invoice by Apr 1, 2022. ab 8/2/17: email SRiddle re: inactive status. ab County will seek to replace (SR= 53.6)",3,,"On Howsley Road, 1.02 Mile East of State Route 99, Br",Bridge Replacement,,,,,0-CR,N,,2023-04-26 15:16:25,Sutter County,,Sutter County,Bridge Replacement - No Added Capacity,Bridge Replacement,single phase,No,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,No,No,No


## Phase_Funding Table

### Bring in fund_code
* These fund codes were shared by Brian via his contacts in Accounting. Have to request this each fiscal year?

In [85]:
def load_fund_codes(excel_file:str)->pd.DataFrame:
    """
    Load the fund-code lookup table from an Excel workbook.

    Args:
        excel_file: path to the workbook. Its header row is expected to hold
            the labels "0001" and "General Fund".

    Returns:
        DataFrame in which the "0001" column holds the codes formatted to a
        minimum width of four with zero fill, and the "General Fund" column
        is renamed to "general_fund".
    """
    # Read the workbook that was passed in. The path used to be hard-coded,
    # which silently ignored the `excel_file` argument.
    df = pd.read_excel(excel_file)
    
    # NOTE(review): the column labels "0001" / "General Fund" look like the
    # first data row was consumed as the header; if so, fund 0001 itself is
    # missing from the lookup rows — confirm against the workbook.
    # Pad codes
    df['0001'] = df['0001'].apply(lambda x: f'{x:04}')
    df = df.rename(columns = {'General Fund':'general_fund'})
    
    return df

In [86]:
fund_codes = load_fund_codes("lp2000_2023_fund_codes.xlsx")

In [87]:
fund_codes.sample(3)

Unnamed: 0,0001,general_fund
12,6056,Trade Corridors Improvement Fund
5,3007,Traffic Congestion Relief Fund
3,183,Environmental Enhanc & Mitigat Prgm Fd


In [88]:
fund_phase_df = pd.merge(accounting_df, fund_codes, left_on = ['fund_code'], right_on = ['0001'], how = 'left')
fund_phase_df = fund_phase_df.drop(columns = ['0001'])

In [89]:
fund_phase_df.project_id.nunique()

4330

In [90]:
fund_phase_df.general_fund = fund_phase_df.general_fund.fillna('Unknown')

In [91]:
fund_phase_df.general_fund.value_counts()

Federal Trust Fund                                   7464
State Highway Account                                1563
Unknown                                               505
Road Maintenance & Rehabilitation Account, STF        424
Local Bridge Seismic Retrofit Acct                     72
Environmental Enhanc & Mitigat Prgm Fd                 56
Transportation Investment Fund                         28
Transportation Deferred Investment Fund                19
Trade Corridor Enhancement Account, STF                18
Trade Corridors Improvement Fund                       13
Traffic Congestion Relief Fund                          8
Highway Safety,Rehabilitation,& Preservation Acct       7
State-Local Partnership Program Acct                    7
Transportation Faciilities Account                      1
Corridor Mobility Improvement Account                   1
Name: general_fund, dtype: int64

In [92]:
# Sum up the money received by fund
fund_phase_df_pivot1 = (fund_phase_df
                  .groupby(['project_id','general_fund'])
                  .agg({'curr_bud_am':'sum', 
                        'cash_exp_am':'sum'})
                  .reset_index()
                  .rename(columns = {'curr_bud_am':'single_phase_cost',
                                    'cash_exp_am':'single_phase_expenditure'})
                  )

In [93]:
fund_phase_df_pivot1.head()

Unnamed: 0,project_id,general_fund,single_phase_cost,single_phase_expenditure
0,0001(002),Federal Trust Fund,20743617.04,20743617.04
1,0014(005),Federal Trust Fund,879983.23,879983.23
2,0027(012),Federal Trust Fund,12830458.87,12830458.87
3,0061(025),Federal Trust Fund,2595722.0,2595722.0
4,15A5(013),Federal Trust Fund,172633.0,0.0


In [94]:
# No fund detail -> just want the total cost 
fund_phase_df_pivot2 = (fund_phase_df
                  .groupby(['project_id'])
                  .agg({'curr_bud_am':'sum', 
                        'cash_exp_am':'sum'})
                  .reset_index()
                  .rename(columns = {'curr_bud_am':'single_phase_cost',
                                    'cash_exp_am':'single_phase_expenditure_amt'})
                  )

In [95]:
fund_phase_df_pivot2.head()

Unnamed: 0,project_id,single_phase_cost,single_phase_expenditure_amt
0,0001(002),20743617.04,20743617.04
1,0014(005),879983.23,879983.23
2,0027(012),12830458.87,12830458.87
3,0061(025),2595722.0,2595722.0
4,15A5(013),172633.0,0.0


In [96]:
fund_phase_df_pivot2.project_id.nunique()

4330

In [97]:
# Pivot so general_fund will be the column names
# https://stackoverflow.com/questions/22798934/pandas-long-to-wide-reshape-by-two-variables
fund_phase_df_pivot1 = fund_phase_df_pivot1.pivot_table(index=['project_id'], columns='general_fund', 
                    values=['single_phase_cost'], aggfunc='sum')

In [98]:
fund_phase_df_pivot1.columns = fund_phase_df_pivot1.columns.droplevel()

In [99]:
fund_phase_df_pivot1 = fund_phase_df_pivot1.reset_index()

In [100]:
fund_phase_df_pivot1 = _utils.to_snakecase(fund_phase_df_pivot1)

In [101]:
# Find state fund only columns
state_only_columns = list((fund_phase_df_pivot1.filter(regex='account|fd|acct|fund').columns))

In [102]:
state_only_columns

['corridor_mobility_improvement_account',
 'environmental_enhanc_&_mitigat_prgm_fd',
 'federal_trust_fund',
 'highway_safety,rehabilitation,&_preservation_acct',
 'local_bridge_seismic_retrofit_acct',
 'road_maintenance_&_rehabilitation_account,_stf',
 'state_highway_account',
 'state-local_partnership_program_acct',
 'trade_corridor_enhancement_account,_stf',
 'trade_corridors_improvement_fund',
 'traffic_congestion_relief_fund',
 'transportation_deferred_investment_fund',
 'transportation_faciilities_account',
 'transportation_investment_fund']

In [103]:

# Federal money is totalled separately, so take it out of the state list.
# Filtering (instead of list.remove) keeps this cell safe to re-run and does
# not raise when there is no federal_trust_fund column.
state_only_columns = [col for col in state_only_columns if col != 'federal_trust_fund']

In [104]:
# Sum up all the state only funds
fund_phase_df_pivot1['total_state_funds'] = fund_phase_df_pivot1[state_only_columns].sum(axis = 1).fillna(0)

In [105]:
# Mask integers with bool
fund_phase_df_pivot_bool = fund_phase_df_pivot1.fillna('No')

In [106]:
fund_phase_df_pivot_bool = fund_phase_df_pivot_bool.mask(fund_phase_df_pivot_bool.apply(lambda x : pd.to_numeric(x,errors='coerce')).notnull(),'Yes')

In [107]:
# Merge again so projects will have total budgeted amount
# for the single phase and expenditure
final_fund_phase_df = pd.merge(fund_phase_df_pivot1, fund_phase_df_pivot2, on = ['project_id'])

In [108]:
# Make it clear about total federal funds
final_fund_phase_df['total_federal_funds'] = final_fund_phase_df.federal_trust_fund

In [109]:
final_fund_phase_df.shape

(4330, 20)

In [110]:
final_fund_phase_df.project_id.nunique()

4330

In [111]:
final_fund_phase_df["is_state"] = final_fund_phase_df.apply(_utils.is_state_funds, axis=1)

In [112]:
final_fund_phase_df["is_federal"] = final_fund_phase_df.apply(_utils.is_fed_funds, axis=1)

In [113]:
final_fund_phase_df = final_fund_phase_df.fillna(0)

In [114]:
to_keep = ['project_id',  'single_phase_cost',
       'single_phase_expenditure_amt', 'total_state_funds','total_federal_funds', 'is_state',
       'is_federal']

In [115]:
final_fund_phase_df2 = final_fund_phase_df[to_keep]

In [116]:
final_fund_phase_df.loc[(final_fund_phase_df.is_state == "No") & (final_fund_phase_df.is_federal == "No")].head()

Unnamed: 0,project_id,corridor_mobility_improvement_account,environmental_enhanc_&_mitigat_prgm_fd,federal_trust_fund,"highway_safety,rehabilitation,&_preservation_acct",local_bridge_seismic_retrofit_acct,"road_maintenance_&_rehabilitation_account,_stf",state_highway_account,state-local_partnership_program_acct,"trade_corridor_enhancement_account,_stf",trade_corridors_improvement_fund,traffic_congestion_relief_fund,transportation_deferred_investment_fund,transportation_faciilities_account,transportation_investment_fund,unknown,total_state_funds,single_phase_cost,single_phase_expenditure_amt,total_federal_funds,is_state,is_federal
55,18D3(041),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No,No
57,2006(034),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No,No
58,2006(048),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No,No
59,2006(049),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No,No
60,2006(053),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,No,No


In [117]:
final_fund_phase_df.groupby(['is_state', 'is_federal']).agg({'project_id':'nunique'})

Unnamed: 0_level_0,Unnamed: 1_level_0,project_id
is_state,is_federal,Unnamed: 2_level_1
No,No,448
No,Yes,2500
Yes,No,1247
Yes,Yes,135


### Double Checking
* Make sure the project flag is correct

In [118]:
# fund_phase_df.project_id.value_counts().head()

In [119]:
# final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].style.where(lambda val: 'Yes' in str(val), 'color: red')

In [120]:
# fund_phase_df.loc[fund_phase_df.project_id == '5944(068)']

In [121]:
# fund_phase_df.loc[(fund_phase_df.project_id == '5944(068)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()

In [122]:
# fund_phase_df.loc[fund_phase_df.project_id == '5944(068)'][['curr_bud_am']].sum()

In [123]:
# final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].total_state_funds + final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].federal_trust_fund

In [124]:
# final_fund_phase_df.loc[final_fund_phase_df.project_id == '5006(219)']

In [125]:
# fund_phase_df.loc[(fund_phase_df.project_id == '5006(219)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()

In [126]:
229400.00 + 32967253.86

33196653.86

In [127]:
# fund_phase_df.loc[fund_phase_df.project_id ==  '5006(219)']

## Awards Table
* Appropriation code is the fiscal year of award


In [128]:
pect_df.sample()

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id,pec,pect,pect_description,program
8504,1013000080,42,2030600,1213,25000.0,25000.0,621,5940(103),2030600,621.0,Local Roads Rehabilitation,State Transportation Improvement Program (STIP)


In [129]:
accounting_df.sample()

Unnamed: 0,adv_project_id,fund_code,pec_code,appropriation_category_code,curr_bud_am,cash_exp_am,pect_task_code,project_id
6581,718000255,890,2030010,2122,1238310.0,0.0,550,5257(037)


In [130]:
# Only want the most recent year of a pec_code listed once
# One row per (project_id, program): the maximum appropriation_category_code
# is kept and renamed to state_fiscal_awarded_year; program becomes
# grant_program.
# NOTE(review): rows whose `program` is missing (no PEC/PECT match in the left
# merge that built pect_df) drop out here if groupby's default dropna=True
# applies — confirm that leaving them out of the awards table is intended.
# NOTE(review): 'max' compares the raw codes, which assumes a larger code
# always means a later fiscal year — confirm for the code format in use.
awards_df = (pect_df
                  .groupby(['project_id', 'program'])
                  .agg({'appropriation_category_code':'max'})
                  .reset_index()
                  .rename(columns = {'appropriation_category_code':'state_fiscal_awarded_year',
                                     'program':'grant_program'})
                  )

## Checks

In [131]:
awards_df.project_id.value_counts().head()

6066(140)    3
5938(233)    3
5956(221)    3
5182(058)    3
5475(038)    3
Name: project_id, dtype: int64

In [132]:
awards_df.loc[awards_df.project_id == "5182(058)"]

Unnamed: 0,project_id,grant_program,state_fiscal_awarded_year
1550,5182(058),Active Transportation Program (ATP),2223
1551,5182(058),Local Assistance,2223
1552,5182(058),"Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017",2122


In [133]:
# Check original df 
# pect_df.loc[pect_df.project_id == "5182(058)"]

In [134]:
# project_df[project_df.project_id ==  "5288(046)"].style.where(lambda val: 'Yes' in str(val), 'color: red')

In [135]:
awards_df.loc[awards_df.project_id == "5475(038)"]

Unnamed: 0,project_id,grant_program,state_fiscal_awarded_year
2442,5475(038),Active Transportation Program (ATP),2223
2443,5475(038),Local Assistance,2223
2444,5475(038),"Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017",2122


In [136]:
# Check original df 
# pect_df.loc[pect_df.project_id ==  "5475(038)"]

In [137]:
project_df[project_df.project_id ==  "5475(038)"].style.where(lambda val: 'Yes' in str(val), 'color: red')

Unnamed: 0,project_id,comment_desc,district_code,est_total_prj_costs,location_name,project_label_name,original_post_mile_begin_id,original_post_mile_end_id,revised_post_mile_begin_ind,revised_post_mile_end_ind,route_name,state_hwy_ind,senate_district_code,update_date_time,agency_name,urban_area_code,county_name,work_type_desc,category_desc,current_phase,active_transportation_program_(atp),bridge_inspection_&_scour_evaluation,covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation,carbon_reduction_program_(crp),congestion_mitigation_&_air_quality_improvement_program_(cmaq),coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds,corridor_mobility_improvement_account_(cmia)_program,county_exchange_funds,county_state_match_program,"earmarks_projects_(hpp,_demo_cpfcds,_etc.)",emergency_relief_(er),ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program,"funds_for_planning,_programming_and_monitoring_-_rip",general_funded_designated_programs,hazard_elimination_safety_(hes),high_risk_rural_roads_program_(hr3),highway_bridge_,highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund,highway_safety_improvement_program_(hsip)_(non-infrastructure),highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund,local_partnership_program_(lpp_–_competitive)_,local_roads,local_roads_rehabilitation,railroad_grade_crossing_protection,railroad_grade_separations,"rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_",regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system),regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip),regional_transportation_planning_agency_(rtpa)_stp_match_exchange,sb1_funded_freeway_service_patrol,shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds,safe_routes_t
o_school_(sr2s_and_srts),set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act,solutions_for_congested_corridors_program_(sccp),special_programs,state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic),structures_seismic_retrofit_,trade_corridor_enhancement_account_(tcea)_programs_–_local_share,trade_corridor_enhancement_account_(tcea)_programs_–_state_share,trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads,traffic_congestion_relief_program_(_tcrp_),unknown
2418,5475(038),"10/1/2020: Original AED date was 9/30/2020, new sequence being done to extend date to 9/30/2022. There will be a gap of time that is not reimbursable. CR CMAQ Emissions Benefit: .03 ROG, .02 NOx, .01 PM10 Project has EPSP approval for $2,646,524 of CMAQ for R/W to 15/16 FY. And EPSP for $291,476 of CMAQ for PE to 15/16 FY.",3,36291000.0,"Auburn Blvd. Complete Streets - Phase 2. On Auburn Blvd, in Citrus Heights from Rusch Park to Northern City Limits.",Pedestrian and Bike Path,,,,,0-CHts,N,,2023-10-13 13:34:21,Citrus Heights,3067,Sacramento County,,Pedestrian and Bike Path,single phase,Yes,No,No,No,Yes,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,No,No,No,No,No,Yes,No,No,No,No,No,No,No,No


## Save to Excel/Final Touches

In [138]:
# https://stackoverflow.com/questions/28837057/pandas-writing-an-excel-file-containing-unicode-illegalcharactererror
# Excel cells cannot hold most ASCII control characters (openpyxl raises
# IllegalCharacterError on them). Strip just those characters instead of
# unicode-escaping the whole string: escaping also rewrote legitimate text
# (an en dash became the literal "\u2013", newlines became "\n", and every
# backslash was doubled).
ILLEGAL_EXCEL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
project_df = project_df.applymap(
    lambda x: ILLEGAL_EXCEL_CHARS.sub('', x) if isinstance(x, str) else x
)

In [139]:
project_df.shape

(11272, 62)

In [140]:
project_df.project_id.nunique()

11272

In [141]:
# Split off county
county_df = project_df[['project_id', 'project_label_name','county_name']]

In [142]:
# Split off districts
district_df = project_df[['project_id', 'project_label_name','district_code']]

In [143]:
project_df = project_df.drop(columns = ['county_name', 'district_code','unknown'])

In [147]:

# Write each output table to its own sheet of a single workbook, without the
# DataFrame index. The writer is used as a context manager so the file is
# saved and closed when the block exits.
with pd.ExcelWriter("LP2000_projects.xlsx") as writer:
    project_df.to_excel(writer, sheet_name="project", index=False)
    county_df.to_excel(writer, sheet_name="county", index=False)
    district_df.to_excel(writer, sheet_name="district", index=False)
    awards_df.to_excel(writer, sheet_name="awards", index=False)
    final_fund_phase_df2.to_excel(writer, sheet_name="phase_funding", index=False)
