## CTIPS
* https://ctips-prod.dot.ca.gov/ctips/LoginMediatorForm.do

### To do
* Get total cost
    * total_cost: The total cost of this project.
    Total project cost can be calculated using 3 tables: project, fundtype, and fundline
    AH: which columns do I use from fundtype, project, and fundline to calculate the total cost?
    You can calculate total programmed for a project using: fundline.action = 'P' and project.high_offcl = 1
    Then sum ( fundline.pe_paed + fundline.pe_env + fundline.pe_rw + fundline.pe_con + fundline.rw + fundline.con )

* Ask if DSHOPP means draft SHOPP project
* PROJSCHE - not a lot of matches

In [None]:
import pandas as pd 
import sqlalchemy 
import sys 
import re
import oracledb 
import _utils
import _csis_utils

In [None]:
# Make python-oracledb stand in for cx_Oracle: overwrite its reported version
# string and register the module in sys.modules under the name "cx_Oracle", so
# any later `import cx_Oracle` resolves to oracledb.
# NOTE(review): presumably required so SQLAlchemy's cx_Oracle dialect accepts
# oracledb — confirm against the installed SQLAlchemy version.
oracledb.version = "8.3.0" 
sys.modules["cx_Oracle"] = oracledb 

In [None]:
# Notebook display settings: show up to 400 columns, never truncate rows or
# cell contents, and render floats with two decimal places.
_display_options = {
    "display.max_columns": 400,
    "display.float_format": "{:.2f}".format,
    "display.max_rows": None,
    "display.max_colwidth": None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [None]:

# Build the SQLAlchemy connection URL for the CTIPS database.
# NOTE(review): DIALECT, USERNAME, PASSWORD, HOST, PORT and SERVICE are not
# defined anywhere in the notebook as shown — they must be set before this cell
# runs. Keep the real credentials out of version control.
ENGINE_PATH_WIN_AUTH =  f"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}" 

In [None]:
# Create the SQLAlchemy engine that every pd.read_sql_query call below uses.
engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   

## Project Base Table
### Project
Project.agencyid = project sponsor

Implpaed = Implementing Agency for PA&ED

Implpse = Implementing Agency for PS&E

implcon = Implementing Agency for Construction

implrw = Implementing Agency for Right of Way


In [None]:
# Pull the base project attributes from ctips.project. The ORDER BY puts rows
# with the highest high_ver / version / high_offcl values first.
projects_df = pd.read_sql_query(""" 
SELECT 
ctips_id,
appdate, 
archive,
agencyid,
const_date,
countyid,
countyid2,
countyid3,
chg_offcl,
chg_qual1,
chg_qual2,
districtid,
document,
docyear,
ea_number,
high_ver,
high_offcl,
implpaed, 
implpse, 
implrw, 
implcon, 
needpurpose,
progcode1,
ppno,
proj_desc,
postmiles1,
pm1b,
pm2b,
pm3b,
pm1a,
pm2a,
pm3a,
projcomp_date,
projectid,
route1,
route2,
route3,
rtl,
title,
version
FROM ctips.project
ORDER BY high_ver DESC, version DESC, high_offcl DESC
""", engine) 

In [None]:
# Clean the raw project pull with the shared CSIS helper.
# NOTE(review): csis_clean_project is defined in _csis_utils and its cleaning
# steps are not visible from this notebook — check there before relying on
# row counts or column contents.
projects_table = _csis_utils.csis_clean_project(projects_df)

### PROJSCHE

In [None]:
# Pull the milestone dates from ctips.projsche, aliasing the milestone codes
# (m020, m200a, ...) to readable phase begin/end names.
projsche_df = pd.read_sql_query(""" 
SELECT 
projectid,
m020 AS pa_ed_begin,
m200a AS pa_ed_end,
m200b AS ps_e_begin,
m224 AS begin_row,
m410 AS end_row,
m500 AS con_start_date,
m600 AS con_end_date,
m700 AS begin_closeout,
m800 AS end_closeout
FROM ctips.projsche
""", engine) 

In [None]:
# Start from every column name in the schedule pull.
projsche_drop_cols = projsche_df.columns.tolist()

In [None]:
# Take the key column out so the list holds only the date columns.
projsche_drop_cols.remove('projectid')

In [None]:
# Discard schedule rows in which every date column is null; rows with at least
# one populated date are kept and the index is renumbered.
_has_any_date = projsche_df[projsche_drop_cols].notna().any(axis=1)
projsche_df2 = projsche_df.loc[_has_any_date].reset_index(drop=True)

In [None]:
len(projsche_df2), len(projsche_df)

In [None]:
projsche_df2.projectid.nunique()

In [None]:
projsche_df2.projectid.value_counts().head()

#### Not a lot of matching values

In [None]:
pd.merge(projsche_df2, projects_table, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# Attach ctips_id to each schedule row. The inner join keeps only projectids
# that appear in both tables.
_project_keys = projects_table[['ctips_id', 'projectid']]
phase_dates_df = _project_keys.merge(projsche_df2, on='projectid', how='inner')

In [None]:
phase_dates_df.ctips_id.nunique()

### AGENCY

In [None]:
# Agency lookup table: agencyid -> agency_name.
agency_df = pd.read_sql_query(""" 
SELECT 
name AS agency_name,
agencyid
FROM ctips.agncy
""", engine) 

In [None]:
# Resolve the project sponsor (project.agencyid) to its agency name and expose
# it as `sponsoring_agency`.
projects_table = (
    projects_table
    .merge(agency_df, on='agencyid', how='left')
    .rename(columns={'agency_name': 'sponsoring_agency'})
)

In [None]:
# Implementing-agency id columns, one per phase: PA&ED, PS&E, right of way,
# construction.
phase_agency_cols = ['implpaed','implpse','implrw', 'implcon']

In [None]:
# Run the shared agency helper once per implementing-agency column.
for phase_col in phase_agency_cols:
    projects_table = _csis_utils.add_agencies(projects_table, agency_df, phase_col)

In [None]:
# Add spelled out version of the agency names to each column
# NOTE(review): 'implpse' is already in phase_agency_cols, so the loop over
# that list has handled it once. This looks like a leftover second call —
# confirm add_agencies is safe to run twice on the same column, or remove
# this cell.
projects_table = _csis_utils.add_agencies(projects_table, agency_df, 'implpse')

In [None]:
# Drop the raw implementing-agency id columns.
# NOTE(review): presumably superseded by the columns add_agencies created — confirm.
projects_table = projects_table.drop(columns = phase_agency_cols)

### COUNTY

In [None]:
# County lookup table: countyid -> county_name.
county_df = pd.read_sql_query(""" 
SELECT 
name AS county_name,
countyid
FROM ctips.county
""", engine) 

In [None]:
county_df.sample()

In [None]:
# Look up the county name for each project's primary countyid (countyid2 and
# countyid3 are not used here). This rebinds county_df from the lookup table
# to the per-project result.
_project_counties = projects_table[['ctips_id', 'countyid']]
county_df = _project_counties.merge(county_df, on='countyid', how='left')

In [None]:
# Only the project id and the county name go on the county sheet.
county_df = county_df.loc[:, ['ctips_id', 'county_name']]

### FUNDLINE
* `action` codes: P = programmed, V = vote, A = award

In [None]:
# Pull the per-year funding lines: the action code, the dollar amount for each
# phase column, and the fundtypeid key used to join to ctips.fundtype.
fundline_df = pd.read_sql_query(""" 
SELECT 
action,
con,
rw,
pe_paed,
pe_env,
pe_rw,
pe_con,
pe_total,
fundlineid,
fundtypeid,
line_year,
actiondate
FROM ctips.fundline
""", engine) 

### Fundtype
* Fundtype.agencyid = funding agency

In [None]:
# Pull the fund-type rows. They carry the keys that link a funding line
# (fundtypeid) to its project, fund, program and program code.
fundtype_df = pd.read_sql_query(""" 
SELECT 
fundtypeid,
fundid,
progcode,
programid,
projectid,
agencyid
FROM ctips.fundtype
""", engine) 

#### Merge everything

In [None]:
pd.merge(fundtype_df,
         fundline_df,  
         on = ['fundtypeid'], 
         how = "outer",
         indicator = True,)[['_merge']].value_counts()

In [None]:
pd.merge(fundtype_df,fundline_df,  on = ['fundtypeid'], how = "outer", indicator = True)[['_merge']].value_counts()

In [None]:
# Attach funding lines to their fund type. The left join keeps fund types that
# have no funding line.
fund_m1 = fundtype_df.merge(fundline_df, on=['fundtypeid'], how="left")

In [None]:
# Tie every funding row to its project. The inner join drops funding rows whose
# projectid is not in projects_table, and projects with no funding rows.
_project_docs = projects_table[['projectid', 'ctips_id', 'document']]
final_fund_m = _project_docs.merge(fund_m1, on=['projectid'], how="inner")

In [None]:
final_fund_m.ctips_id.nunique(), len(final_fund_m)

In [None]:
projects_table.ctips_id.nunique()

In [None]:
29152-29116

### Progmain

In [None]:
# Program lookup table: programid -> category (exposed as `program`).
progmain_df = pd.read_sql_query(""" 
SELECT 
programid,
category AS program
FROM ctips.progmain
""", engine) 

In [None]:
progmain_df.head()

### Fund

In [None]:
# Fund lookup table: fundid -> fund name plus its type code. The alias records
# the coding: 1 = federal, 2 = state, 3 = local.
fund_df = pd.read_sql_query(""" 
SELECT 
fund,
fundid,
type AS fund_type_1_fed_2_state_3_local
FROM ctips.fund
""", engine) 

In [None]:
fund_df.head(2)

### Progsub
* Some progcodes have more than one progdesc
* Dropped duplicates because the progdesc values for these progcodes are similar
double_ids = ['20.30.010.820',
             '20.XX.723.000',
            '20.30.010.810',
             '20.XX.720.100',
             '20.30.010.817',
              '20.30.210.200'
             ]

In [None]:
# Program-code lookup table: progcode -> progdesc.
progsub_df = pd.read_sql_query(""" 
SELECT 
progcode,
progdesc
FROM ctips.progsub
""", engine) 

In [None]:
progsub_df.head(1)

In [None]:
# progsub_df.loc[progsub_df.progcode.isin(double_ids)].sort_values('progcode')

In [None]:
# Keep one progdesc per progcode so the later merge cannot multiply rows.
# drop_duplicates keeps the first row it encounters for each progcode.
progsub_df2 = progsub_df.drop_duplicates(subset = ['progcode'])

### Merge for work below

In [None]:
final_fund_m.sample()

In [None]:
final_fund_m.ctips_id.nunique(), projects_table.ctips_id.nunique()

In [None]:
# Decorate each funding row with its program category, fund name/type and
# program description, then drop the lookup keys used for the joins.
_with_program = final_fund_m.merge(progmain_df, on=['programid'], how="left")
_with_fund = _with_program.merge(fund_df, on=['fundid'], how="left")
_with_progdesc = _with_fund.merge(progsub_df2, on=['progcode'], how="left")
funding_w_program_info = _with_progdesc.drop(columns=['fundid', 'programid', 'progcode'])

In [None]:
 funding_w_program_info = funding_w_program_info.fillna(funding_w_program_info.dtypes.replace({'float64': 0.0, 'object': 'None'}))

In [None]:
# NOTE(review): this self-assignment is a no-op; the cell can be deleted.
funding_w_program_info = funding_w_program_info

In [None]:
funding_w_program_info.fund.nunique()

In [None]:
# Build the fund label as "<fund name>-<type code>", e.g. "...-2.0" for a state
# fund, so the fund type survives the pivot into column names.
# NOTE(review): this cell is not idempotent — every re-run appends the suffix
# again. The check cells further down filter on a label ending in "-2.0-2.0",
# which only exists if this cell ran twice; run it exactly once.
funding_w_program_info['fund'] = funding_w_program_info.fund + '-' + funding_w_program_info.fund_type_1_fed_2_state_3_local.astype('str')

In [None]:
# Same null fill as the one applied before the fund label was built
# (float64 -> 0.0, object -> 'None'), repeated after the label column changed.
funding_w_program_info = funding_w_program_info.fillna(funding_w_program_info.dtypes.replace({'float64': 0.0, 'object': 'None'}))

In [None]:
funding_w_program_info.sample()

In [None]:
funding_w_program_info.ctips_id.nunique()

## Phase Funding Table

#### First: find the total amount a project has programmed/voted/awarded for each fund

In [None]:
# Aggregation spec shared by the group-bys below: every phase-cost column is summed.
_cost_columns = ['con', 'rw', 'pe_paed', 'pe_env', 'pe_rw', 'pe_con', 'pe_total']
columns_to_agg = {cost_col: 'sum' for cost_col in _cost_columns}


In [None]:
# Sum the phase-cost columns for every (project, fund, document) combination.
_fund_doc_groups = funding_w_program_info.groupby(['ctips_id', 'fund', 'document'])
total_cost = _fund_doc_groups.agg(columns_to_agg).reset_index()

In [None]:
total_cost.sample()

##### Separate out FTIP and everything else to calculate total funds a project is estimated to receive

In [None]:
# FTIP and all other documents are totalled in two separate stages; start by
# isolating the FTIP rows.
_is_ftip = total_cost.document == 'FTIP'
ftip_only = total_cost.loc[_is_ftip].reset_index(drop=True)

In [None]:
# Collapse the FTIP rows to one row per (project, fund).
_ftip_groups = ftip_only.groupby(['ctips_id', 'fund'])
total_cost_ftip = _ftip_groups.agg(columns_to_agg).reset_index()

In [None]:
# Columns kept from both the FTIP and the non-FTIP totals so the two frames
# can be stacked.
cols_to_keep = ['ctips_id', 'fund', 'total_cost']

In [None]:
# FTIP total = construction + right of way + pe_total. The non-FTIP total is
# built from the individual pe_* components instead.
_ftip_phase_sum = total_cost_ftip['con'] + total_cost_ftip['rw'] + total_cost_ftip['pe_total']
total_cost_ftip['total_cost'] = _ftip_phase_sum

In [None]:
# Trim the FTIP totals to project, fund and total_cost.
total_cost_ftip = total_cost_ftip.loc[:, cols_to_keep]

In [None]:
# Second stage: every document type other than FTIP.
_is_not_ftip = total_cost.document != 'FTIP'
everything_else = total_cost.loc[_is_not_ftip].reset_index(drop=True)

In [None]:
# Collapse the non-FTIP rows to one row per (project, fund).
_non_ftip_groups = everything_else.groupby(['ctips_id', 'fund'])
everything_else = _non_ftip_groups.agg(columns_to_agg).reset_index()

In [None]:
# Non-FTIP total = construction + right of way + each pe_* component
# (pe_total is deliberately not used here). Added left to right.
_non_ftip_components = ['con', 'rw', 'pe_paed', 'pe_env', 'pe_rw', 'pe_con']
_running_total = everything_else[_non_ftip_components[0]]
for _component in _non_ftip_components[1:]:
    _running_total = _running_total + everything_else[_component]
everything_else['total_cost'] = _running_total

In [None]:
# Trim the non-FTIP totals to project, fund and total_cost.
everything_else = everything_else.loc[:, cols_to_keep]

In [None]:
# Stack the non-FTIP and FTIP totals. The index is not reset, and a
# (ctips_id, fund) pair can appear once from each frame; the pivot below
# sums such rows together.
total_requested_funds_final = pd.concat([everything_else, total_cost_ftip])

In [None]:
len(total_requested_funds_final), total_requested_funds_final.ctips_id.nunique()

In [None]:
total_requested_funds_final.ctips_id.value_counts().sample(5)

##### One project

In [None]:
total_requested_funds_final.loc[total_requested_funds_final.ctips_id == 20300000209]

In [None]:
8900.00+63400.00

In [None]:
# Original 
funding_w_program_info.loc[funding_w_program_info.ctips_id == 20300000209]

#### Pivot - I want the dataframe to be wide instead of long

In [None]:
# Reshape long -> wide: one row per project, one column per fund label, cells
# hold the summed total_cost. Passing `values` as a list yields two-level
# column labels.
fund_table = total_requested_funds_final.pivot_table(
    values=['total_cost'],
    index=['ctips_id'],
    columns='fund',
    aggfunc='sum',
)

In [None]:
# Remove the outer column level (the 'total_cost' label) so only the fund
# labels remain as column names.
fund_table.columns = fund_table.columns.droplevel()

In [None]:
# Move ctips_id from the index back into a regular column.
fund_table = fund_table.reset_index()

In [None]:
fund_table.shape

In [None]:
# Normalise the pivoted column labels with the shared helper.
# NOTE(review): presumably converts them to snake_case, per its name — confirm in _utils.
fund_table = _utils.to_snakecase(fund_table)

##### Check with one project

In [None]:
total_requested_funds_final.loc[total_requested_funds_final.ctips_id == 20300000522]

In [None]:
fund_table.loc[fund_table.ctips_id == 20300000522].dropna(axis =1)

In [None]:
total_requested_funds_final.loc[total_requested_funds_final.ctips_id == 20300000522]

In [None]:
funding_w_program_info.loc[funding_w_program_info.ctips_id == 20300000522]

#### Second: find the amount of $ for each phase

In [None]:
# Sum each phase-cost column per project. This is across all funds, documents
# and action codes, since no filter is applied here.
_per_project = funding_w_program_info.groupby(['ctips_id'])
cost_per_phase = _per_project.agg(columns_to_agg).reset_index()

In [None]:
cost_per_phase.shape, cost_per_phase.ctips_id.nunique()

In [None]:
63400.00+9700.00+11300

In [None]:
fund_table.loc[fund_table.ctips_id == 20300000209].dropna(axis=1)

In [None]:
cost_per_phase.loc[cost_per_phase.ctips_id == 20300000209].dropna(axis=1)

#### Third: merge these 2 tables

In [None]:
# Combine the per-fund totals with the per-phase totals, one row per project.
final_phase_funding_table = fund_table.merge(cost_per_phase, on=["ctips_id"], how="inner")

In [None]:
len(final_phase_funding_table)

In [None]:
final_phase_funding_table.head(2)

In [None]:
final_phase_funding_table.ctips_id.nunique(), len(final_phase_funding_table)

#### Fourth: find state v federal 
* State funds make up a much larger share than federal funds.

In [None]:

# Total the federal funds (type code 1) into a `total_federal_funds` column.
# NOTE(review): `federal_funds` is never referenced again in this notebook.
# Later cells read total_federal_funds straight from final_phase_funding_table,
# so the helper presumably adds the column to the frame it is given — confirm
# in _csis_utils.
federal_funds = _csis_utils.calculate_state_fed_local_total_funds(final_phase_funding_table, ['1.0'], 'total_federal_funds')


In [None]:
# Total the state funds (type code 2) into a `total_state_funds` column.
# NOTE(review): helper behaviour is defined in _csis_utils, not visible here.
state_funds = _csis_utils.calculate_state_fed_local_total_funds(final_phase_funding_table, ['2.0'], 'total_state_funds')


In [None]:
# Total the local funds (type code 3) into a `total_local_funds` column.
# NOTE(review): helper behaviour is defined in _csis_utils, not visible here.
local_funds = _csis_utils.calculate_state_fed_local_total_funds(final_phase_funding_table, ['3.0'], 'total_local_funds')


In [None]:
# Flag each project row using the shared row-wise helper.
# NOTE(review): presumably true when the project has any state funds — confirm in _utils.
final_phase_funding_table["is_state"] = final_phase_funding_table.apply(_utils.is_state_funds, axis=1)

In [None]:
# Flag each project row using the shared row-wise helper.
# NOTE(review): presumably true when the project has any federal funds — confirm in _utils.
final_phase_funding_table["is_federal"] = final_phase_funding_table.apply(_utils.is_fed_funds, axis=1)

In [None]:
# Flag each project row using the shared row-wise helper.
# NOTE(review): presumably true when the project has any local funds — confirm in _utils.
final_phase_funding_table["is_local"] = final_phase_funding_table.apply(_utils.is_local_funds, axis=1)

##### Check that I summed up federal funds correctly
* State not summing up correctly

In [None]:
funding_w_program_info.loc[funding_w_program_info.ctips_id == 20920011849][['fund_type_1_fed_2_state_3_local']].value_counts()

In [None]:
funding_w_program_info.loc[(funding_w_program_info.ctips_id == 20920011849) & (funding_w_program_info.fund == 'Public Transportation Modernization Improvement-2.0-2.0')][['con']].sum()

In [None]:
funding_w_program_info.loc[(funding_w_program_info.ctips_id == 20920011849) & (funding_w_program_info.fund == 'Public Transportation Modernization Improvement-2.0-2.0')][['rw']].sum()

In [None]:
funding_w_program_info.loc[(funding_w_program_info.ctips_id == 20920011849) & (funding_w_program_info.fund == 'Public Transportation Modernization Improvement-2.0-2.0')][['pe_total']].sum()

In [None]:
112386000.00+12213000.00+4351000.00 == 128950000.0

In [None]:
final_phase_funding_table.loc[final_phase_funding_table.ctips_id == 20920011849].dropna(axis=1).T

In [None]:
(final_phase_funding_table
 .groupby(['is_state', 'is_federal', 'is_local'])
 .agg({'ctips_id':'nunique'})
 .reset_index()
 .sort_values(by = ['ctips_id']))

#### Fifth: Drop everything before `con`
* Need to differentiate between `pe_total` for FTIP vs `pe_total` for everything else.

In [None]:
# Columns kept for the phase-funding sheet: the project id, every phase-cost
# column from `con` onward, the fund-source totals and the source flags.
# `pe_paed` is included because it is one of the summed phase columns and is
# part of the non-FTIP total; leaving it out made the phase columns on the
# sheet no longer add up to that total.
to_keep = [
    'ctips_id',
    'con', 'rw', 'pe_paed', 'pe_env', 'pe_rw', 'pe_con', 'pe_total',
    'total_federal_funds', 'total_state_funds', 'total_local_funds',
    'is_local', 'is_state', 'is_federal',
]

In [None]:
# Drop the individual per-fund columns; keep only the columns listed in to_keep.
final_phase_funding_table2 = final_phase_funding_table[to_keep]

In [None]:
final_phase_funding_table2.head()

#### Sixth: Merge on `phase_dates_df` with all the phase dates

In [None]:
phase_dates_df.ctips_id.nunique(), len(phase_dates_df)

In [None]:
projects_table.ctips_id.nunique(), len(projects_table)

In [None]:
final_phase_funding_table.ctips_id.nunique(), len(final_phase_funding_table)

In [None]:
# Add the phase dates. The outer join keeps projects that have funding but no
# schedule as well as projects that have a schedule but no funding.
final_phase_funding_table2 = final_phase_funding_table2.merge(
    phase_dates_df, on="ctips_id", how="outer"
)

In [None]:
pd.merge(final_phase_funding_table2, phase_dates_df, on = "ctips_id", how = "outer", indicator = True)[['_merge']].value_counts()

In [None]:
final_phase_funding_table2.ctips_id.nunique()

In [None]:
29152-29127

#### Seventh: Merge some other dates found in the `projects` dataframe.

In [None]:
# Date columns that live on the project table, plus the ctips_id key needed
# to merge them back later.
project_date_cols = ['const_date', 'rtl', 'ctips_id', 'projcomp_date']

In [None]:
# Copy the key and the date columns out of projects_table.
projects_dates = projects_table[project_date_cols]

In [None]:
# Take the key out of the list so the drop below removes only the date columns.
project_date_cols.remove('ctips_id')

In [None]:
# Remove const_date, rtl and projcomp_date from projects_table now that they
# are held in projects_dates.
projects_table = projects_table.drop(columns = project_date_cols)

In [None]:
# I'm only interested in rows in which at least one of the dates is populated.
# Comparing the date values against the string 'datetime64[ns]' never matches
# a real value, so that filter kept every row. Test for nulls instead: drop a
# row only when all three date columns are empty.
projects_dates2 = (
    projects_dates
    .dropna(how="all", subset=['rtl', 'const_date', 'projcomp_date'])
    .reset_index(drop=True)
)

In [None]:
# Give the date columns descriptive names for the export.
# NOTE(review): const_date is exported as construction_completion_date while
# projcomp_date keeps its raw name — confirm both labels against the CTIPS
# data dictionary.
projects_dates2 = projects_dates2.rename(columns = {'const_date': 'construction_completion_date', 'rtl':'ready_to_list_date'})

In [None]:
# Add the project-level dates. The left join keeps every phase-funding row
# whether or not it has project dates.
final_phase_funding_table3 = final_phase_funding_table2.merge(
    projects_dates2, on='ctips_id', how='left'
)

In [None]:
final_phase_funding_table3.ctips_id.nunique()

In [None]:
projects_table.ctips_id.nunique()

### Awards Table
* Take final_fund_m and sort it by year
* Line year is "fiscal year of this fund record Note that the year listed is the second in the pair of fiscal year notation.  For example if the funds for this record are for fiscal year 1998/99, then this record will hold the value 1999."
* Open question: are these really programs, or are they funds?

In [None]:
# Keep only the columns needed for the awards sheet.
_award_cols = ['ctips_id', 'line_year', 'program', 'progdesc']
awards = funding_w_program_info[_award_cols]

In [None]:
# Remove rows that are exact duplicates across all four columns, then renumber.
awards2 = awards.drop_duplicates()
awards2 = awards2.reset_index(drop=True)

In [None]:
len(awards), len(awards2)

In [None]:
# Sort all three keys in descending order so the most recent line_year comes
# first within each (project, program).
awards3 = awards2.sort_values(by=['ctips_id', 'program', 'line_year'], ascending=False)

In [None]:
# Keep one row per (project, program). Because awards3 is sorted with
# line_year descending, the row kept is the one with the latest line_year.
awards4 = awards3.drop_duplicates(subset = ['ctips_id','program'])

In [None]:
awards4.ctips_id.nunique()

#### Check w/ one project

In [None]:
awards4.loc[awards4.ctips_id == 20700001649]

In [None]:
awards4.ctips_id.value_counts().describe()

In [None]:
awards4.ctips_id.value_counts().head(10)

### Political

In [None]:
# Pull every column of the political-district table.
# NOTE(review): `politcal` is spelled as it appears in this query — confirm it
# matches the actual table name in the ctips schema.
political_df = pd.read_sql_query(""" 
SELECT 
*
FROM ctips.politcal
""", engine) 

In [None]:
# Keep only rows with no null in any column (dropna's default is how="any").
political_df = political_df.dropna()

In [None]:
len(political_df), political_df.projectid.nunique()

In [None]:
# Attach ctips_id to each political row. The inner join keeps only projectids
# present in both tables.
_project_keys = projects_table[['ctips_id', 'projectid']]
political_df2 = _project_keys.merge(political_df, on='projectid', how='inner')

In [None]:
political_df2.shape, political_df2.projectid.nunique(), political_df2.ctips_id.nunique()

In [None]:
# Build the assembly-district table from the 'assembly' column.
# NOTE(review): clean_political is defined in _csis_utils; its output shape is
# not visible here.
assembly_df = _csis_utils.clean_political(political_df2, 'assembly')

In [None]:
assembly_df.ctips_id.value_counts().head()

In [None]:
len(assembly_df), assembly_df.ctips_id.nunique()

In [None]:
# Build the senate-district table from the 'ssenate' column (spelled as passed
# to the helper here).
senate_df = _csis_utils.clean_political(political_df2, 'ssenate')

In [None]:
len(senate_df), senate_df.ctips_id.nunique()

In [None]:
# Build the US House district table from the 'ushouse' column.
ushouse_df = _csis_utils.clean_political(political_df2, 'ushouse')

In [None]:
len(ushouse_df), ushouse_df.ctips_id.nunique()

In [None]:
ushouse_df.ushouse.value_counts().head()

#### Double check

In [None]:
assembly_df.loc[assembly_df.ctips_id == 10900000289]

In [None]:
political_df2.loc[political_df2.ctips_id == 10900000289]

In [None]:
# projects_table.loc[projects_table.ctips_id == 10900000289]

## Save to Excel

In [None]:
# District goes on its own sheet, keyed by ctips_id.
district_df = projects_table.loc[:, ['ctips_id', 'districtid']]

In [None]:
district_df.shape

In [None]:
# Columns to leave off the exported project sheet.
# NOTE(review): 'projcomp_date' has already been dropped from projects_table
# together with the other project date columns, so it will not be present
# when this list is applied. Whether 'agency_name' exists depends on what
# add_agencies produces — confirm.
drop_cols = ['chg_offcl', 'chg_qual1', 'chg_qual2','districtid', 'appdate', 'version','projcomp_date', 'agencyid', 'projectid', 'archive', 'agency_name']

In [None]:
# Drop the bookkeeping/id columns that should not appear on the "project" sheet.
# errors="ignore" because some names in drop_cols are already gone by now:
# 'projcomp_date' is removed from projects_table earlier along with the other
# project date columns, so a strict drop raises KeyError when the notebook is
# run top to bottom.
projects_table2 = projects_table.drop(columns=drop_cols, errors="ignore")

In [None]:
# Fill nulls for export: 0.0 in float64 columns, the string 'None' in object columns.
_project_fill = projects_table2.dtypes.replace({'float64': 0.0, 'object': 'None'})
projects_table2 = projects_table2.fillna(_project_fill)

In [None]:
# Fill nulls for export: 0.0 in float64 columns, the string 'None' in object columns.
_district_fill = district_df.dtypes.replace({'float64': 0.0, 'object': 'None'})
district_df = district_df.fillna(_district_fill)

In [None]:
# Fill nulls for export: 0.0 in float64 columns, the string 'None' in object columns.
_county_fill = county_df.dtypes.replace({'float64': 0.0, 'object': 'None'})
county_df = county_df.fillna(_county_fill)

In [None]:
# Fill nulls in the phase-funding table before export: 0.0 in float64 columns
# and the string 'None' in object columns, like the other sheets.
# The fill values must come from this table's own columns. Building them from
# district_df's dtypes only ever filled columns the two frames share.
# Only float64/object columns are touched, so any datetime columns keep their
# nulls instead of being filled with a mismatched value.
_phase_float_cols = final_phase_funding_table3.select_dtypes(include='float64').columns
_phase_object_cols = final_phase_funding_table3.select_dtypes(include='object').columns
_phase_fill_values = {
    **dict.fromkeys(_phase_float_cols, 0.0),
    **dict.fromkeys(_phase_object_cols, 'None'),
}
final_phase_funding_table3 = final_phase_funding_table3.fillna(_phase_fill_values)

In [None]:
# Fill nulls for export: 0.0 in float64 columns, the string 'None' in object columns.
_awards_fill = awards4.dtypes.replace({'float64': 0.0, 'object': 'None'})
awards4 = awards4.fillna(_awards_fill)

In [None]:
# Fill nulls for export: 0.0 in float64 columns, the string 'None' in object columns.
_ushouse_fill = ushouse_df.dtypes.replace({'float64': 0.0, 'object': 'None'})
ushouse_df = ushouse_df.fillna(_ushouse_fill)

In [None]:
# Fill nulls for export: 0.0 in float64 columns, the string 'None' in object columns.
_senate_fill = senate_df.dtypes.replace({'float64': 0.0, 'object': 'None'})
senate_df = senate_df.fillna(_senate_fill)

In [None]:
# Fill nulls for export: 0.0 in float64 columns, the string 'None' in object columns.
_assembly_fill = assembly_df.dtypes.replace({'float64': 0.0, 'object': 'None'})
assembly_df = assembly_df.fillna(_assembly_fill)

In [None]:
assembly_df.ctips_id.nunique(), awards4.ctips_id.nunique()

In [None]:
final_phase_funding_table3.ctips_id.nunique()

In [None]:
county_df.ctips_id.nunique()

In [None]:
district_df.ctips_id.nunique()

In [None]:

# Write every output table to its own sheet of ./CTIPS.xlsx, in this order and
# without the DataFrame index.
_sheets = {
    "project": projects_table2,
    "district": district_df,
    "county": county_df,
    "phase_funding": final_phase_funding_table3,
    "awards": awards4,
    "us_house": ushouse_df,
    "senate": senate_df,
    "assembly": assembly_df,
}
with pd.ExcelWriter("./CTIPS.xlsx") as writer:
    for _sheet_name, _frame in _sheets.items():
        _frame.to_excel(writer, sheet_name=_sheet_name, index=False)
