## CTIPS
* https://ctips-prod.dot.ca.gov/ctips/LoginMediatorForm.do

### To do
* Ask whether DSHOPP means a draft SHOPP project.
* PROJSCHE: few `projectid` values match the filtered PROJECT table.

In [None]:
import pandas as pd 
import sqlalchemy 
import sys 
import re
import oracledb 

In [None]:
# Register python-oracledb under the module name "cx_Oracle", so that any later
# `import cx_Oracle` resolves to oracledb, and override its reported version.
# NOTE(review): presumably this is for SQLAlchemy's cx_Oracle dialect, with the
# version string set to satisfy a minimum-version check — confirm.
oracledb.version = "8.3.0" 
sys.modules["cx_Oracle"] = oracledb 

In [None]:
# Notebook display settings: up to 100 columns, every row, untruncated cell
# contents, and floats rendered with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:

# SQLAlchemy connection URL for the CTIPS database.
# NOTE(review): DIALECT, USERNAME, PASSWORD, HOST, PORT and SERVICE are not
# defined anywhere in the visible notebook — presumably they were removed before
# sharing. They must be defined before this cell runs.
ENGINE_PATH_WIN_AUTH =  f"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}" 

In [None]:
# Engine shared by every pd.read_sql_query call in this notebook.
engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   

In [None]:
def to_snakecase(df):
    """Lower-case the column labels of ``df`` and swap spaces for underscores.

    The labels are rewritten on ``df`` itself (in place); the same object is
    returned so the call can be chained.
    """
    lowered = df.columns.str.lower()
    df.columns = lowered.str.replace(' ', '_')
    return df

### Project
`project.agencyid` = project sponsor

`implpaed` = implementing agency for PA&ED

`implpse` = implementing agency for PS&E

`implcon` = implementing agency for construction

`implrw` = implementing agency for right of way


In [None]:
# Load the PROJECT columns used in this notebook.
# NOTE(review): later cells de-duplicate on ctips_id, so the table presumably
# holds several rows (versions) per project — confirm.
projects_df = pd.read_sql_query(""" 
SELECT 
appdate, 
archive,
agencyid,
bond99,
cmia,
ctips_id,
const_date,
countyid,
countyid2,
countyid3,
chg_offcl,
chg_qual1,
chg_qual2,
districtid,
document,
docyear,
ea_number,
high_ver,
high_offcl,
implpaed, 
implpse, 
implrw, 
implcon, 
lupdate, 
needpurpose,
progcode1,
ppno,
proj_desc,
postmiles1,
pm1b,
pm2b,
pm3b,
pm1a,
pm2a,
pm3a,
projcomp_date,
projectid,
route1,
route2,
route3,
rtl,
stip,
shopp,
title,
tcif,
tcrpno,
tcrp,
urbanid,
version
FROM ctips.project
""", engine) 

In [None]:
projects_df.shape

In [None]:
projects_df.projectid.nunique()

In [None]:
projects_df.ctips_id.nunique()

### A bit of cleaning

In [None]:
# Fill missing values with a default chosen from each column's dtype:
# 0.0 for float64, the string 'None' for object, 0 for int64.
# The column -> value mapping is built explicitly so that columns of any other
# dtype (e.g. datetimes) are simply left alone; replacing values inside
# `projects_df.dtypes` would leave those columns' dtype objects in the mapping
# and hand them to fillna as fill values.
fill_by_dtype = {'float64': 0.0, 'object': 'None', 'int64': 0}
fill_values = {
    col: fill_by_dtype[str(dtype)]
    for col, dtype in projects_df.dtypes.items()
    if str(dtype) in fill_by_dtype
}
projects_df = projects_df.fillna(fill_values)

In [None]:
# Names of every object-dtype column in projects_df.
# NOTE: the following cell overwrites this list with a hand-picked subset.
string_cols = [name for name, dtype in projects_df.dtypes.items() if dtype == 'object']

In [None]:
# Free-text columns whose letter case and whitespace get normalised.
string_cols = ['needpurpose', 'proj_desc', 'route1', 'title']

In [None]:
# Normalise each free-text column: title-case it, trim both ends, then
# collapse every run of whitespace into a single space.
for col in string_cols:
    cleaned = projects_df[col].str.title().str.strip()
    projects_df[col] = cleaned.replace(r'\s+', ' ', regex=True)

### 1 row = 1 project 
* Open question: do some projects lack a high version (`high_ver`)?

In [None]:
# Order rows so the largest high_offcl / high_ver / archive values come first;
# the de-duplication that follows keeps the first row per project.
version_priority = ['high_offcl', 'high_ver', 'archive']
projects_df2 = projects_df.sort_values(by=version_priority, ascending=False)

In [None]:
# Keep a single row per ctips_id: the first one in the current row order.
projects_df3 = projects_df2.drop_duplicates(subset='ctips_id', keep='first')

In [None]:
# Finished projects are flagged through `archive`; keep only archive == 0.
is_active = projects_df3['archive'] == 0
projects_df3 = projects_df3[is_active]

In [None]:
projects_df3.ctips_id.nunique()

In [None]:
# chg_qual1 == 7 marks a deleted project; drop those rows.
not_deleted = projects_df3['chg_qual1'] != 7
projects_df3 = projects_df3.loc[not_deleted]

In [None]:
len(projects_df3)

#### Open question: does DSHOPP mean draft SHOPP?

In [None]:
projects_df3.document.unique()

### PROJSCHE

In [None]:
# Load the PROJSCHE milestone columns, aliasing the m-codes to readable
# phase begin/end names. projectid is the join key back to PROJECT.
projsche_df = pd.read_sql_query(""" 
SELECT 
projectid,
m020 AS pa_ed_begin,
m200a AS pa_ed_end,
m200b AS ps_e_begin,
m224 AS begin_row,
m410 AS end_row,
m500 AS con_start_date,
m600 AS con_end_date,
m700 AS begin_closeout,
m800 AS end_closeout
FROM ctips.projsche
""", engine) 

In [None]:
projsche_df.shape

In [None]:
projsche_df.info()

In [None]:
# Start from every PROJSCHE column name; the next cell removes 'projectid'
# so that only the date columns remain.
projsche_drop_cols = projsche_df.columns.tolist()

In [None]:
projsche_drop_cols.remove('projectid')

In [None]:
# Discard schedule rows in which every date column is empty, then renumber
# the index from zero.
projsche_df2 = projsche_df.dropna(subset=projsche_drop_cols, how="all")
projsche_df2 = projsche_df2.reset_index(drop=True)

In [None]:
len(projsche_df2), len(projsche_df)

In [None]:
projsche_df2.projectid.nunique()

In [None]:
projsche_df2.info()

#### Few `projectid` values match between PROJSCHE and PROJECT

In [None]:
# Count projectids found in both tables versus only one of them, using the
# schedule rows that have at least one date.
overlap_nonempty = projsche_df2.merge(projects_df3, on='projectid', how='outer', indicator=True)
overlap_nonempty[['_merge']].value_counts()

In [None]:
# Same overlap count, this time against the unfiltered schedule table.
overlap_all = projsche_df.merge(projects_df3, on='projectid', how='outer', indicator=True)
overlap_all[['_merge']].value_counts()

In [None]:
# Attach schedule dates to every remaining project; projects without a
# schedule row are kept, with empty date columns.
m1 = projects_df3.merge(projsche_df2, on='projectid', how='left')

### AGENCY

In [None]:
# Lookup table: agencyid -> agency name (column aliased to agency_name).
agency_df = pd.read_sql_query(""" 
SELECT 
name AS agency_name,
agencyid
FROM ctips.agncy
""", engine) 

In [None]:
pd.merge(m1, agency_df, on ='agencyid', how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# Add the sponsoring agency's name to each project row.
m2 = m1.merge(agency_df, on='agencyid', how='left')

In [None]:
# Sponsor and implementing-agency columns, used to preview agency data.
agency_cols = [
    'agencyid',
    'agency_name',
    'implpaed',
    'implpse',
    'implrw',
    'implcon',
]

In [None]:
m2.loc[m2.implpaed != "None"][agency_cols].sample()

In [None]:
m2.loc[m2.implrw != "None"][agency_cols].head()

### COUNTY

In [None]:
# Lookup table: countyid -> county name (column aliased to county_name).
county_df = pd.read_sql_query(""" 
SELECT 
name AS county_name,
countyid
FROM ctips.county
""", engine) 

In [None]:
# Add the county name for the project's countyid column.
m3 = m2.merge(county_df, on='countyid', how='left')

In [None]:
m3.sample()

### FUNDLINE
* `action` codes: P = programmed, V = vote, A = award

In [None]:
# Load the FUNDLINE amount columns together with their action code
# (P = programmed, V = vote, A = award, per the note in this notebook).
# fundtypeid links each funding line to FUNDTYPE.
fundline_df = pd.read_sql_query(""" 
SELECT 
action,
con,
rw,
pe_paed,
pe_env,
pe_rw,
pe_con,
pe_total,
fundlineid,
fundtypeid,
line_year,
actiondate
FROM ctips.fundline
""", engine) 

In [None]:
fundline_df.fundlineid.nunique()

In [None]:
fundline_df.fundlineid.value_counts().head()

In [None]:
fundline_df.fundtypeid.nunique()

In [None]:
fundline_df.fundtypeid.value_counts().sample(5)

In [None]:
len(fundline_df)

In [None]:
fundline_df.action.value_counts()

In [None]:
fundline_df.loc[fundline_df.fundtypeid == 20700009194]

### Fundtype
* Fundtype.agencyid = funding agency

In [None]:
# Load FUNDTYPE: the bridge between a project (projectid) and its funding
# lines (fundtypeid), plus the fund / program identifiers. Here agencyid is
# the funding agency, not the project sponsor.
fundtype_df = pd.read_sql_query(""" 
SELECT 
fundtypeid,
fundid,
progcode,
programid,
projectid,
agencyid
FROM ctips.fundtype
""", engine) 

In [None]:
fundtype_df.shape

In [None]:
fundtype_df.fundid.nunique()

In [None]:
fundtype_df.projectid.nunique()

In [None]:
fundtype_df.fundtypeid.nunique()

In [None]:
fundtype_df.fundtypeid.value_counts().head()

### Do the merges
#### Merge fundtype and fundline

In [None]:
# Check how fundtypeid values overlap between FUNDTYPE and FUNDLINE.
fund_overlap = fundtype_df.merge(fundline_df, on='fundtypeid', how='outer', indicator=True)
fund_overlap[['_merge']].value_counts()

In [None]:
# Attach funding lines to their fund type; fund types without any funding
# line are kept, with empty amount columns.
fund_m1 = fundtype_df.merge(fundline_df, on='fundtypeid', how='left')

In [None]:
len(fund_m1)

In [None]:
fund_m1.head()

In [None]:
fund_m1.projectid.nunique(), fund_m1.fundtypeid.nunique()

In [None]:
fund_m1.fundtypeid.value_counts().head()

In [None]:
fund_m1.action.value_counts()

#### Merge subset of project with the merge above

In [None]:
# Identifier and version columns carried from the project table into the
# funding merge.
project_preview = [
    'ctips_id',
    'projectid',
    'high_ver',
    'high_offcl',
]

In [None]:
# Keep only funding rows that belong to a project still present in m3, and
# only projects that have funding rows (inner join on projectid).
project_keys = m3[project_preview]
fundtype_m2 = project_keys.merge(fund_m1, on='projectid', how='inner')

In [None]:
fundtype_m2.projectid.value_counts().head()

In [None]:
fundtype_m2.projectid.value_counts().describe()

In [None]:
# Extend the preview columns with the project title (builds a new list).
project_preview = [*project_preview, 'title']

In [None]:
fundtype_m2.columns

#### Aggregate

In [None]:
# Every dollar-amount column is aggregated with a sum.
amount_cols = ['con', 'rw', 'pe_paed', 'pe_env', 'pe_rw', 'pe_con', 'pe_total']
columns_to_agg = {col: 'sum' for col in amount_cols}


In [None]:
columns_to_agg

In [None]:
# Sum the dollar amounts per project (ctips_id) and funding source/program.
# dropna=False keeps funding rows whose fundid, progcode or programid is
# missing. With the groupby default (dropna=True) any row with a NaN key is
# silently discarded, so its amounts would be lost from the per-project totals
# that are later computed from this frame.
funding_keys = ['ctips_id', 'fundid', 'progcode', 'programid']
total_cost = (
    fundtype_m2.groupby(funding_keys, dropna=False)
    .agg(columns_to_agg)
    .reset_index()
)

In [None]:
total_cost.sample(3)

In [None]:
total_cost.con.describe()

In [None]:
total_cost.ctips_id.value_counts().head()

In [None]:
len(total_cost), total_cost.ctips_id.nunique()

In [None]:
total_cost.loc[total_cost.ctips_id == 20600003977]

### Progmain

In [None]:
# Lookup table: programid -> program category (column aliased to program).
progmain_df = pd.read_sql_query(""" 
SELECT 
programid,
category AS program
FROM ctips.progmain
""", engine) 

In [None]:
progmain_df.head()

### Fund

In [None]:
# Lookup table: fundid -> fund.
fund_df = pd.read_sql_query(""" 
SELECT 
fund,
fundid
FROM ctips.fund
""", engine) 

### Progsub

In [None]:
# Lookup table: progcode -> program description.
# A progcode can appear more than once here; the notebook de-duplicates it
# before merging.
progsub_df = pd.read_sql_query(""" 
SELECT 
progcode,
progdesc
FROM ctips.progsub
""", engine) 

In [None]:
progsub_df.head(1)

In [None]:
progsub_df.shape

In [None]:
progsub_df.progcode.nunique()

In [None]:
# progcodes singled out for inspection in the following cell.
# NOTE(review): the name suggests these occur more than once in PROGSUB — confirm.
double_ids = [
    '20.30.010.820',
    '20.XX.723.000',
    '20.30.010.810',
    '20.XX.720.100',
    '20.30.010.817',
    '20.30.210.200',
]

In [None]:
progsub_df.loc[progsub_df.progcode.isin(double_ids)].sort_values('progcode')

In [None]:
# Keep one description per progcode (the first row encountered) so the later
# merge on progcode cannot multiply rows.
progsub_df2 = progsub_df.drop_duplicates(subset='progcode', keep='first')

In [None]:
len(progsub_df)

In [None]:
len(progsub_df2)

#### Merge

In [None]:
# Decorate the per-funding-source totals with readable labels: program
# category, fund, and program description.
with_program = total_cost.merge(progmain_df, on='programid', how='left')
with_fund = with_program.merge(fund_df, on='fundid', how='left')
final_fin_df = with_fund.merge(progsub_df2, on='progcode', how='left')

In [None]:
# Drop the id/code columns from the financial table.
id_cols = ['fundid', 'progcode', 'programid']
final_fin_df = final_fin_df.drop(columns=id_cols)

In [None]:
final_fin_df.sample(3)

In [None]:
# final_fin_df is built from the ctips_id-level aggregation and has no
# projectid column (accessing it raises AttributeError), so count the rows
# per ctips_id instead.
final_fin_df.ctips_id.value_counts().head()

In [None]:
# final_fin_df has no projectid column (it is aggregated by ctips_id), so the
# row count per project has to use ctips_id.
final_fin_df.ctips_id.value_counts().head()

#### Find Total Cost
##### To clarify: FTIP projects have a `pe_total` value, so figure out how to identify the FTIP projects and sum those values

In [None]:
# Collapse to one row per project by summing across all funding sources.
per_project = final_fin_df.groupby('ctips_id')
total_cost = per_project.agg(columns_to_agg).reset_index()

In [None]:
# pe_test = total_cost.loc[(total_cost.pe_con != 0) & (total_cost.pe_env != 0) & (total_cost.pe_rw != 0) & (total_cost.pe_paed != 0)& (total_cost.pe_total != 0)]

In [None]:
# Project total = construction + right of way + the four PE component
# amounts. pe_total is not part of this sum.
cost_parts = ['con', 'rw', 'pe_paed', 'pe_env', 'pe_rw', 'pe_con']
running_total = total_cost[cost_parts[0]]
for part in cost_parts[1:]:
    running_total = running_total + total_cost[part]
total_cost['total_cost'] = running_total

In [None]:
# 6,638,471,000
total_cost['total_cost'].describe()

In [None]:
total_cost.sort_values(by = ['total_cost'], ascending = False).head()

In [None]:
len(total_cost), total_cost.ctips_id.nunique()

In [None]:
len(m3), m3.ctips_id.nunique()

#### Agency name missing?

In [None]:
m3.loc[m3.ctips_id == 20600002404]

### Political

In [None]:
# Load the district columns per project and keep only the rows in which every
# selected column is populated.
# NOTE(review): presumably assembly01 / ushouse01 / ssenate01 are the State
# Assembly, US House and State Senate districts — confirm.
political_df = pd.read_sql_query(""" 
SELECT 
assembly01,
ushouse01,
ssenate01,
projectid
FROM ctips.politcal
""", engine).dropna(how="any")

In [None]:
pd.merge(m3, political_df, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# Add the district columns to each project; projects without a complete
# district row are kept, with those columns empty.
m4 = m3.merge(political_df, on='projectid', how='left')