## CTIPS
* https://ctips-prod.dot.ca.gov/ctips/LoginMediatorForm.do

In [None]:
import pandas as pd 
import sqlalchemy 
import sys 
import re
import oracledb 

In [None]:
# Register python-oracledb under the legacy module name "cx_Oracle" and override
# its reported version string.
# NOTE(review): presumably this lets SQLAlchemy's cx_Oracle-based dialect load
# oracledb and pass its minimum-version check — confirm against the installed
# SQLAlchemy version.
oracledb.version = "8.3.0" 
sys.modules["cx_Oracle"] = oracledb 

In [None]:
# Notebook display settings: show up to 100 columns, never truncate rows or cell
# contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:

# Oracle connection URL handed to sqlalchemy.create_engine.
# DIALECT, USERNAME, PASSWORD, HOST, PORT and SERVICE are not defined anywhere in
# the visible notebook, so they must be set before this cell runs.
# NOTE(review): presumably the credentials were removed before sharing — confirm,
# and keep them out of version control.
ENGINE_PATH_WIN_AUTH =  f"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}" 

In [None]:
engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   

In [None]:
def to_snakecase(df):
    """Lower-case the column labels of *df* and replace spaces with underscores.

    The labels are rewritten in place on the frame that was passed in, and that
    same frame is returned so the call can be chained.
    """
    lowered = df.columns.str.lower()
    df.columns = lowered.str.replace(' ','_')
    return df

### Project 
['projectid',
 'ctips_id',
 'countyid',
 'docyear',
 'document',
 'districtid',
 'ea_number',
 'high_ver',
 'mpoid',
 'official',
 'ppno',
 'version',
 'adminappv',
 'agencyid',
 'airco',
 'aircode',
 'appdate',
 'approve',
 'aprv_fed',
 'aprv_mpo',
 'aprv_state',
 'award_fa',
 'basinid',
 'bridge_num',
 'capinc',
 'chg_offcl',
 'chg_qual1',
 'chg_qual2',
 'comnt_dist',
 'comnt_mpo',
 'comnt_senr',
 'contrib',
 'corridor',
 'ct_candt',
 'deny_fed',
 'deny_state',
 'proj_desc',
 'docprint',
 'elementid',
 'env',
 'env_doc',
 'ext_xfdt',
 'ext_xfer',
 'flag_a',
 'flag_b',
 'flag_c',
 'flag_d',
 'flag_e',
 'ftipinfocg',
 'ftip_amend',
 'ftip_xfer',
 'ftip_xfdt',
 'kp1a',
 'kp1af',
 'kp1b',
 'kp1bf',
 'kplength1',
 'law',
 'lupdate',
 'lupdateid',
 'maintenace',
 'miscfield1',
 'miscfield2',
 'miscfield3',
 'miscfield4',
 'mpo',
 'mpoupdate',
 'northsouth',
 'no_pesplit',
 'p116prj',
 'perfmeaval',
 'perfmeacod',
 'pm1a',
 'pm1af',
 'pm1b',
 'pm1bf',
 'pmcfon',
 'pmpfon',
 'pmpfonarea',
 'postmiles1',
 'prjmgr',
 'prjmgrphon',
 'puc',
 'route1',
 'sourceorig',
 'stip_amend',
 'system',
 'systemid',
 'tciprj',
 'title',
 'tollbridge',
 'urbanid',
 'const_date',
 'fund_res',
 'aqexemptid',
 'sj_groupid',
 'cost_incr',
 'scope_incr',
 'cost_decr',
 'scope_decr',
 'sch_delay',
 'sch_adv',
 'fnd_src_ch',
 'compnt_chg',
 'split_comb',
 'other_chg',
 'fco',
 'comnt_ct',
 'comnt_fed',
 'ctc_mtg_dt',
 'extwaivno',
 'extappvdte',
 'prjsplit',
 'prjcomb',
 'awd_extwaivno',
 'awd_extappvdte',
 'bookitem',
 'pmeas_val2',
 'pmeas_cod2',
 'pmeas_val3',
 'pmeas_cod3',
 'high_offcl',
 'rtpaid',
 'archive',
 'sponsorid',
 'stip_xfer',
 'shopp_xfer',
 'full_fnding',
 'rtl',
 'tcm1',
 'tcm2',
 'progcode1',
 'progcode2',
 'progcode3',
 'progcode4',
 'pmeas_val4',
 'pmeas_cod4',
 'needpurpose',
 'stip',
 'shopp',
 'tcif',
 'cmia',
 'bond99',
 'ports',
 'tcrpno',
 'tcrp',
 'implpaed',
 'implpse',
 'implrw',
 'implcon',
 'ccc',
 'countyid2',
 'route2',
 'pm2bf',
 'pm2b',
 'pm2af',
 'pm2a',
 'countyid3',
 'route3',
 'pm3bf',
 'pm3b',
 'pm3af',
 'pm3a',
 'pmemail',
 'postmile2',
 'postmile3',
 'slpp',
 'tlsp',
 'efisprojid',
 'ppr_ada',
 'ppr_sustain',
 'ppr_reduce_ghg',
 'ppr_bike',
 'ppr_rev_lane',
 'ppr_benefits',
 'g13',
 'longlead',
 'sb1_base',
 'route1s',
 'pm1bs',
 'pm1as',
 'route2s',
 'pm2bs',
 'pm2as',
 'route3s',
 'pm3bs',
 'pm3as',
 'ppr_add_info',
 'ppr_nhs',
 'ppr_road_class',
 'asset_class',
 'amendment',
 'ppr_finalized_date',
 'programs',
 'ppr_id',
 'co_nom_agencies',
 'fund_programs',
 'exp_extappvdte',
 'exp_extwaivno',
 'comp_extappvdte',
 'comp_extwaivno',
 'projcomp_date',
 'air_quality_approval']

In [None]:
#projects_df = pd.read_sql_query(""" 
#SELECT 
#*
#FROM ctips.project
#WHERE archive = 0
# """, engine) 

In [None]:
# list(projects_df.columns)

In [None]:
# Load the working subset of columns from ctips.project, restricted to rows
# where archive = 0.
projects_df = pd.read_sql_query(""" 
SELECT 
appdate, 
archive,
agencyid,
bond99,
cmia,
ctips_id,
const_date,
countyid,
countyid2,
countyid3,
chg_offcl,
chg_qual1,
chg_qual2,
districtid,
ea_number,
high_ver,
high_offcl,
implpaed, 
implpse, 
implrw, 
implcon, 
lupdate, 
needpurpose,
progcode1,
ppno,
proj_desc,
postmiles1,
pm1b,
pm2b,
pm3b,
pm1a,
pm2a,
pm3a,
projcomp_date,
projectid,
route1,
route2,
route3,
rtl,
stip,
shopp,
title,
tcif,
tcrpno,
tcrp,
urbanid
FROM ctips.project
WHERE archive = 0
""", engine) 

In [None]:
projects_df.shape

In [None]:
projects_df.projectid.nunique()

In [None]:
projects_df.ctips_id.nunique()

### A bit of cleaning

In [None]:
# Fill missing values with a per-dtype default: 0.0 for float64, the string 'None'
# for object and 0 for int64. The fill mapping is built only for columns of those
# three dtypes, so columns of any other dtype (e.g. datetimes) are left untouched
# rather than being handed a dtype object as their fill value.
fill_by_dtype = {'float64': 0.0, 'object': 'None', 'int64': 0}
projects_df = projects_df.fillna({
    col: fill_by_dtype[str(dtype)]
    for col, dtype in projects_df.dtypes.items()
    if str(dtype) in fill_by_dtype
})

In [None]:
string_cols = [col for col in projects_df.columns if projects_df[col].dtype == 'object']

In [None]:
# Curated list of free-text columns to normalize. This reassignment replaces the
# dtype-derived `string_cols` list built in the previous cell.
string_cols = [
 'needpurpose',
 'proj_desc',
 'route1',
 'title']

In [None]:
# Normalize each free-text column: title-case it, trim leading and trailing
# whitespace, then collapse every internal run of whitespace to a single space.
for col in string_cols:
    cleaned = projects_df[col].str.title().str.strip()
    projects_df[col] = cleaned.replace(r'\s+', ' ', regex=True)

### Looking at duplicate projects

In [None]:
projects_df.title.value_counts().head()

In [None]:
duplicate_projects = projects_df.groupby(['ctips_id','title', 'proj_desc','districtid','agencyid']).agg({'projectid':'nunique'
}).reset_index()

In [None]:
duplicate_projects.projectid.describe()

In [None]:
# NOTE(review): .sample(3) draws rows at random, so the preceding sort does not
# decide which rows are shown — use .head(3) if the largest counts were intended.
duplicate_projects.sort_values(['projectid'], ascending = False).sample(3)

### 1 row = 1 project 
* Some projects don't have a high version?

In [None]:
# Order rows so that, for any ctips_id, the row with the largest high_ver (ties
# broken by the largest high_offcl) comes first; the later drop_duplicates relies
# on this ordering.
projects_df2 = projects_df.sort_values(by = ['high_ver','high_offcl'], ascending = [False, False])

In [None]:
projects_df2.ctips_id.value_counts().head()

In [None]:
# projects_df.query('ctips_id == 10600001907').drop(columns = ['proj_desc'])

In [None]:
# projects_df2.query('ctips_id == 10100000003').drop(columns = ['proj_desc'])

In [None]:
# projects_df2.query('ctips_id == 10600001907')

In [None]:
# Keep one row per ctips_id. drop_duplicates keeps the first occurrence, which,
# given the sort applied to projects_df2, is the row with the highest
# high_ver/high_offcl. The index is reset, so projects_df3 no longer shares row
# labels with projects_df2.
projects_df3 = projects_df2.drop_duplicates(subset = ['ctips_id']).reset_index(drop = True)

In [None]:
# Count deduplicated projects with high_ver == 0 and high_offcl == 0. Both
# conditions are taken from projects_df3: projects_df2 carries a different index,
# so mixing the two frames would compare values from unrelated rows.
len(projects_df3.loc[(projects_df3.high_ver == 0) & (projects_df3.high_offcl == 0)])

In [None]:
# Count deduplicated projects with high_ver == 1 and high_offcl == 1. Both
# conditions are taken from projects_df3: projects_df2 carries a different index,
# so mixing the two frames would compare values from unrelated rows.
len(projects_df3.loc[(projects_df3.high_ver == 1) & (projects_df3.high_offcl == 1)])

In [None]:
# Count deduplicated projects with high_ver == 1 and high_offcl == 0. Both
# conditions are taken from projects_df3: projects_df2 carries a different index,
# so mixing the two frames would compare values from unrelated rows.
len(projects_df3.loc[(projects_df3.high_ver == 1) & (projects_df3.high_offcl == 0)])

In [None]:
# Count deduplicated projects with high_ver == 0 and high_offcl == 1. Both
# conditions are taken from projects_df3: projects_df2 carries a different index,
# so mixing the two frames would compare values from unrelated rows.
len(projects_df3.loc[(projects_df3.high_ver == 0) & (projects_df3.high_offcl == 1)])

In [None]:
# Deduplicated projects with high_ver == 0 and high_offcl == 0. Both conditions
# are taken from projects_df3: projects_df2 carries a different index, so mixing
# the two frames would compare values from unrelated rows.
no_high_version = projects_df3.loc[(projects_df3.high_ver == 0) & (projects_df3.high_offcl == 0)]

In [None]:
no_high_version.ctips_id.unique()

In [None]:
no_high_version.ctips_id.nunique()

In [None]:
len(projects_df2)

In [None]:
len(projects_df3)

In [None]:
len(projects_df3) == projects_df3.ctips_id.nunique()

In [None]:
projects_df3.query('ctips_id == 10600001907')

In [None]:
projects_df2.query('ctips_id == 10600001907')

#### Note: if digits 2 and 3 of the CTIPS_ID are 30, the project is a “Rural Non-MPO” project (within the rural non-MPO portion of the state).

In [None]:
# Take the 2nd and 3rd characters of ctips_id (string positions 1-2) and store
# them as an integer; despite the column name, ctips_string holds ints.
# NOTE(review): assumes every ctips_id renders as a string with digits in those
# positions — confirm there are no short or missing ids.
projects_df3['ctips_string'] = projects_df3.ctips_id.astype(str).str[1:3].astype(int)

In [None]:
def urban_or_rural(row):
    """Classify a project row from its ``ctips_string`` value.

    Returns "Rural Non-MPO" when ``row.ctips_string`` equals 30 and "Urban" for
    every other value.
    """
    is_rural = row.ctips_string == 30
    return "Rural Non-MPO" if is_rural else "Urban"


# Label every project row by passing it to urban_or_rural.
projects_df3["urban_or_rural"] = projects_df3.apply(urban_or_rural, axis=1)

In [None]:
projects_df3[['ctips_id','ctips_string', 'urban_or_rural', 'appdate']].sample(10)

In [None]:
projects_df3.urban_or_rural.value_counts()

In [None]:
projects_df3.high_ver.value_counts()

In [None]:
projects_df3.high_offcl.value_counts()

### PROJSCHE

In [None]:
projsche_df = pd.read_sql_query(""" 
SELECT 
projectid,
m020 AS pa_ed_begin,
m200a AS pa_ed_end,
m200b AS ps_e_begin,
m224 AS begin_row,
m410 AS end_row,
m500 AS con_start_date,
m600 AS con_end_date,
m700 AS begin_closeout,
m800 AS end_closeout
FROM ctips.projsche
""", engine) 

In [None]:
projsche_df.columns

#### 2nd question: drop every row in which all of the date values are empty (there are many of them)

In [None]:
projsche_df.info()

In [None]:
projsche_drop_cols = list(projsche_df.columns)

In [None]:
projsche_drop_cols.remove('projectid')

In [None]:
projsche_drop_cols

In [None]:
# I want to drop the rows in which ALL values in the date columns are empty
projsche_df2 = projsche_df.dropna(how = "all", subset = projsche_drop_cols).reset_index(drop = True)

In [None]:
len(projsche_df2), len(projsche_df)

In [None]:
projsche_df2.projectid.nunique()

In [None]:
projsche_df2.sample(10)

#### Not a lot of matching values

In [None]:
pd.merge(projsche_df2, projects_df3, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
m1 = pd.merge(projects_df3, projsche_df2,  on ='projectid', how = 'left')

### AGENCY

In [None]:
agency_df = pd.read_sql_query(""" 
SELECT 
name AS agency_name,
agencyid
FROM ctips.agncy
""", engine) 

In [None]:
agency_df.shape

In [None]:
agency_df.agencyid.nunique()

In [None]:
agency_df.head()

In [None]:
pd.merge(m1, agency_df, on ='agencyid', how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
m2 = pd.merge(m1, agency_df,  on ='agencyid', how = 'left')

### COUNTY

In [None]:
county_df = pd.read_sql_query(""" 
SELECT 
name AS county_name,
countyid
FROM ctips.county
""", engine) 

In [None]:
county_df.head()

In [None]:
county_df.shape

In [None]:
county_df.countyid.nunique()

In [None]:
m3 = pd.merge(m2, county_df,  on ='countyid', how = 'left')

In [None]:
# NOTE(review): `stop` is not defined in the visible notebook, so this cell raises
# NameError — presumably a deliberate way to halt "Run All" here; confirm.
stop

### FUNDLINE
* Action codes: P = programmed, V = vote, A = award

In [None]:
fundline_df = pd.read_sql_query(""" 
SELECT 
action,
con,
pe_paed,
pe_env,
pe_rw,
pe_con,
fundlineid,
fundtypeid,
line_year,
actiondate
FROM ctips.fundline
""", engine) 

In [None]:
fundline_df.shape

In [None]:
fundline_df.head()

In [None]:
fundline_df.info()

In [None]:
# NOTE(review): `stop` is not defined in the visible notebook, so this cell raises
# NameError — presumably a deliberate way to halt "Run All" here; confirm.
stop

### Fundtype
* Fundtype.agencyid = funding agency

In [None]:
fundtype_df = pd.read_sql_query(""" 
SELECT 
fundtypeid,
fundid,
progcode,
programid,
projectid,
agencyid
FROM ctips.fundtype
""", engine) 

In [None]:
fundtype_df.shape

In [None]:
fundtype_df.fundid.nunique()

In [None]:
fundtype_df.projectid.nunique()

In [None]:
fundtype_df.fundtypeid.nunique()

In [None]:
fundtype_df.columns

#### Is this the correct join? 

In [None]:
# Left-join fundline rows onto fundtype by fundtypeid. projectid is dropped from
# fundtype first, so the result carries no project key of its own.
fundtype_m1 = pd.merge(fundtype_df.drop(columns = ['projectid']),fundline_df,  on = ['fundtypeid'], how = "left")

In [None]:
m3.projectid.nunique(), len(m3)

In [None]:
project_preview = ['ctips_id','projectid', 'high_ver', 'high_offcl']

In [None]:
# Attach the project identifiers in project_preview to each fundtype row.
# fundtypeid is carried along with projectid because the later merge of
# fundtype_m1 with fundtype_m2 joins on fundtypeid; the row count is unchanged.
fundtype_m2 = pd.merge(m3[project_preview], fundtype_df[['projectid', 'fundtypeid']], on = ['projectid'], how = "inner")

In [None]:
fundtype_m2.shape

In [None]:
fundtype_m2.projectid.value_counts().head()

In [None]:
fundtype_m2.projectid.value_counts().describe()

In [None]:
fundtype_m1.columns

In [None]:
fundtype_m2.columns

In [None]:
final_fund_merge = pd.merge(fundtype_m1, fundtype_m2, on = ['fundtypeid'], how = "inner")

In [None]:
final_fund_merge.shape

In [None]:
final_fund_merge.projectid.value_counts().head()

#### Test

In [None]:
m3.query('projectid == 20700006229').drop(columns = ['proj_desc'])[['projectid','ctips_id', 'high_ver', 'high_offcl']]

In [None]:
# Number of fundtype rows matched to this project. No sort is applied: fundtype_m2
# has no fundid column to sort on, and row order does not affect .shape.
fundtype_m2.loc[fundtype_m2.projectid == 20700006229].shape

In [None]:
final_fund_merge.columns

In [None]:
final_fund_merge.loc[final_fund_merge.projectid == 20700006229].sort_values(['fundid'])

### Progmain

In [None]:
progmain_df = pd.read_sql_query(""" 
SELECT 
programid,
category
FROM ctips.progmain
""", engine) 

In [None]:
progmain_df.head()

In [None]:
progmain_df = progmain_df.rename(columns = {'category':'program'})

In [None]:
progmain_df.programid.nunique()

In [None]:
progmain_df.programid.shape

### Fund

In [None]:
fund_df = pd.read_sql_query(""" 
SELECT 
fund,
fundid
FROM ctips.fund
""", engine) 

### Progsub

In [None]:
progsub_df = pd.read_sql_query(""" 
SELECT 
progcode,
progdesc
FROM ctips.progsub
""", engine) 

In [None]:
progsub_df.head(1)

In [None]:
progsub_df.shape

In [None]:
progsub_df.progcode.nunique()

In [None]:
progsub_df.progcode.value_counts().head()

In [None]:
progsub_df.loc[progsub_df.progcode == "20.30.720.100"]

### Merge

In [None]:
fundline_m1.columns

In [None]:
# Add program, fund and program-code descriptions through three left joins.
# NOTE(review): fundline_m1 is not defined anywhere in the visible notebook;
# presumably final_fund_merge (which carries programid, fundid and progcode) was
# intended — confirm before running.
fundline_m2 = (fundline_m1.merge(progmain_df, on = ['programid'], how = "left")
              .merge(fund_df, on =['fundid'], how = "left")
              .merge(progsub_df, on = ['progcode'], how = 'left'))

In [None]:
fundline_m2.sample(3)

In [None]:
fundline_m2.projectid.value_counts().head()

In [None]:
fundline_m2.projectid.value_counts().describe()

### Political

In [None]:
# Load the assembly01, ushouse01 and ssenate01 columns per projectid from
# ctips.politcal.
political_df = pd.read_sql_query(""" 
SELECT 
assembly01,
ushouse01,
ssenate01,
projectid
FROM ctips.politcal
""", engine) 
# Drop any rows with nulls
political_df = political_df.dropna(how = "any")
# Print the match diagnostics explicitly: as a bare expression in the middle of
# the cell, the result would be computed and then discarded.
print(pd.merge(m3, political_df, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts())
# Attach the political columns to the project table, keeping every project row.
m4 = pd.merge(m3, political_df, on ='projectid', how = 'left')