## CTIPS
* https://ctips-prod.dot.ca.gov/ctips/LoginMediatorForm.do

In [None]:
import pandas as pd 
import sqlalchemy 
import sys 
import re
import oracledb 

In [None]:
# Register python-oracledb under the module name "cx_Oracle" so that any later
# `import cx_Oracle` resolves to oracledb.
# NOTE(review): presumably this is so SQLAlchemy's cx_Oracle dialect uses
# oracledb, and the version string is overridden to satisfy a minimum-version
# check — confirm against the installed SQLAlchemy release.
oracledb.version = "8.3.0" 
sys.modules["cx_Oracle"] = oracledb 

In [None]:
# Notebook display settings: show up to 100 columns, print floats with two
# decimals, and never truncate the number of rows or the width of a cell.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Use SB1_READONLY login
# DIALECT, USERNAME, PASSWORD, HOST, PORT and SERVICE must already be defined
# when this cell runs; they are not assigned anywhere in the notebook as shown.
# The credentials are percent-encoded because characters such as "@", ":" or
# "/" inside a username or password would otherwise be read as URL delimiters.
from urllib.parse import quote_plus

ENGINE_PATH_WIN_AUTH = (
    f"{DIALECT}://{quote_plus(USERNAME)}:{quote_plus(PASSWORD)}"
    f"@{HOST}:{PORT}/?service_name={SERVICE}"
)

In [None]:
engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   

In [None]:
def to_snakecase(df):
    """Lower-case the column labels of ``df`` and turn spaces into underscores.

    The column index of ``df`` is replaced in place, and the same DataFrame
    object is returned for convenience.
    """
    lowered = df.columns.str.lower()
    df.columns = lowered.str.replace(' ', '_')
    return df

### Project 
* Base Table
* Ask if someone can help me fill out the DDP6/DDP8 with the columns I wish for?
* Include FULL_FUNDING column to indicate full funding of a project. 
* ARCHIVE column flags a project as being completed and archived
* How does PROJECTID differ from CTIPS_ID?
    * Check # of Project ID is the same/different than # of CTIPS-id
* Multiple rows for the same project with version: HIGH_VER indicates latest version of a project


To Do
* Clean up strings

In [None]:
# Load the base project table: only the columns listed below are selected from
# PROJECT, with no row filtering done in SQL (versions and archived rows are
# all returned and handled later in pandas).
# NOTE(review): the notes above refer to a FULL_FUNDING column, while the query
# selects `full_fnding` — confirm the column name against the data dictionary.
projects_df = pd.read_sql_query(""" 
SELECT 
archive,
agencyid,
bond99,
cmia,
countyid,
ctips_id,
const_date,
districtid,
ea_number,
high_ver,
lupdate, 
full_fnding,
needpurpose,
progcode1,
ppno,
proj_desc,
postmiles1,
projcomp_date,
projectid,
ports,
route1,
stip,
shopp,
title,
tcif,
tcrpno,
tcrp,
urbanid
FROM project
""", engine) 

In [None]:
projects_df.shape

In [None]:
projects_df.ctips_id.nunique()

In [None]:
projects_df.archive.value_counts()

In [None]:
projects_df.info()

In [None]:
projects_df.ctips_id.nunique()

In [None]:
projects_df.projectid.nunique()

### A bit of cleaning

In [None]:
# Fill missing values with a default chosen from each column's dtype:
# 0.0 for float64, 0 for int64 and the literal string 'None' for object columns.
# Only columns with one of these three dtypes get a fill value; every other
# column (for example datetimes) is left untouched. Building the mapping
# explicitly avoids handing fillna a dtype object as the "fill value" for
# columns whose dtype is not listed.
fill_defaults = {'float64': 0.0, 'object': 'None', 'int64': 0}
fill_values = {
    column: fill_defaults[str(dtype)]
    for column, dtype in projects_df.dtypes.items()
    if str(dtype) in fill_defaults
}
projects_df = projects_df.fillna(value=fill_values)

In [None]:
# Names of every column whose dtype is object.
# Note: string_cols is reassigned to a hand-picked list in a later cell.
string_cols = [col for col in projects_df.columns if projects_df[col].dtype == 'object']

In [None]:
projects_df.info()

In [None]:
# Free-text columns whose casing and whitespace get normalised below.
string_cols = ['needpurpose', 'proj_desc', 'route1', 'title']

In [None]:
# Title-case each free-text column, trim both ends, then squeeze every run of
# whitespace down to a single space.
for col in string_cols:
    cleaned = projects_df[col].str.title().str.strip()
    projects_df[col] = cleaned.replace(r'\s+', ' ', regex=True)

### Looking at duplicate projects

In [None]:
projects_df.title.value_counts().head()

In [None]:
# Count the distinct projectids behind every (title, description, district,
# agency) combination; a count above 1 means several ids share those values.
duplicate_keys = ['title', 'proj_desc', 'districtid', 'agencyid']
duplicate_projects = (
    projects_df.groupby(duplicate_keys)['projectid']
    .nunique()
    .reset_index()
)

In [None]:
duplicate_projects.projectid.describe()

In [None]:
duplicate_projects.sort_values(['projectid'], ascending = False).head()

### Why is the `ctips_id` the same but `projectid` differs?
* How can there be four "1" if there is only one high version?

In [None]:
# Pull every row of one example project (matched on title, district and agency)
# to inspect how its versions and ids relate.
has_title = projects_df['title'] == "Grouped Projects For Bicycle And Pedestrian Facili"
in_district = projects_df['districtid'] == "11"
from_agency = projects_df['agencyid'] == '6066'
one_project = projects_df.loc[has_title & in_district & from_agency]

In [None]:
one_project.high_ver.value_counts()

In [None]:
one_project.ctips_id.unique()

In [None]:
one_project.ctips_id.nunique()

In [None]:
one_project.projectid.nunique()

In [None]:
# projects_df.needpurpose.value_counts().head()

In [None]:
one_project.loc[one_project.high_ver == 1][['ctips_id']].drop_duplicates()

In [None]:
one_project.loc[one_project.high_ver == 1][['proj_desc']].drop_duplicates()

In [None]:
one_project.loc[one_project.high_ver == 1]

### Keep only the high_ver/filter out archive
* Archive: FTIP: Flags a project as being completed and archived. STIP: Flags a project where funds needed to be allocated are allocated and final expenditures have been reported.
    * Assume archive = 1 means it's done. 

In [None]:
projects_df.columns

In [None]:
# Treat a missing archive flag as 0.
projects_df['archive'] = projects_df['archive'].fillna(0)

In [None]:
projects_df.archive.value_counts()

In [None]:
# Keep only the rows whose archive flag is not 1.
not_archived = projects_df['archive'] != 1
projects_df2 = projects_df.loc[not_archived]

#### TO DO: sort by lupdate

In [None]:
# Within each ctips_id, order rows so the one with the largest high_ver comes
# first; projectid and proj_desc only break ties. With high_ver as the last key
# (after projectid) it had little or no influence on which row the
# de-duplication on ctips_id in the next cell keeps.
# NOTE(review): assumes a larger high_ver marks the latest version of a
# project, per the notes at the top of this section — confirm.
projects_df2 = projects_df2.sort_values(
    by=['ctips_id', 'high_ver', 'projectid', 'proj_desc'],
    ascending=False,
)

In [None]:
# One row per ctips_id: keep the first row of each id in the current row order
# and renumber the index from 0.
projects_df2 = projects_df2.drop_duplicates(
    subset='ctips_id', keep='first', ignore_index=True
)

In [None]:
projects_df2.shape

In [None]:
projects_df2.loc[(projects_df2.ctips_id == 10600000223)]

In [None]:
projects_df2.ctips_id.nunique()

In [None]:
projects_df2.info()

In [None]:
projects_df2.loc[(projects_df2.title == "Grouped Projects For Bicycle And Pedestrian Facili") & (projects_df2.districtid == "11") & (projects_df2.agencyid == '6066')]

### PROJSCHE
* What's M010, can't find it in the data dictionary

In [None]:
projsche_df = pd.read_sql_query(""" 
SELECT *
FROM projsche
""", engine) 

In [None]:
projsche_df.shape

In [None]:
projsche_df.columns

In [None]:
projsche_df.projectid.nunique()

In [None]:
project_rename = {
     'm020': 'pa_ed_begin', 'm200a':'pa_ed_end', 'm200b':'ps_e_begin',
       'm224':'begin_row', 'm410':'end_row', 'm500':' con_start_date',
    'm600':'con_end_date', 'm700':'begin_closeout', 'm800':'end_closeout',
}

In [None]:
projsche_df = projsche_df.rename(columns = project_rename)

In [None]:
keep = ['projectid', 'pa_ed_begin','pa_ed_end',
       'ps_e_begin',  'begin_row', 'end_row', ' con_start_date',
       'con_end_date', 'begin_closeout', 'end_closeout'
      ]

In [None]:
projsche_df2 = projsche_df[keep]

In [None]:
projsche_df2.info()

In [None]:
# Drop the key column from the list so only the milestone columns remain.
# Rebuilding the list instead of calling list.remove keeps this cell safe to
# re-run: a second keep.remove('projectid') would raise ValueError.
keep = [col for col in keep if col != 'projectid']

In [None]:
projsche_df3 = projsche_df2.dropna( how = "all", subset = keep).reset_index(drop = True)

In [None]:
projsche_df3.info()

In [None]:
projsche_df3.sample(3)

In [None]:
pd.merge(projsche_df3, projects_df2, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# Attach the schedule milestones to each project; projects without a schedule
# row are kept (left join on projectid).
m1 = projects_df2.merge(projsche_df3, how='left', on='projectid')

In [None]:
len(projects_df2)

In [None]:
len(m1)

### AGENCY

In [None]:
agency_df = pd.read_sql_query(""" 
SELECT 
name AS agency_name,
agencyid
FROM agncy
""", engine) 

In [None]:
agency_df.shape

In [None]:
agency_df.head()

In [None]:
agency_df.agencyid.nunique()

In [None]:
pd.merge(m1, agency_df, on ='agencyid', how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# Add the agency name to each project row (left join on agencyid).
m2 = m1.merge(agency_df, how='left', on='agencyid')

### COUNTY

In [None]:
county_df = pd.read_sql_query(""" 
SELECT 
name AS county_name,
countyid
FROM county
""", engine) 

In [None]:
county_df.head()

In [None]:
county_df.shape

In [None]:
county_df.countyid.nunique()

In [None]:
m2.columns

In [None]:
# Add the county name to each project row (left join on countyid).
m3 = m2.merge(county_df, how='left', on='countyid')

In [None]:
m3.sample()

### FUNDLINE
* 1,090,000 rows
*  Why are there so many rows?

In [None]:
fundline_df = pd.read_sql_query(""" 
SELECT 
con,
pe_paed,
pe_env,
pe_rw,
pe_con,
projectid,
line_year,
actiondate
FROM fundline
""", engine) 

In [None]:
#fundline_df2 = pd.read_sql_query(""" 
#SELECT 
#*
#FROM fundline
#""", engine) 

In [None]:
# fundline_df2.loc[fundline_df2.projectid == 10900013875]

In [None]:
fundline_df.shape

In [None]:
fundline_df.head()

In [None]:
fundline_df.projectid.nunique()

#### Delete rows in which projectid is zero b/c none of the rows from `projects` have a projectid of 0.

In [None]:
fundline_df.projectid.value_counts().head()

In [None]:
fundline_df.info()

In [None]:
projectid_zero = fundline_df.loc[fundline_df.projectid == 0]

In [None]:
projectid_zero.sample(10)

In [None]:
projectid_zero.pe_paed.describe()

In [None]:
m3.loc[m3.projectid == 0].shape

In [None]:
# Discard fundline rows whose projectid is 0 and renumber the index.
has_project = fundline_df['projectid'] != 0
fundline_df2 = fundline_df[has_project].reset_index(drop=True)

In [None]:
fundline_df2.shape

In [None]:
fundline_df2.info()

#### Questions
* Why are there so many duplicates?
    * If I drop the duplicates across the dataframe for all rows, only 77,369 rows left out of 210,092.
* Using project 10900013875 as an example
    * How do I know which value to use for 2011 if there are so many of them?
    * Is there a column that signals which row is the most recently updated from 2011?
    * If I want this data to be one row for one project, can I sum up all the years together? 
    * Is there a way to find out which fund/program this pot of money is coming from. 
        * For construction in 2013, 5,000 comes from ATP, 10,000 comes from SB1, etc.

In [None]:
fundline_df2.projectid.value_counts().head()

In [None]:
# fundline_df3 = fundline_df2.drop_duplicates().sort_values(['line_year','con', 'pe_paed', 'pe_env', 'pe_rw', 'pe_con', 'actiondate'], ascending = False)

In [None]:
len(fundline_df2.drop_duplicates())

In [None]:
# Remove rows that are exact duplicates across all columns and renumber the index.
fundline_df3 = fundline_df2.drop_duplicates(ignore_index=True)

In [None]:
len(fundline_df3), len(fundline_df2)

In [None]:
fundline_df3.columns

In [None]:
sum_cols = ['con', 'pe_paed', 'pe_env', 'pe_rw', 'pe_con']

In [None]:
# Total each funding component over all fundline rows of one example project.
sample_project_lines = fundline_df3.loc[fundline_df3['projectid'] == 10900013875]
sample_project_lines.groupby(['projectid'])[sum_cols].sum()

In [None]:
fundline_df3.loc[fundline_df3.projectid == 10900013875].sort_values(by=['line_year'])

In [None]:
pd.merge(m3, fundline_df3, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()

### Political

In [None]:
political_df = pd.read_sql_query(""" 
SELECT 
assembly01,
ushouse01,
ssenate01,
projectid
FROM politcal
""", engine) 

In [None]:
political_df.shape

In [None]:
political_df.info()

In [None]:
political_df = political_df.dropna(how = "any")

In [None]:
political_df.info()

In [None]:
pd.merge(m3, political_df, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()

#### Change this to m4 later

In [None]:
# Add the political district columns to each project row (left join on projectid).
m5 = m3.merge(political_df, how='left', on='projectid')

In [None]:
m5.info()

### Fundtype
* Difference between fundid and fundsource?
* Can't find these columns in the data dictionary
    * p35
    * minor
* Difference between progcode and programid? 
* Programid links to  PROGMAIN
* Why are there so many fundtypeid that list essentially the same information? 

In [None]:
fundtype_df = pd.read_sql_query(""" 
SELECT 
*
FROM fundtype
""", engine) 

In [None]:
fundtype_df.head()

In [None]:
fundtype_df.projectid.nunique()

In [None]:
fundtype_df.fundtypeid.nunique()

In [None]:
fundtype_df.info()

In [None]:
# fundtype_df.projectid.value_counts()

In [None]:
fundtype_df.projectid.value_counts().describe()

#### See why there are so many rows for one projectid.
* Repeated info for every column except fundtypeid which has a unique value 

In [None]:
len(fundtype_df.drop(columns = ['fundtypeid']).drop_duplicates())

In [None]:
fundtype_df.drop(columns = ['fundtypeid']).loc[fundtype_df.projectid ==20700009251].drop_duplicates()

In [None]:
# Ignore fundtypeid, then collapse rows that are otherwise identical.
fundtype_without_id = fundtype_df.drop(columns='fundtypeid')
fundtype_df2 = fundtype_without_id.drop_duplicates(ignore_index=True)

In [None]:
len(fundtype_df2), len(fundtype_df)

#### Spelled out fund and program names

In [None]:
progmain_df = pd.read_sql_query(""" 
SELECT 
*
FROM progmain
""", engine) 

In [None]:
progmain_df.head()

In [None]:
progmain_df = progmain_df[['programid', 'category']]

In [None]:
progmain_df.shape

In [None]:
progmain_df = progmain_df.rename(columns = {'category':'program'})

In [None]:
progmain_df.programid.nunique()

In [None]:
fund_df = pd.read_sql_query(""" 
SELECT 
*
FROM fund
""", engine) 

In [None]:
fund_df2 = fund_df[['fund','fundid']]

In [None]:
fund_df2.shape

In [None]:
fund_df2.fund.nunique()

In [None]:
fund_df2.fund.value_counts().head(10)

In [None]:
fund_df2.loc[fund_df2.fund == "Advanced Transportation and Congestion Management"]

In [None]:
    fund_df2.fundid.nunique()

#### Merge

In [None]:
fundtype_df2.columns

In [None]:
progmain_df.columns

In [None]:
# Attach the program name (via programid) and the fund name (via fundid) to
# each fund-type row; rows without a match are kept.
fundtype_m1 = (
    fundtype_df2
    .merge(progmain_df, how="left", on="programid")
    .merge(fund_df2, how="left", on="fundid")
)

In [None]:
len(fundtype_m1), len(fundtype_df2)

In [None]:
# Columns retained from the merged fund-type table.
cols_to_keep = [
    'fundsource',
    'minor',
    'progcode',
    'projectid',
    'notes',
    'program',
    'fund',
]

In [None]:
fundtype_m2 = fundtype_m1[cols_to_keep]

#### Is there a way to tell how much money came from each fund?
* For the project below, 10,000 comes from demonstration fund, 4,000 comes from general funds, 20,000 comes from future funds

In [None]:
fundtype_m1.loc[fundtype_m1.projectid == 20600035929]

In [None]:
fundtype_m2.loc[fundtype_m2.projectid == 20600035929]

In [None]:
fundtype_m2.projectid.value_counts().describe()

In [None]:
fundtype_m2.program.value_counts()

In [None]:
fundtype_m2.program.unique()

In [None]:
 pd.merge(m5[['projectid']], fundtype_m2, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# This is a separate table: fund-type rows restricted to the projectids that
# are present in m5 (inner join on projectid).
project_ids = m5[['projectid']]
fundtype_final = project_ids.merge(fundtype_m2, how='inner', on='projectid')

In [None]:
# Order by projectid and renumber the index from 0.
fundtype_final = fundtype_final.sort_values(by='projectid', ignore_index=True)

In [None]:
fundtype_final.head(20)