In [None]:
import os
import re
import sys

import oracledb
import pandas as pd
import sqlalchemy

In [None]:
# Register python-oracledb under the module name "cx_Oracle", so any later
# `import cx_Oracle` resolves to oracledb, and make it report version 8.3.0.
# NOTE(review): presumably needed because the installed SQLAlchemy Oracle dialect
# imports cx_Oracle and checks its version - confirm against the SQLAlchemy version in use.
sys.modules["cx_Oracle"] = oracledb
oracledb.version = "8.3.0"

In [None]:
# Display settings: up to 100 columns, floats with two decimals,
# and no truncation of rows or of cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# SQLAlchemy dialect name used as the scheme of the connection URL.
DIALECT = "oracle"

In [None]:
# Connection settings are read from environment variables so that the notebook
# runs from a fresh kernel and no credentials are stored in it. A missing
# variable raises KeyError naming the variable.
USERNAME = os.environ["ORACLE_USERNAME"]
PASSWORD = os.environ["ORACLE_PASSWORD"]
HOST = os.environ["ORACLE_HOST"]
PORT = os.environ["ORACLE_PORT"]
SERVICE = os.environ["ORACLE_SERVICE"]

# SQLAlchemy URL of the form dialect://user:password@host:port/?service_name=...
# NOTE(review): the password is interpolated as-is; URL-escape it if it can
# contain characters such as "@" or "/".
ENGINE_PATH_WIN_AUTH = f"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}"

In [None]:
# SQLAlchemy engine built from the connection URL; every pd.read_sql_query call below uses it.
engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   

## Projects
* Each project has a unique row
* Use status_code to filter only for active projects

In [None]:
# Selected attributes of every row in `projects` whose status_code is 'Active'.
projects_sql = """ 
SELECT 
agency_code,
county_code,
comment_desc,
district_code, 
est_total_prj_costs,
location_name,
project_id, 
project_label_name,
project_planning_id,
project_category_type_code,
original_post_mile_begin_id,
original_post_mile_end_id,
revised_post_mile_begin_ind,
revised_post_mile_end_ind,
route_name,
state_hwy_ind,
senate_district_code,
work_type_code 
FROM projects 
WHERE status_code = 'Active'
"""
projects_df = pd.read_sql_query(sql = projects_sql, con = engine)

In [None]:
projects_df.shape

In [None]:
projects_df.head()

In [None]:
projects_df.project_id.nunique()

## Local Agencies
* To do: later, just use y/n for urban_area_code. It doesn't indicate which urban area the project is located in; I just want to know whether or not the project is in an urban area.

In [None]:
# Agency lookup: name, congressional district and urban area code per agency_code.
local_agencies_sql = """ 
SELECT 
agency_code,
agency_name,
congressional_district,
urban_area_code
FROM local_agencies
"""
local_agencies_df = pd.read_sql_query(sql = local_agencies_sql, con = engine)

In [None]:
local_agencies_df.shape

In [None]:
local_agencies_df.agency_name.nunique()

In [None]:
# Attach agency attributes to each project; the left join keeps projects with no matching agency.
projects_df2 = projects_df.merge(local_agencies_df, how = "left", on = "agency_code")

## Assembly Districts

In [None]:
# Columns carried into the assembly-district table.
subset = [
    'agency_code',
    'project_id',
    'project_label_name',
    'comment_desc',
    'location_name',
]

In [None]:
# Narrow projects_df2 to the columns listed in `subset`.
projects_df2_subset = projects_df2.loc[:, subset]

### Split this out to its own dataframe
* Or else the dataframe becomes 70,000+ rows because an agency can fall in multiple assembly districts.

In [None]:
# Full contents of the assembly_districts table.
assembly_sql = """ 
SELECT * FROM assembly_districts
"""
assembly_df = pd.read_sql_query(sql = assembly_sql, con = engine)

In [None]:
# Left join on agency_code: one output row per (project, matching assembly_df row) pair.
assembly_districts = projects_df2_subset.merge(assembly_df, how = "left", on = "agency_code")

## Counties


In [None]:
# County lookup: county_code -> county_name.
counties_sql = """ 
SELECT 
county_code,
county_name
FROM counties
"""
counties_df = pd.read_sql_query(sql = counties_sql, con = engine)

In [None]:
counties_df.shape

In [None]:
# Add county_name to each project; the left join keeps projects with no matching county.
projects_df3 = projects_df2.merge(counties_df, how = "left", on = "county_code")

In [None]:
# Drop the lookup keys now that agency and county attributes are attached.
# errors = "ignore" keeps this cell re-runnable: it rebinds projects_df3, so a
# second run would otherwise raise KeyError because the columns are already gone.
projects_df3 = projects_df3.drop(columns = ['agency_code', 'county_code'], errors = "ignore")

## Work Types

In [None]:
# Work type lookup: work_type_code -> work_type_desc.
work_types_sql = """ 
SELECT 
work_type_code,
work_type_desc
FROM work_types
"""
work_types_df = pd.read_sql_query(sql = work_types_sql, con = engine)

In [None]:
# Add work_type_desc to each project; the left join keeps projects with no matching work type.
projects_df4 = projects_df3.merge(work_types_df, how = "left", on = "work_type_code")

In [None]:
projects_df4[['project_category_type_code','work_type_code','work_type_desc']].sample(5)

## Project Category Type Codes

In [None]:
# Category lookup: category_code -> category_desc.
project_cat_sql = """ 
SELECT category_code, category_desc FROM project_category_type_codes
"""
project_cat_df = pd.read_sql_query(sql = project_cat_sql, con = engine)

In [None]:
# Add category_desc by matching project_category_type_code against category_code.
projects_df5 = projects_df4.merge(
    project_cat_df,
    how = "left",
    left_on = "project_category_type_code",
    right_on = "category_code",
)

In [None]:
projects_df5[['project_category_type_code','category_desc','work_type_code','work_type_desc']].sample(20)

In [None]:
# Drop the code columns now that their descriptions are attached.
# errors = "ignore" keeps this cell re-runnable: it rebinds projects_df5, so a
# second run would otherwise raise KeyError because the columns are already gone.
projects_df5 = projects_df5.drop(columns = ['project_category_type_code', 'work_type_code'], errors = "ignore")

In [None]:
# There are no merges when I use program_category_code for project_cat_df
#pd.merge(projects_df4, project_cat_df, left_on = "project_category_type_code",
#         right_on = "program_category_code", how = "outer", indicator = True)[['_merge']].value_counts()

## EA Number
* Projects can have multiple EA numbers.
* Although EA numbers are antiquated, I will still keep the most recent EA number if it's available because this might be useful to find the same project in a different system.

In [None]:
# Every expense authorization (EA) row with its project, district and assignment date.
ea_sql = """ 
SELECT 
project_id, 
district_code,
ea_assign_date, 
expense_authorization_id 
FROM expense_authorizations
"""
ea_df = pd.read_sql_query(sql = ea_sql, con = engine)

In [None]:
ea_df.shape, ea_df.project_id.nunique()

In [None]:
# Do an outer join to understand what's going on under the hood
# outer_join = pd.merge(ea_df, projects_df4, on = ['district_code','project_id'], how = "outer", indicator = True)

In [None]:
# outer_join[['_merge']].value_counts()

In [None]:
# Understand why there are more rows compared to project_ids that are unique
# Mult
# outer_join.loc[outer_join._merge == "both"][['project_id']].nunique()

In [None]:
# Inner merge: keep only the EA rows whose (district_code, project_id) pair
# also appears in projects_df4.
ea_df = projects_df4.merge(ea_df, how = "inner", on = ['district_code','project_id'])

In [None]:
# Columns to retain from the merged EA frame.
ea_og_cols = [
    'district_code',
    'expense_authorization_id',
    'project_id',
    'ea_assign_date',
]

In [None]:
# Discard the project columns pulled in by the merge; keep only the EA columns.
ea_df = ea_df.loc[:, ea_og_cols]

In [None]:
len(ea_df)

In [None]:
ea_df.project_id.value_counts().describe()

In [None]:
ea_df.project_id.value_counts().head()

In [None]:
# One EA number per (project_id, district_code): sort newest assignment first,
# then keep the first row of each pair, i.e. the most recent EA.
ea_newest_first = ea_df.sort_values(by = 'ea_assign_date', ascending = False)
ea_df2 = (ea_newest_first
          .drop_duplicates(subset = ['project_id','district_code'], keep = 'first')
          .drop(columns = 'ea_assign_date')
          .reset_index(drop = True)
         )

In [None]:
ea_df2.project_id.nunique()

In [None]:
ea_df2.expense_authorization_id.nunique()

#### The same EA number matches to multiple projects
* Understand why this is happening
* It seems like the same EA number matches multiple projects that have nothing in common.

In [None]:
ea_df2.expense_authorization_id.value_counts().head(20)

In [None]:
ea_df.loc[ea_df.expense_authorization_id == "924969"]

In [None]:
ea_df.loc[ea_df.expense_authorization_id == "924360"]

In [None]:
ea_df2.project_id.nunique() == len(ea_df2)

In [None]:
# Attach the retained EA number to each project; the left join keeps projects without an EA.
ea_keys_and_id = ea_df2[['project_id','district_code','expense_authorization_id']]
projects_df6 = projects_df5.merge(ea_keys_and_id, how = "left", on = ['district_code','project_id'])

In [None]:
len(projects_df6) == len(projects_df)

In [None]:
# Columns shown when previewing individual projects.
preview_cols = [
    'expense_authorization_id',
    'district_code',
    'est_total_prj_costs',
    'location_name',
    'project_id',
    'project_label_name',
    'project_planning_id',
    'senate_district_code',
    'agency_name',
    'congressional_district',
    'urban_area_code',
    'county_name',
    'work_type_desc',
    'category_desc',
]

* Shares the EA of 924360

In [None]:
projects_df6.loc[projects_df6.project_id == '0061(025)'][preview_cols]

## EFIS_MV_BUD_STRU_94_LVL_3_VW
* Filter out closed projects using a table sourced from Advantage.
* To do: send the list of duplicated project IDs.

In [None]:
# All columns of the EFIS budget-structure view, for rows whose pec_code contains '2030'.
efis_sql = """ 
SELECT *
FROM EFIS_MV_BUD_STRU_94_LVL_3_VW
WHERE pec_code LIKE '%2030%'
"""
efis_df = pd.read_sql_query(sql = efis_sql, con = engine)

In [None]:
efis_df.shape

In [None]:
efis_df.pec_code.value_counts()

In [None]:
efis_df.adv_project_id.nunique()

In [None]:
efis_df.adv_project_id.value_counts().sample(5)

In [None]:
# Column names of the EFIS view as a plain list.
efis_df_column = efis_df.columns.tolist()

In [None]:
len(efis_df_column)

In [None]:
# EFIS columns kept for the summaries further down.
efis_df_column_subset = [
    'adv_project_id',
    'phase_code',
    'fund_code',
    'pec_code',
    'pect_task_code',
    'appropriation_category_code',
    'orig_bud_am',
    'curr_bud_am',
    'cash_exp_am',
    'enc_am',
]

### Efis Join
* Use project_status_code: a value of 9* (starting with 9) means the project has been closed.
* 7 is in final voucher but not completed. 
* Use wildcard to eliminate all the 9's.

In [None]:

# Mapping between Advantage project ids and project ids, without closed projects.
# Closed projects have a project_status_code of the form 9*, so the pattern is
# anchored at the start ('9%'). The previous '%9%' also excluded any status that
# merely contained a 9 somewhere.
# Note: in SQL, NOT LIKE also filters out rows whose project_status_code is NULL.
efis_join_df = pd.read_sql_query(""" 
SELECT adv_project_id,
project_id,
project_status_code
FROM EFIS_MV_R_PROG_VW
WHERE project_status_code NOT LIKE '9%'
""", engine) 

In [None]:
efis_join_df.shape

In [None]:
efis_join_df.project_status_code.value_counts()

In [None]:
efis_join_df.project_id.nunique()

In [None]:
efis_join_df.project_id.value_counts().head()

In [None]:
# projects_df6.loc[projects_df6.project_id == '5405(077)']

* Not everything has merged

In [None]:
efis_df.shape

In [None]:
efis_join_df.shape

In [None]:
pd.merge(efis_df, efis_join_df, on = ['adv_project_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# Number of rows the outer merge has beyond efis_df. The outer-merge size is
# computed here instead of being typed in from a previous output (the old
# 26434+18576+22 appears to be the three _merge counts), so the figure stays
# correct when the underlying tables change.
f"{len(pd.merge(efis_df, efis_join_df, on = ['adv_project_id'], how = 'outer')) - len(efis_df)} more rows"

In [None]:
# Keep only EFIS rows whose adv_project_id also appears in efis_join_df.
efis_m1 = efis_df.merge(efis_join_df, how = 'inner', on = ['adv_project_id'])

#### Merge Efis with a subset of projects
* Because this data needs to be manipulated a little bit first

In [None]:
# Project columns carried into the Advantage/EFIS merge.
projects_df_subset_cols = [
    'project_label_name',
    'project_id',
    'work_type_desc',
    'district_code',
    'category_desc',
    'location_name',
]

In [None]:
# Narrow projects_df6 to the columns listed in projects_df_subset_cols.
projects_df_subset = projects_df6.loc[:, projects_df_subset_cols]

In [None]:
pd.merge(efis_m1, projects_df_subset, on = ['project_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# Keep only EFIS rows whose project_id also appears in the project subset.
advantage_m = efis_m1.merge(projects_df_subset, how = 'inner', on = ['project_id'])

In [None]:
advantage_m.project_id.nunique()

In [None]:
advantage_m.pec_code.value_counts()

In [None]:
advantage_m.fund_code.value_counts()

In [None]:
advantage_m.sample()

#### Bring in pec_code/fund_code
* Section 4 https://accounting.onramp.dot.ca.gov/manual/7-program-codes

In [None]:
# Load the Section 4 program-code workbook; the path is relative, so the file is
# looked up in the notebook's current working directory.
section_4 = pd.read_excel('Section 4 Program Codes.xlsx')

In [None]:
# Split each 'Section 4 ' entry at its last hyphen: the text before it goes to
# `code`, the text after it to `definition1`.
code_and_definition = section_4['Section 4 '].str.rsplit(pat = '-', n = 1, expand = True)
section_4[['code', 'definition1']] = code_and_definition

In [None]:
# Just drop the rows that don't play nice for now (no hyphen, so no definition).
# .copy() makes section_5 an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning or touch section_4.
section_5 = section_4.dropna(subset = ['definition1']).copy()

In [None]:
section_5.info()

In [None]:
# Break the dotted program code into up to four parts (code1..code4).
# The previous rsplit('.', n=1) produced at most two columns, so assigning it to
# four column names always raised ValueError.
# NOTE(review): this splits the `code` column (text before the last hyphen)
# rather than the raw 'Section 4 ' text, so the definition does not end up in
# code4 - confirm that this is the intended source.
code_parts = (section_5['code']
              .str.strip()
              .str.split('.', n = 3, expand = True)
              .reindex(columns = range(4))  # always four columns, NaN where a code has fewer parts
             )
# assign() returns a new frame and overwrites code1..code4 if they already
# exist, so the cell can be re-run safely.
section_5 = section_5.assign(**{f'code{i + 1}': code_parts[i] for i in range(4)})

In [None]:
section_4

#### Summarize information
##### Sum up everything since this is all single phase anyway

In [None]:
# Put project_id first in the column subset. The guard keeps the cell
# re-runnable: an unconditional insert would add a second 'project_id' on each
# run and produce duplicated columns in the selection that follows.
if 'project_id' not in efis_df_column_subset:
    efis_df_column_subset.insert(0,'project_id')

In [None]:
# Keep only the columns listed in efis_df_column_subset.
advantage_m = advantage_m.loc[:, efis_df_column_subset]

In [None]:
# Sum up everything since this is all single phase anyway:
# one row per (project_id, fund_code, pec_code) with the summed amounts.
# NOTE(review): total_encumbrance_amt is the sum of orig_bud_am, not of enc_am,
# which is also available in advantage_m - confirm this is the intended column.
lp2000_phase = (advantage_m
                  .groupby(['project_id','fund_code','pec_code'])
                  .agg(total_encumbrance_amt = ('orig_bud_am', 'sum'),
                       total_expenditure_amt = ('cash_exp_am', 'sum'))
                  .reset_index()
                  )

In [None]:
lp2000_phase.shape

In [None]:
lp2000_phase.head()

In [None]:
lp2000_phase.project_id.nunique()

##### Summarize the program info
* Is the year considered the "awarded" year?

In [None]:
advantage_m.sample()

In [None]:
# One row per (project_id, pec_code), carrying the largest
# appropriation_category_code of the group under the name `year`.
lp2000_awards1 = (advantage_m
                  .groupby(['project_id', 'pec_code'])['appropriation_category_code']
                  .max()
                  .rename('year')
                  .reset_index()
                  .drop_duplicates(subset = ['project_id','pec_code'])
                  )

In [None]:
lp2000_awards1.shape

In [None]:
lp2000_awards1.project_id.value_counts().head()

In [None]:
lp2000_awards1.loc[lp2000_awards1.project_id == "5182(058)"]

In [None]:
# One row per (project_id, fund_code), carrying the largest
# appropriation_category_code of the group under the name `year`.
lp2000_awards2 = (advantage_m
                  .groupby(['project_id', 'fund_code'])['appropriation_category_code']
                  .max()
                  .rename('year')
                  .reset_index()
                  .drop_duplicates(subset = ['project_id','fund_code'])
                  )

In [None]:
lp2000_awards2.shape

In [None]:
lp2000_awards2.project_id.value_counts().head()

In [None]:
lp2000_awards2.loc[lp2000_awards2.project_id == "5182(058)"]

#### Understand why some advantage ids have multiple projects
* Older projects may have multiple project IDs
* However, it should be one to one. 
* These are exceptions, glitches from migrating to a new system 
* Anything with 3-4 zeroes are older, should have been converted back in early 2010's.
* These duplicated projects in advantage are essentially closed. 
* They should be closed in LP2000.
* Send them a list of all the projects that have duplicated advantage ids to close.

In [None]:
# The ten Advantage ids linked to the most distinct project ids.
# Uses efis_m1 (EFIS rows joined to their project ids); `efis_m2` is not defined
# anywhere in this notebook, so the cell failed on a fresh kernel.
(efis_m1
 .groupby(['adv_project_id',])
 .agg({'project_id':'nunique'})
 .sort_values(['project_id'], ascending = False)
 .head(10)
)

#### 1100000058

In [None]:
# efis_m2[efis_m2.adv_project_id == '1100000058'][efis_df_column_subset]

In [None]:
# Distinct project ids attached to Advantage id 1100000092.
# Uses efis_m1; `efis_m2` is not defined anywhere in this notebook.
efis_m1[efis_m1.adv_project_id == '1100000092'][['adv_project_id','project_id']].drop_duplicates()

In [None]:
projects_df_subset[projects_df_subset.project_id == '5004(140)']

In [None]:
projects_df_subset[projects_df_subset.project_id == '6211(093)']

## Find Repeated Projects
* I noticed project ids are the same, except for the stuff in the parentheses

In [None]:
# Delete out any extremely vague location names/project_id2 that just say "county" or "city"
# repeated_projs3 = (repeated_projs2[(repeated_projs2.location_name.str.contains('City|County|Programming')==False) & (repeated_projs2.location_name_length > 4)]).reset_index()

In [None]:
# Every column of `projects` for rows whose status_code is 'Active'.
projects_og_sql = """ 
SELECT *
FROM projects 
WHERE status_code = 'Active'
"""
projects_df_og = pd.read_sql_query(sql = projects_og_sql, con = engine)

In [None]:
# Start from every column of the full active-projects frame.
# (`projects_df_g` is not defined anywhere; the frame loaded above is projects_df_og.)
subset_to_drop = list(projects_df_og.columns)

In [None]:
# Drop all duplicates for all columns except the few that follow: the project id
# and the audit columns are ignored when deciding whether two rows are duplicates.
# A single filter replaces five separate list.remove() cells, each of which
# raised ValueError when re-run because the name had already been removed.
dedup_ignored_cols = ['project_id', 'updater_user_name', 'update_date_time',
                      'create_user_name', 'create_date_time']
subset_to_drop = [col for col in subset_to_drop if col not in dedup_ignored_cols]

In [None]:
# Remove duplicated projects: rows that agree on every column in subset_to_drop
# are collapsed to their first occurrence.
# (`projects_df_g` is not defined anywhere; the frame loaded above is projects_df_og.)
projects_no_dups = projects_df_og.drop_duplicates(subset=subset_to_drop).reset_index(drop = True)

In [None]:
len(projects_no_dups)

In [None]:
# Find common columns for merging: the de-duplication columns that also exist in
# projects_df6, the frame merged against projects_no_dups below.
# `filtered_projects` is not defined anywhere in this notebook. Iterating over
# subset_to_drop instead of intersecting sets also keeps the column order stable
# between runs.
common_cols = [col for col in subset_to_drop if col in projects_df6.columns]

In [None]:
# project_id is also a merge key. The guard keeps the cell re-runnable: an
# unconditional insert would add a duplicate key on every run.
if 'project_id' not in common_cols:
    common_cols.insert(0,'project_id')

In [None]:
# Outer merge of projects_df6 with the de-duplicated frame; the _merge indicator
# column records which side each row came from.
projects_m1 = projects_df6.merge(projects_no_dups, how = "outer", on = common_cols, indicator = True)

In [None]:
projects_m1._merge.value_counts()

In [None]:
# Rows found only on the left side of the merge, i.e. the rows that
# de-duplication removed. Look at these to analyze the duplicates.
# How to find the ones it is exactly like??
left_only_mask = projects_m1['_merge'] == "left_only"
duplicate_rows = projects_m1.loc[left_only_mask]

In [None]:
duplicate_rows.project_id.nunique()

In [None]:
# Distinct project ids among the dropped rows, used to look them up in the original dataframe.
dup_projects = list(duplicate_rows['project_id'].unique())

In [None]:
# Columns for previewing duplicated projects. 'project_label_name' was listed
# twice, which would duplicate that column in any frame selected with this list.
subset_preview = ['project_label_name', 'district_code', 'est_total_prj_costs', 'location_name',
       'project_id', 'comment_desc',
      ]

In [None]:
# Full original rows for every project id flagged as a duplicate, ordered so that
# rows with the same label sit next to each other.
is_dup_project = projects_df_og['project_id'].isin(dup_projects)
duplicate_projects = projects_df_og.loc[is_dup_project].sort_values(by = ['project_label_name','project_id'])

In [None]:
# duplicate_projects.to_excel("./duplicated_projects_LP2000.xlsx", sheet_name="Sheet_name_1", index=False)