In [None]:
import os
import sys

import oracledb
import pandas as pd
import sqlalchemy

In [None]:
# Register the oracledb module in sys.modules under the key "cx_Oracle", so any
# later `import cx_Oracle` resolves to oracledb, and override the version string
# oracledb reports.
# NOTE(review): presumably this lets SQLAlchemy's cx_Oracle-based Oracle dialect
# run on top of oracledb, with the version override satisfying that dialect's
# minimum-version check — confirm against the installed SQLAlchemy version.
oracledb.version = "8.3.0" 
sys.modules["cx_Oracle"] = oracledb 

In [None]:
# Notebook-wide pandas display settings, all set through the same API:
# show up to 100 columns, format floats with two decimals, and never
# truncate the number of rows or the text inside a cell.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
DIALECT = 'oracle'  

In [None]:
# Connection settings are read from environment variables so that credentials are
# never stored in the notebook. A missing variable raises KeyError right here
# instead of failing later inside the database driver.
USERNAME = os.environ["ORACLE_USERNAME"]
PASSWORD = os.environ["ORACLE_PASSWORD"]
HOST = os.environ["ORACLE_HOST"]
PORT = os.environ["ORACLE_PORT"]
SERVICE = os.environ["ORACLE_SERVICE"]

# The values are interpolated verbatim into the URL, so a username or password
# containing URL-reserved characters (e.g. "@", "/", ":") must already be
# percent-encoded in the environment variable.
ENGINE_PATH_WIN_AUTH = f"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}"

In [None]:
engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   

## Projects
* Each project has a unique row
* Use status_code to filter only for active projects

In [None]:
projects_df = pd.read_sql_query(""" 
SELECT 
agency_code,
county_code,
comment_desc,
district_code, 
est_total_prj_costs,
location_name,
project_id, 
project_label_name,
project_planning_id,
project_category_type_code,
original_post_mile_begin_id,
original_post_mile_end_id,
revised_post_mile_begin_ind,
revised_post_mile_end_ind,
route_name,
state_hwy_ind,
senate_district_code,
work_type_code 
FROM projects 
WHERE status_code = 'Active'
""", engine) 

In [None]:
projects_df.shape

In [None]:
projects_df.head()

In [None]:
projects_df.project_id.nunique()

## Local Agencies

In [None]:
local_agencies_df = pd.read_sql_query(""" 
SELECT 
agency_code,
agency_name,
congressional_district,
urban_area_code
FROM local_agencies
""", engine) 

In [None]:
local_agencies_df.shape

In [None]:
local_agencies_df.agency_name.nunique()

In [None]:
projects_df2 = pd.merge(projects_df, local_agencies_df, on = "agency_code", how = "left")

## Counties


In [None]:
counties_df = pd.read_sql_query(""" 
SELECT 
county_code,
county_name
FROM counties
""", engine) 

In [None]:
counties_df.shape

In [None]:
projects_df3 = pd.merge(projects_df2, counties_df, on = "county_code", how = "left")

In [None]:
# The join keys are dropped once the agency and county tables have been merged in.
# errors="ignore" keeps this cell re-runnable: because it overwrites projects_df3,
# a second run finds the columns already gone and a plain drop would raise KeyError.
projects_df3 = projects_df3.drop(columns = ['agency_code', 'county_code'], errors = "ignore")

## Work Types

In [None]:
work_types_df = pd.read_sql_query(""" 
SELECT 
work_type_code,
work_type_desc
FROM work_types
""", engine) 

In [None]:
projects_df4 = pd.merge(projects_df3, work_types_df, on = "work_type_code", how = "left")

In [None]:
projects_df4.sample(3)

In [None]:
projects_df4.columns

In [None]:
projects_df4[['project_planning_id','district_code']].sample(1)

## EA Number
Columns of interest in `expense_authorizations`:
* project_id
* district_code
* ea_assign_date
* expense_authorization_id

In [None]:
ea_df = pd.read_sql_query(""" 
SELECT 
* FROM expense_authorizations
""", engine) 

In [None]:
ea_df.shape, ea_df.project_id.nunique()

In [None]:
ea_df.columns

In [None]:
# join with only the relevant projects 1st
outer_join = pd.merge(ea_df, projects_df4, on = ['district_code','project_id'], how = "outer", indicator = True)

In [None]:
outer_join[['_merge']].value_counts()

In [None]:
outer_join.loc[outer_join._merge == "both"][['project_id']].nunique()

In [None]:

ea_df = pd.merge(projects_df4, ea_df, on = ['district_code','project_id'], how = "inner")

In [None]:
ea_og_cols = ['district_code', 'expense_authorization_id', 'project_id',
       'multi_phase_ind', 'ea_assign_date', 'updater_user_name',
       'last_digit_validation_no', 'ea_comments']

In [None]:
ea_df = ea_df[ea_og_cols]

In [None]:
len(ea_df)

In [None]:
ea_df.project_id.value_counts().describe()

In [None]:
ea_df.project_id.value_counts().head()

In [None]:
ea_df.loc[ea_df.project_id == "5932(042)"].sort_values(['ea_assign_date'], ascending = False)

In [None]:
# ea_df.loc[ea_df.project_id == "5006(504)"].sort_values(['ea_assign_date'], ascending = False)

In [None]:
# ea_df = ea_df.sort_values(['ea_assign_date'], ascending = False).reset_index(drop = True)

In [None]:
ea_df.project_id.nunique()

In [None]:
ea_df2.project_id.nunique()

In [None]:
ea_df2.expense_authorization_id.nunique()

In [None]:
ea_df2.expense_authorization_id.value_counts().head()

In [None]:
ea_df.loc[ea_df.expense_authorization_id == "924969"]

In [None]:
ea_df.loc[ea_df.expense_authorization_id == "924360"]

In [None]:
# Keep only the most recent EA number
# Only one EA number per project
ea_df2 = (ea_df
          .sort_values(['ea_assign_date'], ascending = False)
          .drop_duplicates(subset=['project_id','district_code'])
          .drop(columns = ['ea_assign_date'])
          .reset_index(drop = True)
         )

In [None]:
ea_df2.project_id.nunique() == len(ea_df2)

In [None]:
ea_df2.loc[ea_df2.project_id == "5006(504)"]

In [None]:
# Merge with project
projects_df5 = pd.merge(projects_df4, ea_df2, on = ['district_code','project_id'], how = "left")

In [None]:
len(projects_df5) == len(projects_df)

In [None]:
projects_df5.loc[projects_df5.project_id == "5006(504)"]

## Agreement & Fund Program Year
* How does this differ from the Finance Letter?
* What does `program_code` stand for?
* What's the difference between `agreements.program_year` and `fund_programed_amounts.fund_program_year`?

In [None]:
agreement_df = pd.read_sql_query(""" 
SELECT 
program_year,
prefix_project_id,
program_code 
FROM agreements
WHERE program_year IS NOT null
""", engine) 

In [None]:
agreement_df.head()

In [None]:
agreement_df[['prefix', 'project_id']] = agreement_df.prefix_project_id.str.rsplit('-', n=1, expand=True)

In [None]:
agreement_df.shape, agreement_df.prefix_project_id.nunique()

In [None]:
agreement_df.prefix.value_counts().head()

In [None]:
agreement_df.program_code.value_counts()

### Fund Programmed Amounts
* what do the acronyms stand for?

In [None]:
fund_program_df = pd.read_sql_query(""" 
SELECT *
FROM fund_programed_amounts
""", engine) 

In [None]:
fund_program_df.project_id.nunique(), fund_program_df.shape

In [None]:
og_fund_cols = list(fund_program_df.columns)

In [None]:
# Keep only the fund rows whose project appears in projects_df5 (the projects we care about).
# A membership filter is used instead of an inner merge: it can never duplicate a
# fund row when a project_id matches more than once, and it leaves fund_program_df's
# column names untouched (a merge would suffix any overlapping names).
fund_program_df2 = (fund_program_df
                    .loc[fund_program_df.project_id.isin(projects_df5.project_id)]
                    .reset_index(drop = True)
                   )

In [None]:
fund_program_df2.project_id.nunique()

In [None]:
fund_program_df2.project_id.value_counts().head()

In [None]:
fund_program_df2 = fund_program_df2[og_fund_cols]

In [None]:
fund_program_df2.phase_id.unique()

In [None]:
fund_program_df2.head()

#### Test with one project

In [None]:
one_project = fund_program_df2.loc[fund_program_df2.project_id == "5907(014)"]

In [None]:
# NOTE(review): test_group2 is first assigned much further down, in the "Draft"
# section (built from finance_letters_df). On a fresh kernel this cell raises
# NameError unless that later section has already been run.
test_group2.loc[test_group2.project_id == "5907(014)"]

In [None]:
one_project.ctips_id.nunique()

In [None]:
one_project.ctips_project_id.nunique()

In [None]:
one_project.groupby(['project_id','phase_id','program_code',]).agg({'programmed_amt':'sum'}).reset_index()

In [None]:
one_project

#### Merge and compare fund programmed amounts with agreement 

In [None]:
pd.merge(fund_program_df2, agreement_df, left_on = ['project_id'], right_on = ['project_id'], how = "outer", indicator = True)[['_merge']].value_counts()

In [None]:
m1 = pd.merge(fund_program_df2, agreement_df,  left_on = ['project_id','fund_program_year'], right_on = ['project_id','program_year'],how = "left", indicator = True)

In [None]:
m1.loc[m1._merge == "both"].head()

In [None]:
# pd.merge(agreement_df, projects_df5, on = ['project_id'], how = "outer", indicator = True)[['_merge']].value_counts()

In [None]:
# projects_df6 = pd.merge(projects_df5, agreement_df, on = ['project_id'], how = "left")

In [None]:
# projects_df6.program_year.value_counts()

## EFIS_MV_BUD_STRU_94_LVL_3_VW
* How to link this to `projects`?

In [None]:
efis_df = pd.read_sql_query(""" 
SELECT *
FROM EFIS_MV_BUD_STRU_94_LVL_3_VW
""", engine) 

In [None]:
# This is not working 
efis_join_df = pd.read_sql_query(""" 
SELECT * FROM EFIS_MV_R_PROJECTS_VW
""", engine) 

In [None]:
efis_df.shape

In [None]:
efis_df.ppno.nunique()

In [None]:
efis_df.adv_project_id.nunique()

In [None]:
efis_df.head()

In [None]:
pd.merge(efis_df, projects_df5, left_on = ['adv_project_id'], right_on = ['project_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

## Draft
### Project Letter Components 
* Might not need to add this?

In [None]:
#project_letter_df = pd.read_sql_query(""" 
#SELECT finance_letter_item_code, item_code_desc FROM finance_letter_components
#""", engine) 

In [None]:
#project_letter_df

### Project Finance Letters
* What do all the item codes mean?
* What differentiates all the finance_header from each other?

In [None]:
finance_letters_df = pd.read_sql_query(""" 
SELECT 
project_id, field_order_seq, fin_ltr_seq,
       finance_letter_item_code, finance_header1, finance_amt1,
       finance_header2, finance_amt2, finance_header3, finance_amt3,
       finance_header4, finance_amt4, finance_header5, finance_amt5,
       finance_header6, finance_amt6, finance_header7, finance_amt7,
       finance_header8, finance_amt8, finance_header9, finance_amt9,
       finance_header10, finance_amt10, finance_header11,
       finance_amt11, finance_header12, finance_amt12,
       finance_header13, finance_amt13, total_cost_work_amt,
       total_participation_cost_amt, fed_reimbursement, finance_header14,
       finance_amt14, finance_header15, finance_amt15,
       prorata_or_lumpsum, create_date, finance_header16,
       finance_amt16, finance_header17, finance_amt17,
       finance_header18, finance_amt18, finance_header19,
       finance_amt19, finance_header20, finance_amt20
FROM project_finance_letters
""", engine) 

In [None]:
# finance_letters_df.columns

In [None]:
# ['project_finance_letters.' + s for s in list(finance_letters_df.columns)]

In [None]:
finance_letters_df.project_id.nunique()

In [None]:
finance_letters_df.project_id.value_counts().head(10)

In [None]:
finance_letters_df.finance_header18.value_counts()

In [None]:
finance_letters_df.finance_header10.nunique()

#### Test to see the difference between total_cost_work_amt and summing all finance_amt columns

In [None]:
test_projects = ["6211(131)","5133(035)", "5003(003)"]

In [None]:
# test_df = finance_letters_df.loc[finance_letters_df.project_id.isin(test_projects)].reset_index(drop = True)

* Goal: understand the relationship between the finance_amt columns, total_cost_work_amt, and total_participation_cost_amt.
* It seems that the finance_amt columns summed together equal total_cost_work_amt.

In [None]:
# Build the all_finance_amt_summed column that the grouped comparison below aggregates.
finance_letter_columns = list(finance_letters_df.columns)

# Match on the prefix rather than a substring: 'all_finance_amt_summed' itself
# contains the text 'finance_amt', so a substring test would fold the previous
# total into the new one whenever this cell is re-run.
finance_amts = [col for col in finance_letter_columns if col.startswith('finance_amt')]

finance_letters_df['all_finance_amt_summed'] = finance_letters_df[finance_amts].sum(axis=1)

In [None]:
# finance_letters_df.shape,finance_letters_df.fin_ltr_seq.nunique()

In [None]:
test_grouped = (finance_letters_df
                .groupby(['project_id','finance_letter_item_code','fin_ltr_seq'])
                .agg({'all_finance_amt_summed':'max', 'total_cost_work_amt':'max','total_participation_cost_amt':'max'})
                .reset_index())

In [None]:
test_grouped['finance_amt_v_to_total_cost'] = (test_grouped.all_finance_amt_summed - test_grouped.total_cost_work_amt).fillna(0).astype(int)

In [None]:
test_grouped['finance_amt_v_to_parti'] = test_grouped.all_finance_amt_summed - test_grouped.total_participation_cost_amt

In [None]:
test_grouped.finance_amt_v_to_total_cost.value_counts()

In [None]:
test_grouped.loc[test_grouped.finance_amt_v_to_total_cost == 28460]

In [None]:
finance_letters_df.project_id.value_counts().head()

In [None]:
# test_df.loc[test_df.fin_ltr_seq == 92475].T

* Is it ok to sum up by project_id and phase to get the total amount?
* How to incorporate program information?
    * Explode finance_header stuff, separate out by commas, and remove duplicates?
* Safe to filter out "BLANK" values in the finance_letter_item_code column?
* Group columns together. Ex: Right of Way is separated into a few different columns.

In [None]:
# Read the column names straight from the frame so this cell does not depend on a
# helper list created in another cell.
finance_headers = [col for col in finance_letters_df.columns if 'finance_header' in col]

In [None]:
def delete_repeated_element(df, col: str):
    """
    Remove repeated elements from a comma-delimited string column.

    Each value in ``df[col]`` is split on commas, surrounding whitespace is
    stripped from every element, repeats are dropped, and the remaining
    elements are re-joined with ", ". The first occurrence of each element
    is kept in its original position, so the result is the same on every run.

    Ex: the column "grocery_list" has "apples, cherries, cheese, apples".
    The result is "apples, cherries, cheese".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean. It is modified in place.
    col : str
        Name of the column of comma-delimited strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[col]`` rewritten.

    Notes
    -----
    Values that are not strings (for example missing values) are left unchanged.
    """
    def _dedupe(value):
        # Non-string cells (NaN/None) have no .split(); pass them through untouched.
        if not isinstance(value, str):
            return value
        elements = [element.strip() for element in value.split(",")]
        # dict.fromkeys drops repeats while preserving first-seen order, unlike
        # set(), whose iteration order for strings can differ between runs.
        return ", ".join(dict.fromkeys(elements)).strip()

    df[col] = df[col].apply(_dedupe)
    return df

In [None]:
# test_df['program'] = test_df[finance_headers].apply(lambda row: ','.join(row.values.astype(str)), axis=1)

In [None]:
# test_df = delete_repeated_element(test_df, 'program')

* Why are some columns blank? Can I just delete them?

In [None]:
test_group2 = (finance_letters_df
               .groupby(['project_id','finance_letter_item_code'])
               .agg({'total_cost_work_amt':'sum'})
               .reset_index()
              )

In [None]:
test_group2 = (test_group2
               .pivot(index = ['project_id'],columns = ['finance_letter_item_code'], 
                      values=["total_cost_work_amt"])
               .reset_index()
              )

In [None]:
test_group2.columns = test_group2.columns.droplevel(0)

In [None]:
test_group2.columns

In [None]:
# Label the first column (the reset project_id index) by position and keep the
# remaining pivoted item-code labels exactly as they are. Hard-coding every item
# code raises on a length mismatch, or silently mislabels columns, as soon as the
# set of finance_letter_item_code values in the data changes.
test_group2.columns = ['project_id', *test_group2.columns[1:]]

In [None]:
test_group2.BLANK.value_counts()

In [None]:
finance_letters_df.loc[finance_letters_df.project_id == "5133(035)"][['finance_letter_item_code']].value_counts()

In [None]:
test_group2.loc[test_group2.project_id == "5133(035)"]

In [None]:
rwaq_test = finance_letters_df[(finance_letters_df.project_id == "5133(035)") & (finance_letters_df.finance_letter_item_code == 'RWAQ')][['total_cost_work_amt']].sum()

In [None]:
14418598.00 == rwaq_test