# Data Prep for ATP 

**Duplicate notebook using concat method**

* manual cleaning needed outside of notebook for funding data

In [2]:
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from dla_utils import _dla_utils
from siuba import *

import _data_cleaning

ModuleNotFoundError: No module named '_data_cleaning'

In [None]:
pd.set_option("display.max_columns",500)

* some `a1_imp_agcy_fed_ma_num` values are not present
* merge on `project_app_id` instead


* need function for reading in funding data and which projects get selected for funding
* sheets of Master_AllData that we need:
    * Master_Yes
    * Statewide SUR Details (merge with SUR Funding)
    * Statewide SUR Funding (merge with SUR Details) 
* using a copy of the data to account for multiple headers

In [None]:
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/dla/atp/'


## Read in Master Data

In [None]:
# reading the clean data (from atp script)
#df = _data_cleaning.read_clean_data()

In [None]:
#df.sample()

In [None]:
def read_app_data():
    """
    Load the ATP application records from the `Master_Yes` sheet.

    Reads the Cycle 5 field-mapping workbook under `GCS_FILE_PATH`, snake-cases
    the column names, and strips the contact columns, the (empty) funding
    columns and the (empty) identifier columns. The returned DataFrame can then
    be combined with the funded data.
    """
    workbook = f'{GCS_FILE_PATH}Master_AllData_Cycle5_Field_Mapping_COPY.xls'

    # contact information columns that must not be carried forward
    contact_cols = [
        'a1_imp_agcy_contact', 'a1_imp_agcy_email', 'a1_imp_agcy_phone',
        'a1_proj_partner_contact', 'a1_proj_partner_email', 'a1_proj_partner_phone',
    ]
    # identifier columns that are fully null in this sheet
    # (they are populated in the funding data instead)
    empty_id_cols = ['#', 'atp_id', 'ppno', 'ppno_1']

    # header=[2]: the column names are taken from row index 2 of the sheet
    raw = pd.read_excel(workbook, sheet_name='Master_Yes', header=[2])
    apps = to_snakecase(raw).drop(columns=contact_cols)

    # this sheet has the funding columns but no information in them;
    # funding data starts at `original_prog__amt___pa_ed_`, so drop that
    # column and every column to its right
    first_funding_pos = apps.columns.get_loc('original_prog__amt___pa_ed_')
    funding_col_names = list(apps.columns[first_funding_pos:])
    apps = apps.drop(columns=funding_col_names)

    return apps.drop(columns=empty_id_cols)

In [None]:
## can use the version provided or the one we cleaned in the cleaning script
master_data2= read_app_data()

In [None]:
master_data =  _data_cleaning.read_clean_data()


In [None]:
master_data.head()

In [None]:
len(list(master_data.columns))

## Function to read SUR funding data

In [None]:
def read_SUR_funding_data():
    """
    Read in the ATP SUR funding data. Function will need to change for future data.

    Loads the `Statewide SUR Details` and `Statewide SUR Funding` sheets of the
    Cycle 5 field-mapping workbook under `GCS_FILE_PATH`, cleans the funding
    sheet, merges the two sheets and returns only the rows found in both,
    tagged with `data_origin = "Funded"`.

    Notes:
    * `atp_id` columns appear the same but the sur_details has an extra zero in
      the middle of the string so it would not match
    * `a3_project_type` also is entered differently; the details sheet has more
      detail than the funding sheet (information on size of project) and can be
      added to a new column
    * `a1_imp_agcy_name_x` has manual errors so selecting `a1_imp_agcy_name_y`

    Returns:
        pandas.DataFrame with one row per project present in both sheets.
        Columns duplicated across the sheets keep pandas' `_x` (details) /
        `_y` (funding) suffixes, except for the ones renamed below.
    """
    workbook = f'{GCS_FILE_PATH}Master_AllData_Cycle5_Field_Mapping_COPY.xls'
    details_sheet = 'Statewide SUR Details'
    funding_sheet = 'Statewide SUR Funding'

    # contact information columns that must not be carried forward
    contact_cols = [
        'a1_imp_agcy_contact', 'a1_imp_agcy_email', 'a1_imp_agcy_phone',
        'a1_proj_partner_contact', 'a1_proj_partner_email', 'a1_proj_partner_phone',
    ]
    # position (after the contact columns are removed) of the first trailing
    # SUR Details column that has a header but no funding data entered
    first_empty_details_pos = 199

    # read both sheets in one call so the workbook is fetched and opened once
    sheets = pd.read_excel(workbook, sheet_name=[details_sheet, funding_sheet])
    sur_details = to_snakecase(sheets[details_sheet]).drop(columns=contact_cols)
    sur_funding = to_snakecase(sheets[funding_sheet])

    # drop the last few columns of SUR Details that have no funding data entered
    empty_details_cols = list(sur_details.columns[first_empty_details_pos:])
    sur_details = sur_details.drop(columns=empty_details_cols)

    # remove rows where every value is null
    sur_funding = sur_funding.dropna(how='all')

    # delete rows that are not part of the data: an informational cell ...
    is_info_row = sur_funding["project_cycle"] == 'Added Field not from App'
    sur_funding = sur_funding[~is_info_row]
    # ... and the sum total for all entries
    is_total_row = sur_funding["total_project_cost"] == '370,984,000.00'
    sur_funding = sur_funding[~is_total_row]

    # merge sur_funding and sur_details
    merge_on = ['project_app_id', 'project_cycle', 'a2_ct_dist', 'a1_locode']
    merged = pd.merge(sur_details, sur_funding, how="outer", on=merge_on, indicator=True)

    # keep entries that merge. Right_only rows are misentered and more
    # informational columns. Copy so the assignments below act on an
    # independent frame rather than a slice of `merged`.
    df = merged[merged['_merge'] == 'both'].copy()

    # filling the null values for some of the duplicate columns
    # manually checking that values are the same as of now - will add a
    # function to check when we get the data links
    df['awarded_x'] = df['awarded_x'].fillna(df['awarded_y'])
    df['ppno_y'] = df['ppno_y'].fillna(df['ppno_x'])

    # renaming and dropping duplicate columns
    # a1_imp_agcy_name_x has manual errors so selecting a1_imp_agcy_name_y
    df = df.rename(columns={'awarded_x': 'awarded',
                            'ppno_y': 'ppno',
                            'a1_imp_agcy_name_y': 'a1_imp_agcy_name',
                            'a2_info_proj_name_y': 'a2_info_proj_name'
                            })
    df = df.drop(columns=['awarded_y', 'a1_imp_agcy_name_x', 'a2_info_proj_name_x',
                          'ppno_x', '_merge'])
    df["data_origin"] = "Funded"

    return df

In [None]:
funded = read_SUR_funding_data()

In [None]:
len(funded)

In [None]:
len(list(funded.columns))

In [None]:
funded.head()

In [None]:
import re

In [None]:
# columns still carrying pandas' `_x` merge suffix; a suffix test is used because
# re.match('.*_x', ...) also accepts names that merely contain `_x` somewhere inside
dcolumns1 = [col for col in funded.columns if isinstance(col, str) and col.endswith('_x')]

In [None]:
# columns still carrying pandas' `_y` merge suffix; a suffix test is used because
# re.match('.*_y', ...) also accepts names that merely contain `_y` somewhere inside
dcolumns2 = [col for col in funded.columns if isinstance(col, str) and col.endswith('_y')]

In [None]:
dcolumns1

In [None]:
dcolumns2

In [None]:
funded>>select(_.atp_id_x, _.atp_id_y,
               _.a3_proj_type_x, _.a3_proj_type_y)

## Merging with Master_Data

In [None]:
master_data.sample()

In [None]:
funded.sample()

In [None]:
# recode the text labels 'CYCLE 5' / 'CYCLE 6' in project_cycle as the integers 5 and 6
master_data.loc[master_data['project_cycle'] == 'CYCLE 5', 'project_cycle'] = 5
master_data.loc[master_data['project_cycle'] == 'CYCLE 6', 'project_cycle'] = 6

In [None]:
# cast the district and cycle columns to int
# (presumably so these merge keys match the funded data's dtypes -- confirm against the .info() output)
master_data[['a2_ct_dist','project_cycle']] = master_data[['a2_ct_dist','project_cycle']].astype(int)

In [None]:
# cast a1_locode to object dtype
# (presumably to match the funded data's a1_locode merge key -- confirm against the .info() output)
master_data[['a1_locode']] = master_data[['a1_locode']].astype(object)

In [None]:
subset = master_data>>select(_.project_app_id, _.project_cycle, _.a2_ct_dist, _.a1_locode)

In [None]:
subset.project_cycle.value_counts()

In [None]:
subset.info()

In [None]:
(master_data2>>select(_.project_app_id, _.project_cycle, _.a2_ct_dist, _.a1_locode)).info()

In [None]:
(funded.iloc[:,0:20]).info()

In [None]:
merge_on = ['project_app_id', 'project_cycle', 'a2_ct_dist', 'a1_locode']

In [None]:
#pd.concat([subset,funded])

In [None]:
# `np.float` was only an alias of the builtin `float`; it was deprecated in NumPy 1.20
# and removed in 1.24, so use the builtin (selects the same float64 columns)
c1 = subset.select_dtypes(float).columns
c1

In [None]:
# will have duplicates for atp_id_x and a3_proj_type_x
(pd.merge(subset, funded, how='outer', on= merge_on, indicator='full_merge')).head()

In [None]:
(pd.merge(subset, funded, how='outer', on= merge_on, indicator='full_merge')).full_merge.value_counts()

In [None]:
(pd.merge(subset, funded, how='outer', on= merge_on, indicator='full_merge')).info()

In [None]:
# subset data in app data (will concat data later) and keep full funded columns

### test using merge

In [None]:
#(pd.merge(master_data, funded, how='outer', on= merge_on, indicator='full_merge')).head()

In [None]:
#(pd.merge(master_data, funded, how='outer', on=merge_on2, indicator='full_merge')).full_merge.value_counts()

In [None]:
master_data.columns.intersection(funded.columns)

In [None]:
master_data.columns.difference(funded.columns)

In [None]:
funded.columns.difference(master_data.columns)

In [None]:
# # merging on all common columns does not work
# merge_on3 = list(master_data.columns.intersection(funded.columns))

* merging on [`project_app_id`, `project_cycle`, `a2_ct_dist`, `a1_locode`] 

In [None]:
merge_on3 = ['project_app_id', 'project_cycle', 'a2_ct_dist', 
             # 'a1_locode', 'a2_county', 'a2_info_proj_name',
             # 'a1_imp_agcy_name', 
              # 'a1_imp_agcy_street', 'a1_imp_agcy_city',
 # 'a1_imp_agcy_zip', 'a1_imp_agcy_title', 'a1_imp_agcy_ma', 'a1_imp_agcy_state_ma_num',
 # 'a1_imp_agcy_fed_ma_num', 'a1_proj_partner_exists', 'a1_proj_partner_agcy', 'a1_proj_partner_title',
 #'assembly_district', 'a2_assem_dist_a', 'a2_assem_dist_b', 'a2_assem_dist_c',
 #'congressional_district', 'a2_congress_dist_a', 'a2_congress_dist_b', 'a2_congress_dist_c', 'senate_district',
 #'a2_senate_dist_a', 'a2_senate_dist_b', 'a2_senatedistc', 'a2_info_proj_loc',
  # 'a2_mop_uza_population', 'a2_mpo', 'a2_past_proj', 'a2_past_proj_qty',
  # 'a2_proj_lat', 'a2_proj_long', 'a2_proj_scope_summary',
  # 'a2_project_location_map', 'a2_rtpa', 'a3_plan_active_trans', 'a3_plan_active_trans_exists',
  # 'a3_plan_bicycle', 'a3_plan_bicycle_exists', 'a3_plan_ped', 'a3_plan_ped_exists',
  # 'a3_plan_srts', 'a3_plan_srts_exists', 'a3_st_bicycle_applies',
  # 'a3_st_bicycle_pct', 'a3_st_num_schools', 'a3_st_ped_applies',
   # 'a3_st_ped_pct', 'a3_st_srts', 'a3_trail_elig_cost', 'a3_trail_fed_funding',
]

In [None]:
#(master_data2[master_data2.columns.intersection(merge_on3)]).info()

In [None]:
#(master_data[master_data.columns.intersection(merge_on3)]).info()

In [None]:
dfall = (pd.merge(master_data, funded, how='outer', on=merge_on3, indicator='full_merge'))

In [None]:
dfall.full_merge.value_counts()

In [None]:
# flag rows where the implementing agency name is identical in both merged sources;
# the element-wise comparison is already boolean (a missing name never equals anything)
compare_entries = dfall["a1_imp_agcy_name_x"].eq(dfall["a1_imp_agcy_name_y"]).to_numpy()
dfall["compare_desc"] = compare_entries

In [None]:
dfall.compare_desc.value_counts()

In [None]:
dfall>>filter(_.compare_desc==False)>>select(_.a1_imp_agcy_name_x,  _.a1_imp_agcy_name_y)>>arrange(_.a1_imp_agcy_name_y)

In [None]:
dfall>>filter(_.full_merge=='right_only')

In [None]:
dfall.info()

In [None]:
# columns in common with the dfs we merged
list(master_data.columns.intersection(funded.columns))

In [None]:
# (dfall.iloc[:,199:230]).info()

In [None]:
dfall.sample()

In [None]:
dfall>>select(_.awarded_x, _.awarded_y,
               # _['#_x'], _['#_y'],
               # _.ppno_x, _.ppno_y,
               # _.ppno_1_x, _.ppno_1_y,
               _.a2_info_proj_descr_x, _.a2_info_proj_descr_y)>>filter(_.a2_info_proj_descr_y.notnull())

In [None]:
# flag rows where the project description is identical in both merged sources;
# the element-wise comparison is already boolean (a missing description never equals anything)
compare_desc = dfall["a2_info_proj_descr_x"].eq(dfall["a2_info_proj_descr_y"]).to_numpy()
dfall["compare_proj_desc"] = compare_desc

In [None]:
dfall.compare_proj_desc.value_counts()

In [None]:
#check that there are no mismatched entries
dfall>>filter(_.compare_proj_desc==False)>>select(_.a2_info_proj_descr_x,  _.a2_info_proj_descr_y)>>arrange(_.a2_info_proj_descr_y)

### test using concat

In [None]:
master_data.info()

In [None]:
funded.info()

In [None]:
dfall2 = (pd.concat([master_data, funded]))

In [None]:
dfall2.info()

In [None]:
dfall2.sample()

In [None]:
len(dfall2>>group_by(_.project_app_id, _.project_cycle)>>summarize(n = _.awarded.nunique())>>arrange(_.n)>>filter(_.n>1))

In [None]:
dfall2>>group_by(_.project_app_id, _.project_cycle)>>summarize(n = _.awarded.nunique())>>arrange(_.n)>>filter(_.n>1)

In [None]:
dfall2.awarded.value_counts()

In [None]:
len(dfall2)

In [None]:
#dfall2['awarded'] = dfall2['awarded'].astype('category') 
# define the valid categories: 
#dfall2['awarded'] = dfall2['awarded'].cat.set_categories(['Y', 'N'], ordered=True) 

In [None]:
dfall2.sort_values(['project_app_id','a2_proj_scope_summary', 'project_cycle', 'awarded'], inplace=True, ascending=True) 

In [None]:
(dfall2>>select(_.project_app_id, _.a2_proj_scope_summary, _.project_cycle, _.awarded, _.data_origin)).head(50)

In [None]:
# rank 'Y' ahead of 'N' before de-duplicating: a plain string sort puts 'N' first, so
# keep='first' would retain the un-awarded application row instead of the funded one.
# The stable sort keeps the concat order among rows with the same `awarded` value.
dfall3 = (
    dfall2.assign(awarded=pd.Categorical(dfall2['awarded'], categories=['Y', 'N'], ordered=True))
    .sort_values('awarded', kind='stable')
    .drop_duplicates(subset=['project_app_id','a2_proj_scope_summary','project_cycle'], keep='first')
)

In [None]:
dfall3>>select(_.project_app_id, _.a2_proj_scope_summary, _.project_cycle, _.awarded, _.data_origin)>>arrange(_.project_app_id)

In [None]:
(dfall3>>arrange(_.project_app_id, _.project_cycle)>>select(_.project_app_id, _.project_cycle, _.awarded, _.data_origin)).head(50)

In [None]:
#should have 882 - same number as the application data...
len(dfall3)

In [None]:
## the two that might show up twice may be from the changes in application name 
## same that did not merge in the first attempt
#dfall>>filter(_.full_merge=='right_only')>>select(_.project_app_id, _.a2_proj_scope_summary_x, _.project_cycle, _.awarded_x)>>arrange(_.project_app_id)

In [None]:
#dfall>>filter(_.project_app_id=='4-Oakland, City of-2')>>select(_.project_app_id,_.a1_imp_agcy_name_x, _.a1_imp_agcy_name_y, _.project_cycle)

In [None]:
dfall3>>group_by(_.project_app_id, _.project_cycle)>>summarize(n = _.awarded.nunique())>>arrange(_.n)

In [None]:
dfall3.sample()

In [None]:
# #unnamed cols originating in Funded data -- UPDATE NOW FIXED MANUALLY
# unnamed_cols = [col for col in funded.columns if isinstance(col, str) and re.match('unnamed.*', col)]
# unnamed_cols

## function for combining data - works

In [None]:
def join_funding_and_app_data(df_funding,
                              df_app,
                              awarded_col=None,
                              sort_values_cols=None,
                              subset_cols=None
                              ):
    '''
    Stack the application and funding data and keep one row per project,
    preferring the row that is marked as awarded.

    Columns in the funded and application data that we want to use, e.g.
    awarded_col = 'awarded' (a one-element list such as ['awarded'] also works),
    sort_values_cols = ['project_app_id','a2_proj_scope_summary', 'project_cycle', 'awarded'],
    subset_cols = ['project_app_id','a2_proj_scope_summary','project_cycle']

    Parameters:
        df_funding: DataFrame of funded projects.
        df_app: DataFrame of applications (placed first when stacking).
        awarded_col: name of the column holding 'Y' / 'N'. Required.
        sort_values_cols: currently unused (the multi-column sort is disabled);
            kept so existing calls keep working.
        subset_cols: columns identifying a project, passed as `subset` to
            `drop_duplicates`. If omitted, pandas compares entire rows.

    Returns:
        DataFrame with duplicates removed. `awarded_col` is an ordered
        categorical with categories ['Y', 'N']; any other value becomes NaN.

    Raises:
        ValueError: if `awarded_col` is missing or names more than one column.
    '''
    # accept both 'awarded' and ['awarded']; `.cat`-style handling needs one column
    if isinstance(awarded_col, (list, tuple)):
        if len(awarded_col) != 1:
            raise ValueError("awarded_col must name exactly one column")
        awarded_col = awarded_col[0]
    elif awarded_col is None:
        raise ValueError("awarded_col must name exactly one column")

    # concat the funding and app dataframes
    combined = pd.concat([df_app, df_funding])

    # convert the awarded column to an ordered category ('Y' before 'N') so we
    # can order by it; assigning the array avoids aligning on the repeated index
    combined[awarded_col] = pd.Categorical(
        combined[awarded_col], categories=['Y', 'N'], ordered=True
    )

    # sort so awarded rows come first (NaN last). The stable sort keeps the
    # concat order among rows with the same awarded value, so the result is
    # deterministic.
    ranked = combined.sort_values(awarded_col, kind='stable')

    # drop duplicates so we only get the funded data instead of the
    # application data for a project that is selected
    df_final = ranked.drop_duplicates(subset=subset_cols, keep='first')

    return df_final

In [None]:
df_test = join_funding_and_app_data(funded,
                              master_data,
                              awarded_col= 'awarded',
                             sort_values_cols= ['project_app_id','a2_proj_scope_summary', 'project_cycle', 'awarded'],
                             subset_cols = ['project_app_id','a2_proj_scope_summary','project_cycle']
                             )

In [None]:
df_test.sort_values(['project_app_id','a2_proj_scope_summary', 'project_cycle', 'awarded'], inplace=True, ascending=True) 

In [None]:
(df_test>>select(_.project_app_id, _.a2_proj_scope_summary, _.project_cycle, _.awarded, _.data_origin)).head(50)

In [None]:
df_test>>group_by(_.project_cycle)>>count(_.data_origin)

#### Check what columns are the duplicates

In [None]:
#https://stackoverflow.com/questions/61793094/find-column-whose-name-contains-a-specific-value-that-is-in-a-fixed-column

In [None]:
import re                                                               

In [None]:
# columns still carrying pandas' `_x` merge suffix; a suffix test is used because
# re.match('.*_x', ...) also accepts names that merely contain `_x` somewhere inside
dcolumns = [col for col in dfall.columns if isinstance(col, str) and col.endswith('_x')]

In [None]:
len(dcolumns)

In [None]:
dcolumns

In [None]:
def remove_duplicate_cols(df, col_list):
    """
    Fill the gaps in `_x` merge columns from their `_y` counterparts, in place.

    For every name in `col_list` that ends in `_x`, the matching `_y` column
    name is derived and null values in the `_x` column are filled from it.
    Entries that do not end in `_x`, or whose `_x` / `_y` column is absent from
    `df`, are skipped. Neither column is dropped; the caller decides which of
    the pair to keep.

    Parameters:
        df: DataFrame holding the suffixed columns; modified in place.
        col_list: iterable of `_x` column names.

    Returns:
        None
    """
    suffix_x, suffix_y = '_x', '_y'
    for col_x in col_list:
        if not (isinstance(col_x, str) and col_x.endswith(suffix_x)):
            continue
        # the partner column shares the base name and differs only in the suffix
        col_y = col_x[:-len(suffix_x)] + suffix_y
        if col_x not in df.columns or col_y not in df.columns:
            continue
        df[col_x] = df[col_x].fillna(df[col_y])
    

In [None]:
dfall

In [None]:
## list of all columns
#col_list = sorted(dfall.columns.to_list())

In [None]:
#col_list

In [None]:
#dfall.columns.get_loc("a3_current_plan_x")

In [None]:
# (dfall>>select(_.a3_proj_type, 
#                _.a3_proj_type_x,
#               _.a3_proj_type_y)).info()

remove cols: 
* a3_proj_type_x

In [None]:
(dfall>>select(_.a3_proj_type, 
               _.a3_proj_type_x,
              _.a3_proj_type_y))>>arrange(_.a3_proj_type_y)

In [None]:
#(dfall.iloc[:,64:70]).info()