# Notebook to prep functions for report

* Grouped down to the DISTRICT and the CYCLE level

In [1]:
import intake
import numpy as np
import pandas as pd
from calitp import to_snakecase
from dla_utils import _dla_utils
from IPython.display import HTML, Markdown
from siuba import *
from shared_utils import geography_utils

import altair as alt

import _data_cleaning
import _report_utils



In [2]:
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/dla/atp/'


In [3]:
df_all = _report_utils.read_in_joined_data()



In [4]:
pd.set_option("display.max_columns",500)

In [5]:
## paramaters cell
district = 4

In [6]:
## parameters cell
cycle = 5

In [7]:
## subset df to just the district
df = df_all>>filter(_.a2_ct_dist==district)

In [8]:
#df = df_all.copy()

In [9]:
df = df>>filter(_.project_cycle==cycle)

In [10]:
# #check where everything is mapped
# df = df_all.copy()

In [11]:
display(HTML("<h2>Quick Stats</h2>"))

display(HTML(f"Out of {len(df)} Active Transportation Program Project Applications, "
            f"there are <strong>{(df>>filter(_.awarded=='Y')).project_app_id.nunique()} "
            f"projects</strong> that recieved funding over "
            f"{df.project_cycle.nunique()} cycles"))


In [144]:
display(HTML(f"<h3> What were the application outcomes "
             f"for District {district} "
             f"in Cycle {cycle}?</h3>"))
display(HTML(_dla_utils.pretify_tables((df>>count(_.data_origin)))))

Data Origin,Count
Application,60
Funded,5


In [13]:
quick_view = df>>filter(_.awarded=="Y")>>select(_.data_origin, _.a1_imp_agcy_name, _.a2_info_proj_name, 
                                   _.a2_county, _.total_project_cost)

In [14]:
quick_view['total_project_cost'] = quick_view['total_project_cost'].map('$ {:0,.2f}'.format)

In [142]:
display(HTML("<h3> Funded Projects </h3>"))
display(HTML(_dla_utils.pretify_tables(quick_view)))

Data Origin,A1 Imp Agcy Name,A2 Info Proj Name,A2 County,Total Project Cost
Funded,"Oakland, City of",7th Street Connection Project,Alameda,"$ 21,037,000.00"
Funded,Santa Clara County,Active and Safe Routes to a Healthier City,Santa Clara,"$ 2,510,000.00"
Funded,"Oakland, City of",East Oakland Neighborhood Bike Routes,Alameda,"$ 21,859,000.00"
Funded,Contra Costa County,North Bailey Road Active Transportation Corridor,Contra Costa,"$ 6,845,000.00"
Funded,"Fairfield, City of",West Texas Street Complete Streets Project,Solano,"$ 16,922,000.00"


In [140]:
#df>>group_by(_.awarded)>>count(_.a2_county)>>arrange(_.a2_county)

## Mapping

In [18]:
df_map= (df >>select(_.awarded, _.project_app_id, _.project_cycle, _.data_origin, _.geometry,
               _.a1_imp_agcy_city, _.a1_imp_agcy_name, _.a1_proj_partner_agcy, 
               _.assembly_district, _.congressional_district, _.senate_district,
              _.a2_county, _.a2_info_proj_descr, _.a2_info_proj_loc, _.a2_info_proj_name,
               _.a2_mop_uza_population, _.a2_mpo, _.a1_imp_agcy_street, _.a3_proj_type, 
               _.a3_proj_type, _['total_atp_$'], _.a2_proj_lat, _.a2_proj_long))

In [19]:
df_map = df_map>>filter(_.project_cycle==5)

In [149]:
#df_map>>filter(_.geometry.isnull())

In [21]:
## reapply geometry col for lat long
df_map = (geography_utils.create_point_geometry(df_map, longitude_col = 'a2_proj_long', latitude_col = 'a2_proj_lat'))

### ~~ISSUE~~ -- SOLVED WITH A QUICK FIX
* funded data lost mapping data from geometry column

In [145]:
## map before corrections
#df_map.explore("a2_county", cmap="Blues")  

In [24]:
# issue getting some lat longs in wrong country

###  ~~ISSUE: Some Project Longs are wrong~~ - SOLVED WITH A QUICK FIX

In [28]:
df_map_correct = (df_map>>filter(_.a2_proj_long<0))

In [29]:
df_map_incorrect = (df_map>>filter(_.a2_proj_long>0))

In [30]:
df_map_incorrect['a2_proj_long'] = df_map_incorrect['a2_proj_long'] * (-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [31]:
df_map_corrected = (geography_utils.create_point_geometry(df_map_incorrect, longitude_col = 'a2_proj_long', latitude_col = 'a2_proj_lat'))

In [32]:
df_map2 = (pd.concat([df_map_corrected, df_map_correct]))

In [146]:
#check ## commenting out for now
#df_map2.explore("project_cycle", cmap="Blues")  

In [147]:
df_map3 = df_map2>>filter(_.a2_proj_lat<300)

In [155]:
#check again 
df_map3.explore("data_origin", cmap="tab20b")  

In [39]:
# still have some weird locations but better than before

In [40]:
## going back to just district

### Adding Flag for corrected geometries

## Metrics

In [42]:
unique_agencies = df>>group_by(_.project_cycle, _.a2_county, _.data_origin)>>summarize(n_unique_agency= _.a1_imp_agcy_name.nunique())

In [43]:
unique_agencies =(spread(unique_agencies, "data_origin", "n_unique_agency"))

In [44]:
unique_agencies = unique_agencies.rename(columns={"a2_county":"county_name"})

In [45]:
unique_agencies['Application'].fillna(0, inplace=True)
unique_agencies['Funded'].fillna(0, inplace=True)

In [46]:
unique_agencies['Application'] = unique_agencies['Application'].astype('int32')
unique_agencies['Funded'] = unique_agencies['Funded'].astype('int32')

In [47]:
display(HTML("<h3>Number of Unique Agencies By County</h3>"))
display(HTML(_dla_utils.pretify_tables(unique_agencies)))


Project Cycle,County Name,Application,Funded
5,Alameda,8,1
5,Contra Costa,10,1
5,Marin,5,0
5,Napa,3,0
5,San Francisco,2,0
5,San Mateo,10,0
5,Santa Clara,1,1
5,Solano,3,1
5,Sonoma,3,0


### Success Rates

In [100]:
df = (_report_utils.reorder_namecol(df,
                    og_name_col= 'a1_imp_agcy_name',
                    new_name_col= 'imp_agency_name_new', 
                    split_on = ", ",
                   order_on ='pt2_pt1'))



In [124]:
successes = (df>>group_by(_.awarded)>>count(_.imp_agency_name_new))>>spread("awarded", "n")>>arrange(-_.Y)

In [125]:
successes['total'] = (successes['N'] + successes['Y'])

In [126]:
successes['success_rate'] = (successes['Y']/successes['total'])

In [127]:
successes = successes.rename(columns={"imp_agency_name_new":"Implementing Agency", "N":"Projects Not Funded",
                                    "Y":"Funded Projects", "total":"Total Applications"})

In [128]:
successes_top = successes>>filter(_.success_rate>0)

In [129]:
successes_top['success_rate'] = successes_top['success_rate'].transform(lambda x: '{:,.2%}'.format(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [138]:
display(HTML("<h3>Application Success Rate</h3>"))

display(HTML(f"There are <strong>{len(successes>>filter(_.success_rate==0))}</strong> "
            f"implenting agencies with <strong> zero </strong>"
            f"successful applications."))

display(HTML(f"There are <strong>{len(successes>>filter(_.success_rate!=0))}</strong> "
            f"implenting agencies with <strong> one of more </strong>"
            f"successful applications."))
    
display(HTML("</br><h4> Success Rates for Agencies with Successful Applications </h4>"))
display(HTML(_dla_utils.pretify_tables(successes_top>>select(_['Implementing Agency'], _['Total Applications'], _.success_rate))))

Implementing Agency,Total Applications,Success Rate
City of Oakland,6,33.33%
City of Fairfield,1,100.00%
Contra Costa County,6,16.67%
Santa Clara County,1,100.00%
