# First look/Clean Up

In [None]:
import numpy as np
import pandas as pd
from siuba import *
from calitp import *
from shared_utils import geography_utils

# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# GCS folder used both to read the raw workbook and to write the cleaned output.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/lctop/"
# Name of the raw Excel workbook inside that folder.
FILE_NAME = "LCTOP_19_20.xlsx"

In [None]:
# Load the "LCTOP_Projects" sheet of the raw workbook and pass it through
# to_snakecase (presumably normalizes the column names — helper is defined
# outside this file, confirm there).
df1 = pd.read_excel(
    f"{GCS_FILE_PATH}{FILE_NAME}", sheet_name="LCTOP_Projects"
).pipe(to_snakecase)

In [None]:
# Share of missing values in each column, as a percentage of all rows.
null_counts = df1.isna().sum()
percent_missing = null_counts * 100 / len(df1)
percent_missing

In [None]:
# Columns removed before analysis, chiefly those that are almost entirely empty.
sparse_or_unused_columns = [
    "count",
    "#",
    "column3",
    "column4",
    "column5",
    "other_state_policies,_plans,_or_initiatives",
    "describe_policies,_plans,_or_initiatives",
    "#2",
    "_d",
    "contact_name",
    "contact_phone_#",
    "contact_e_mail",
    "authorized_agent_name",
    "authorized_agent_title",
    "project_id#",
]
df2 = df1.drop(columns=sparse_or_unused_columns)

In [None]:
# Split the raw location string on its first space into two coordinate columns.
# `n` is passed by keyword: from pandas 2.0 every `str.split` argument after
# `pat` is keyword-only, so the positional form raises TypeError there.
# NOTE(review): assumes the first token is the longitude and the second the
# latitude — confirm against the source sheet.
df2[['lon', 'lat']] = df1['project_location'].str.split(' ', n=1, expand=True)

In [None]:
# Names of the coordinate columns that the cleaning loops below iterate over.
geo_list = ['lon','lat']

In [None]:
# Strip stray separators from the coordinate text: commas are removed outright
# and semicolons are turned into spaces.
for col in geo_list:
    without_commas = df2[col].str.replace(",", "")
    df2[col] = without_commas.str.replace(";", " ")

In [None]:

# Convert the cleaned coordinate strings to numbers with one vectorized call
# per column instead of a Python-level call per row; values that cannot be
# parsed become NaN (errors='coerce').
for c in geo_list:
    df2[c] = pd.to_numeric(df2[c], errors='coerce')
    

In [None]:
# Spot-check five random rows: parsed coordinates next to the raw location string.
df2[['lon','lat','project_location']].sample(5)

In [None]:
# Coerce the date columns to the right (datetime) type; values that cannot be
# parsed become NaT.
# NOTE(review): parsing is done value by value; a whole-column pd.to_datetime
# call may treat mixed formats differently — confirm before vectorizing.
date_columns = ["qm_tool__date_", "completion_date", "start_date"]

for col in date_columns:
    parsed = df2[col].apply(pd.to_datetime, errors="coerce")
    df2[col] = parsed
    

In [None]:
# Fill gaps with a placeholder chosen by column dtype: 0.0 for float64 columns
# and the string 'None' for object columns. The mapping is built per column so
# that columns of any other dtype (e.g. the datetime columns) are simply left
# alone; deriving the fill values from `df2.dtypes.replace(...)` would leave
# raw dtype objects as the "fill value" for those columns.
placeholder_by_dtype = {'float64': 0.0, 'object': 'None'}
fill_values = {
    column: placeholder_by_dtype[str(dtype)]
    for column, dtype in df2.dtypes.items()
    if str(dtype) in placeholder_by_dtype
}
df2 = df2.fillna(fill_values)

In [None]:
# Print the column dtypes and non-null counts of the cleaned frame.
df2.info()

In [None]:
# Preview the first rows of the cleaned frame.
df2.head()

In [None]:
# Total project cost and total LCTOP funds for every combination of
# district and project sub-types.
funding_totals = dict.fromkeys(["total_project_cost", "total_lctop_funds"], "sum")
df2.groupby(["distr_", "project_sub_type_ii", "project_sub_type"]).agg(funding_totals)
    

In [None]:
# Distinct values of the district column, in order of first appearance.
district_list = [*df2["distr_"].unique()]

In [None]:
# Write the cleaned frame to a single-sheet workbook ("cleaned") in the same
# GCS folder, without the index column.
cleaned_path = f"{GCS_FILE_PATH}LCTOP_cleaned.xlsx"
with pd.ExcelWriter(cleaned_path) as xlsx_out:
    df2.to_excel(xlsx_out, sheet_name="cleaned", index=False)

## Geodataframe

In [None]:
# Columns carried over into the geodataframe: project identifiers and types,
# the coordinates, and the funding totals.
geo_subset_columns = [
    "funding_year",
    "distr_",
    "project_name",
    "project_type",
    "project_sub_type",
    "project_sub_type_ii",
    "project_description__short_",
    "lon",
    "lat",
    "total_lctop_funds",
    "total_cci_funds",
    "total_project_cost",
]
df3 = df2[geo_subset_columns]

In [None]:
# Keep only rows where both coordinates are non-zero; 0.0 is the placeholder
# the earlier fillna step gives to missing float values.
has_lon = df3["lon"].ne(0.00)
has_lat = df3["lat"].ne(0.00)
df3 = df3[has_lon & has_lat]

In [None]:
# Build point geometries from the lon/lat columns using the shared helper.
# NOTE(review): presumably returns a GeoDataFrame and expects the longitude
# column name before the latitude one — confirm in shared_utils.geography_utils.
gdf1 = geography_utils.create_point_geometry(df3, 'lon','lat')

In [None]:
#gdf1.to_file(f"./test_gdf.geojson", driver="GeoJSON")