## Add SB1 geographic information.

In [1]:
import _utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
import fsspec
from calitp import *
from calitp.storage import get_fs

fs = get_fs()
import os

### Non SHOPP-ATP-TIRCP

In [59]:
# Read in 10 Year non SHOPP with ATP and TIRCP
# The workbook is passed through calitp's to_snakecase right after loading
# (presumably to snake_case the column names — the later cells reference
# snake_case columns).
nonshopp = to_snakecase(pd.read_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_atp_tircp.xlsx"))

In [60]:
# Subset to join.
# Each column is listed exactly once: a repeated name would make
# `nonshopp[non_shopp_subset]` return that column twice, and
# `nonshopp.<column>` would then yield a DataFrame instead of a Series.
non_shopp_subset = [
    "ppno",
    "project_name",
    "lead_agency",
    "previous_caltrans_nominations",
    "full_county_name",
    "district",
    "project_description",
    "current_phase",
    "primary_mode",
    "urban_rural",
    "total_project_cost__$1,000",
    "total_unfunded_need__$1,000",
    "notes",
    "shs_capacity_increase_detail",
]

In [62]:
nonshopp = nonshopp[non_shopp_subset]

In [63]:
# Format every district with the "{:02}" spec (e.g. 7 -> "07").
# NOTE(review): assumes `district` holds integers; the same spec behaves
# differently for strings or floats — TODO confirm the column dtype.
nonshopp.district = nonshopp.district.map("{:02}".format)

In [64]:
# Normalise project names for matching: lower-case, trim whitespace, then keep
# only the text before the first "20".
# NOTE(review): presumably meant to drop a trailing year (e.g. "... 2020"), but
# it also truncates any name that contains "20" elsewhere — confirm intended.
nonshopp.project_name = nonshopp.project_name.str.lower().str.strip().str.split("20").str[0]

### SB1 Geo
* Not every dataset has project names...

In [8]:
# Widen pandas' display limits: up to 200 columns, and no cap on the number
# of rows or on column width.
pd.set_option("display.max_columns", 200)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [9]:
def get_sb1_files():
    """Return ``gs://``-prefixed paths for the SB1 geojson files.

    Lists the ``SB1_geojsons`` folder under ``_utils.GCS_FILE_PATH`` with the
    notebook-level ``fs`` filesystem and keeps every entry whose path contains
    ``.geojson``.

    Returns
    -------
    list of str
        One path per matching file, each prefixed with ``gs://`` so it can be
        opened directly.
    """
    geojson_folder = f"{_utils.GCS_FILE_PATH}SB1_geojsons"

    # A filter that dropped the point ("Pt"/"pt") layers to keep the combined
    # frame small used to live here; it is disabled, so those layers are kept.
    return [
        "gs://" + listed_path
        for listed_path in fs.ls(geojson_folder)
        if ".geojson" in listed_path
    ]

In [10]:
sb1_files = get_sb1_files()

In [11]:
len(sb1_files)

22

In [12]:
# Load the Transit and Intercity Rail Capital Program layer on its own.
# NOTE(review): the bucket path is spelled out here rather than built from
# _utils.GCS_FILE_PATH like the other reads — confirm both point at the same
# folder. `tircp` is not referenced again in the visible cells.
with get_fs().open("gs://calitp-analytics-data/data-analyses/project_prioritization/SB1_geojsons/SB1_RCA_Projects_032022_Transit_and_Intercity_Rail_Capital_Program.geojson") as f:
    tircp = to_snakecase(gpd.read_file(f))

In [13]:
# Load the first listed SB1 file into `atp`.
# NOTE(review): this relies on the order of the list returned by
# get_sb1_files(); presumably the first entry is the ATP layer — TODO confirm.
with get_fs().open(sb1_files[0]) as f:
    atp = to_snakecase(gpd.read_file(f))

In [14]:
full_gdf = pd.DataFrame()

In [15]:
# Read every SB1 geojson once and concatenate them in a single call.
# Collecting the frames in a list (instead of concatenating onto `full_gdf`
# on each iteration) avoids re-copying the accumulated rows for every file
# and makes this cell safe to re-run: `full_gdf` is rebuilt from scratch
# rather than growing with duplicated rows.
sb1_frames = []
for sb1_path in sb1_files:
    with get_fs().open(sb1_path) as f:
        sb1_frames.append(to_snakecase(gpd.read_file(f)))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found.
full_gdf = pd.concat(sb1_frames, axis=0) if sb1_frames else pd.DataFrame()

In [16]:
full_gdf.programcodes.value_counts()

|SHOPP|    2741
|HM|       1163
|LSR|       509
|ATP|       321
|SHOPA|     165
|SGR|       161
|STIP|      126
|TIRCP|      96
|LPP-F|      68
|TCEP|       63
|LPP-C|      57
|STA|        49
|SCCP|       40
|FM|         12
|SRA|        11
Name: programcodes, dtype: int64

In [17]:
full_gdf.shape

(5582, 38)

In [18]:
# Build a stand-in title by joining program code, agency, county, fiscal year
# and total cost with single spaces.
title_parts = [
    full_gdf["programcodes"],
    full_gdf["agencies"],
    full_gdf["countynames"],
    full_gdf["fiscalyears"],
    full_gdf["totalcost"].astype(str),
]

stand_in_title = title_parts[0]
for next_part in title_parts[1:]:
    stand_in_title = stand_in_title + " " + next_part

full_gdf["project_title2"] = stand_in_title

In [19]:
full_gdf.projecttitle = full_gdf.projecttitle.fillna(full_gdf.project_title2)

In [20]:
# Rows that agree on all of these columns are treated as the same project and
# collapsed to one (duplicates presumably come from loading both the ln and
# pt files).
dedupe_cols = [
    "agencies",
    "programcodes",
    "sb1funds",
    "projectid",
    "projecttitle",
    "totalcost",
]
full_gdf2 = full_gdf.drop_duplicates(subset=dedupe_cols).reset_index(drop=True)

In [21]:
full_gdf2.programcodes.value_counts()

|SHOPP|    2741
|HM|        913
|LSR|       509
|ATP|       321
|SHOPA|     164
|SGR|       161
|STIP|      126
|TIRCP|      96
|LPP-F|      68
|TCEP|       60
|LPP-C|      57
|STA|        49
|SCCP|       40
|FM|         12
|SRA|        11
Name: programcodes, dtype: int64

In [22]:
# Columns to keep from the combined SB1 geojson frame; applied to `full_gdf2`
# in a later cell.
subset = [
    "projectid",
    "agencyids",
    "agencies",
    "programcodes",
    "fiscalyears",
    "projectstatuses",
    "sb1funds",
    "iijafunds",
    "totalcost",
    "assemblydistricts",
    "senatedistricts",
    "congressionaldistricts",
    "countynames",
    "citynames",
    "ct_districts",
    "issb1codes",
    "isiijacode",
    "isonshscodes",
    "geometry",
    "projecttitle",
    "projectdescription",
]

In [23]:
# Fill in NA: 0.0 for float64 columns and the string "None" for object columns.
# The fill values are built as an explicit per-column mapping from full_gdf2's
# own dtypes, so columns of any other dtype are left untouched instead of
# being handed a dtype object as their fill value.
na_fill_by_dtype = {"float64": 0.0, "object": "None"}
na_fill_values = {
    col: na_fill_by_dtype[str(dtype)]
    for col, dtype in full_gdf2.dtypes.items()
    if str(dtype) in na_fill_by_dtype
}
full_gdf2 = full_gdf2.fillna(na_fill_values)

In [24]:
full_gdf2 = full_gdf2[subset]

In [25]:
full_gdf2.shape, full_gdf2.projecttitle.nunique()

((5328, 21), 1565)

In [26]:
# Run the two cost columns through the shared funding-alignment helper.
funding_cols = ["totalcost", "sb1funds"]
full_gdf2 = _utils.align_funding_numbers(full_gdf2, funding_cols)

In [27]:
# Lower case and clean project names: lower-case, trim whitespace, then keep
# only the text before the first "20".
# NOTE(review): presumably meant to drop a trailing year, but it also
# truncates any title that contains "20" elsewhere — confirm intended.
full_gdf2.projecttitle = (
    full_gdf2.projecttitle.str.lower().str.strip().str.split("20").str[0]
)

In [28]:
# Get rid of the "|" characters that wrap these values (e.g. "|SHOPP|").
# regex=False makes the pipe a literal character on every pandas version:
# as a regular expression "|" is an alternation of two empty patterns, and
# relying on the implicit default only works where single-character patterns
# are special-cased as literals.
pipe_wrapped_cols = [
    "programcodes",
    "issb1codes",
    "projecttitle",
    "isiijacode",
    "isonshscodes",
]
for col in pipe_wrapped_cols:
    full_gdf2[col] = full_gdf2[col].str.replace("|", "", regex=False)



#### Low hanging fruit, TIRCP & ATP

In [71]:
full_gdf2.programcodes.unique()

array(['ATP', 'FM', 'LPP-C', 'LPP-F', 'LSR', 'SCCP', 'SHOPP', 'SHOPA',
       'HM', 'SRA', 'STA', 'STIP', 'SGR', 'TCEP', 'TIRCP'], dtype=object)

In [82]:
tircp_atp = full_gdf2.loc[full_gdf2.programcodes.str.contains("TIRCP|ATP")].reset_index(drop = True)

In [84]:
# Keep the non-SHOPP projects whose previous nominations mention TIRCP or ATP.
# na=False treats rows with no nomination text as non-matches; without it the
# mask would contain NaN for those rows and .loc would refuse to index with it.
tircp_atp_nonshopp = nonshopp.loc[
    nonshopp.previous_caltrans_nominations.str.contains("TIRCP|ATP", na=False)
].reset_index(drop=True)

#### Try to merge CSV project IDs with GeoJSON
* CSV version has cleaner project names.

In [45]:
def clean_project_id(df, project_id_col: str):
    """Return a copy of ``df`` with a normalised project-id column.

    The ids have apostrophes removed, are lower-cased and stripped of
    surrounding whitespace, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``project_id_col`` as a string column.
    project_id_col : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned column in its original position. The
        input frame is left unmodified, which also avoids
        ``SettingWithCopyWarning`` when ``df`` is a slice of another frame.
    """
    cleaned_ids = (
        df[project_id_col]
        .str.replace("'", "", regex=False)  # literal apostrophe, not a pattern
        .str.lower()
        .str.strip()
    )
    return df.assign(**{project_id_col: cleaned_ids})

In [29]:
# Read in SB1
# Keep only the id/name columns. The explicit .copy() makes the subset an
# independent frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
sb1_project_id = to_snakecase(
    pd.read_csv(f"{_utils.GCS_FILE_PATH}RebuildingCA_map_Data.csv")
)[["project_id", "project_name"]].copy()

In [46]:
sb1_project_id = clean_project_id(sb1_project_id, "project_id")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [47]:
full_gdf2 = clean_project_id(full_gdf2, "projectid")

In [69]:
full_gdf2.projectid.nunique(), sb1_project_id.project_id.nunique(), len(sb1_project_id)

(4050, 9544, 9632)

In [48]:
pd.merge(sb1_project_id, full_gdf2, how="outer",left_on=["project_id"], right_on=['projectid'], indicator=True)[
    ["_merge"]
].value_counts()

_merge    
left_only     5595
both          4174
right_only    1321
dtype: int64

In [55]:
sb1_m = pd.merge(full_gdf2, sb1_project_id, how="left",left_on=["projectid"], right_on=['project_id']) 

In [56]:
# Normalise the CSV project names the same way the other name columns in this
# notebook are normalised (lower-case, trim, cut at the first "20") before
# they are used as a join key; left in their original casing they cannot
# match lower-cased names from the non-SHOPP table. Rows without a CSV name
# fall back to the geojson title, which was already normalised earlier.
csv_project_names = (
    sb1_m.project_name.str.lower().str.strip().str.split("20").str[0]
)
sb1_m.project_name = csv_project_names.fillna(sb1_m.projecttitle)

### Merge 1 with Non SHOPP
* Didn't work on project names :( 

In [65]:
pd.merge(nonshopp, sb1_m, how="outer", on = ['project_name'], indicator=True)[
    ["_merge"]
].value_counts()

_merge    
right_only    5495
left_only      905
both             0
dtype: int64

### SB1 CSV

In [None]:
# NOTE(review): `sb1` is not assigned in any cell visible in this notebook;
# presumably it was the full RebuildingCA CSV — define it before running
# this section.
sb1.shape, sb1.project_name.nunique()

In [None]:
# Titles are not necessarily specifically named.
# Tends to be very general
# sb1.project_name.value_counts()

In [None]:
sb1_subset = [
    "project_name",
    "implementing_agency",
    "sb1_program",
    "fiscal_year",
    "project_description",
    "total_cost",
    "sb1_funds",
    "is_sb1?",
    "project_status",
    "assembly_districts",
    "senate_districts",
    "congressional_districts",
    "counties",
    "cities",
    "caltrans_districts",
    "on_shs?",
]

In [None]:
sb1_2 = sb1[sb1_subset]

In [None]:
sb1.sb1_program.value_counts()

In [None]:
sb1_2.shape

In [None]:
# Fill in NA: 0.0 for float64 columns and the string "None" for object
# columns. The mapping is built per column so that columns of any other dtype
# are left as they are instead of receiving a dtype object as fill value.
na_fill_by_dtype = {"float64": 0.0, "object": "None"}
sb1_2 = sb1_2.fillna(
    {
        col: na_fill_by_dtype[str(dtype)]
        for col, dtype in sb1_2.dtypes.items()
        if str(dtype) in na_fill_by_dtype
    }
)

In [None]:
# Align funding
sb1_2 = _utils.align_funding_numbers(
    sb1_2,
    [
        "total_cost",
        "sb1_funds",
    ],
)

#### Merge on Project Names

In [None]:
# Lower case and clean project names
# Applies the lower-case / strip / cut-at-first-"20" normalisation to both
# frames in place.
# NOTE(review): in the visible cells `df` is only assigned as the loop
# variable of the geojson loading loop, so on a fresh run it would be the last
# geojson frame rather than the non-SHOPP table — confirm which frame is
# intended (possibly `nonshopp`).
for i in [sb1_2, df]:
    i["project_name"] = i["project_name"].str.lower().str.strip().str.split("20").str[0]

In [None]:
# 67 without accounting for districts
pd.merge(df, sb1_2, how="outer", on=["project_name"], indicator=True)[
    ["_merge"]
].value_counts()

In [None]:
# 62 matches
pd.merge(
    df,
    sb1_2,
    how="outer",
    left_on=["project_name", "district"],
    right_on=["project_name", "caltrans_districts"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
project_title_m = pd.merge(
    df,
    sb1_2,
    how="left",
    left_on=["project_name", "district"],
    right_on=["project_name", "caltrans_districts"],
    indicator=True,
)

In [None]:
preview = [
    "project_name",
    "district",
    "caltrans_districts",
    "counties",
    "full_county_name",
    "project_description_x",
    "project_description_y",
    "previous_caltrans_nominations",
    "sb1_program",
    "total_project_cost__$1,000",
    "total_cost",
]

In [None]:
# project_title_m.loc[project_title_m._merge == 'both'][preview]

In [None]:
project_title_m.sb1_program.value_counts()

In [None]:
sb1_csv = project_title_m.project_name.unique().tolist()

### TIRCP
* None of the projects from TIRCP are mapping, even though the names appear the same.

In [None]:
# sb1_tircp = sb1_2.loc[sb1_2.sb1_program == "Transit and Intercity Rail Capital Program"].reset_index(drop = True)

In [None]:
# sb1_tircp[['project_name','caltrans_districts','counties']].sort_values('project_name').head(2)

In [None]:
# tircp_sb[['project_name']].sort_values(by = 'project_name')

In [None]:
# tircp_shopp[['project_name']].sort_values(by = 'project_name')

In [None]:
# Keep the rows whose previous nominations mention TIRCP.
# na=False treats rows with no nomination text as non-matches; without it the
# mask would contain NaN for those rows and .loc would refuse to index with it.
# NOTE(review): `df` is presumably the non-SHOPP table here — confirm, since
# the visible cells only assign `df` inside the geojson loading loop.
tircp_shopp = df.loc[
    df.previous_caltrans_nominations.str.contains("TIRCP", na=False)
].reset_index(drop=True)

In [None]:
# why are there no matches??
# Outer merge on project_name, then count rows per merge status.
# NOTE(review): `tircp_sb` is not assigned in any visible cell — TODO define
# it (or confirm the intended frame) before running this check.
pd.merge(tircp_shopp, tircp_sb, how="outer", on=["project_name"], indicator=True)[
    ["_merge"]
].value_counts()

In [None]:
import urllib.parse

import requests

In [None]:
url = r"https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/query"

In [None]:
# params = {
#    'geometry': '-118.21637221791077, 34.094916196179504',
#    'geometryType': 'esriGeometryPoint',
#    'returnGeometry': 'true',
#    'f': 'pjson'
# }

In [None]:
# url_final = url + urllib.parse.urlencode(params)

In [None]:
# response = requests.get(url=url_final)

In [None]:
# data = response.text

In [None]:
# test = gpd.read_file(data)

In [None]:
# https://services.arcgisonline.com/arcgis/rest/services/Elevation/World_Hillshade/MapServer?f=json

In [None]:
# https://services.arcgisonline.com/arcgis/rest/services/Elevation/World_Hillshade/MapServer/tilemap/0/0/0/8/8?f=json