# Manual Checks
* Cleaning the "Project" and "Allocation" sheets involves manual components. 
    * `clean_project_manual()` and `clean_allocation_manual()` respectively. 
* This notebook walks through each manual step and any changes.

In [106]:
import re

import A1_data_prep
import A5_crosswalks as crosswalks
import A6_other
import numpy as np
import pandas as pd
from calitp import *

# Display settings for this notebook: show up to 100 columns, and never
# truncate the number of rows or the width of cell text.
pd.set_option("display.max_columns", 100)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

## Project Sheet

In [107]:
# Load the "Project" sheet through A1_data_prep.load_project().
# Per the original note this is the sheet before any manipulation
# (NOTE(review): confirm load_project() applies no cleaning of its own).
project = A1_data_prep.load_project()



In [108]:
# Award cycles are expected to be 1, 2, 3, 4 or 5.
# Any other label in the counts below needs a manual correction.
project["award_cycle"].value_counts()

3           28
5           23
4           16
1           14
2           14
FY 21/22     1
Name: award_cycle, dtype: int64

In [138]:
# Count rows per award year to spot anything that does not look like a
# plain year value.
project["award_year"].value_counts()

2018    28
2022    23
2020    17
2015    14
2016    14
Name: award_year, dtype: int64

### Check PPNO - Project Relationship

In [143]:
f"There are {project.ppno.nunique()} PPNO numbers and {project.project_title.nunique()} project titles."

'There are 90 PPNO numbers and 95 project titles.'

In [111]:
# Count how many rows of the project sheet share each PPNO, so that PPNOs
# appearing more than once can be reviewed for plausibility.
# The rename implies value_counts_df() returns the values under "index" and
# the counts under "ppno" (NOTE(review): confirm in A6_other).
ppno_df = A6_other.value_counts_df(project, "ppno").rename(
    columns={"index": "ppno", "ppno": "total projects"}
)

In [112]:
# Keep only the PPNOs that are attached to more than one project row.
ppno_df = ppno_df[ppno_df["total projects"].gt(1)]

In [113]:
# Plain Python list of the repeated PPNOs, used for filtering below.
ppno_more_than_1_project = ppno_df["ppno"].tolist()

In [114]:
project[["award_year","ppno", "project_title", "grant_recipient", ]][
    project["ppno"].isin(ppno_more_than_1_project)
].sort_values('ppno')

Unnamed: 0,award_year,ppno,project_title,grant_recipient
9,2015,CP006,SFMTA Light Rail Vehicle Fleet Expansion,San Francisco Municipal Transportation Agency (SFMTA)
25,2016,CP006,Light Rail Modernization and Expansion Program,San Francisco Municipal Transportation Agency (SFMTA)
45,2018,CP006,Transit Capacity Expansion Program,San Francisco Municipal Transportation Agency (SFMTA)
38,2018,CP031,All Aboard 2018: Transforming SoCal Rail Travel,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
39,2018,CP031,Building Up: LOSSAN North Improvement Program,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
42,2018,CP034,Diesel Multiple Unit Vehicle to Zero- or Low-Emission Vehicle Conversion and West Valley Connector Bus Rapid Transit,San Bernardino County Transportation Authority (SBCTA)
44,2018,CP034,Blue Line Rail Corridor Transit Enhancements,San Diego Metropolitan Transit System (SDMTS)
27,2016,CP057,BART Silicon Valley Phase II Extension,Santa Clara Valley Transportation Authority
50,2018,CP057,"VTA’s BART Silicon Valley Extension, Phase II",Santa Clara Valley Transportation Authority (SCVTA)


## Allocation Sheet

In [115]:
# Load the "Allocation" sheet through A1_data_prep.load_allocation().
allocation = A1_data_prep.load_allocation()

### Dates
* Correct any values that didn't read in correctly in `A5_crosswalks`.
* String values without date information such as "pending" or "on hold" don't need to be corrected as they are coerced later on. 
* In contrast, values such as "6/30/2021\n12/31/2021\n10/20/2022" should be corrected. 

In [116]:
# List every distinct raw value of the 3rd-party award date column to find
# entries that were not read in as dates.
allocation["_3rd_party_award_date"].unique().tolist()

[datetime.datetime(2016, 3, 14, 0, 0),
 datetime.datetime(2016, 6, 1, 0, 0),
 datetime.datetime(2017, 6, 28, 0, 0),
 datetime.datetime(2016, 11, 3, 0, 0),
 datetime.datetime(2015, 11, 30, 0, 0),
 datetime.datetime(2015, 10, 21, 0, 0),
 datetime.datetime(2016, 1, 8, 0, 0),
 '-',
 datetime.datetime(2017, 5, 30, 0, 0),
 datetime.datetime(2016, 11, 1, 0, 0),
 datetime.datetime(2015, 1, 1, 0, 0),
 datetime.datetime(2018, 2, 2, 0, 0),
 datetime.datetime(2017, 6, 23, 0, 0),
 datetime.datetime(2015, 10, 15, 0, 0),
 datetime.datetime(2016, 4, 6, 0, 0),
 datetime.datetime(2017, 3, 22, 0, 0),
 datetime.datetime(2017, 5, 16, 0, 0),
 datetime.datetime(2019, 2, 13, 0, 0),
 nan,
 datetime.datetime(2019, 7, 23, 0, 0),
 datetime.datetime(2020, 4, 3, 0, 0),
 datetime.datetime(2019, 1, 9, 0, 0),
 datetime.datetime(2021, 8, 9, 0, 0),
 datetime.datetime(2022, 2, 15, 0, 0),
 datetime.datetime(2017, 1, 13, 0, 0),
 datetime.datetime(2021, 9, 22, 0, 0),
 datetime.datetime(2020, 3, 26, 0, 0),
 datetime.datetime

In [117]:
# List every distinct raw value of the phase completion date column to find
# entries that were not read in as dates.
allocation["phase_completion_date"].unique().tolist()

[datetime.datetime(2022, 3, 30, 0, 0),
 'Complete\n6/1/2019',
 datetime.datetime(2021, 6, 30, 0, 0),
 'Complete 6/11/2018',
 'Complete\n2/11/2018',
 'Complete\n6/30/2020',
 datetime.datetime(2020, 9, 30, 0, 0),
 datetime.datetime(2018, 6, 30, 0, 0),
 datetime.datetime(2020, 6, 29, 0, 0),
 datetime.datetime(2019, 11, 1, 0, 0),
 datetime.datetime(2018, 12, 10, 0, 0),
 'Complete\n11/13/2019',
 datetime.datetime(2020, 3, 30, 0, 0),
 'Sep-22',
 datetime.datetime(2021, 12, 30, 0, 0),
 datetime.datetime(2021, 9, 30, 0, 0),
 datetime.datetime(2020, 5, 16, 0, 0),
 '6/30/2020\nComplete',
 '-',
 'Jun-23',
 'Apr-23',
 'Jun-24',
 'Dec-25',
 '6/30/2022\nComplete',
 'Sep-24',
 'Jul-24',
 datetime.datetime(2018, 5, 26, 0, 0),
 datetime.datetime(2019, 5, 21, 0, 0),
 datetime.datetime(2022, 1, 16, 0, 0),
 'Oct-22',
 datetime.datetime(2021, 12, 31, 0, 0),
 datetime.datetime(2018, 2, 1, 0, 0),
 'Jan-24',
 'Apr-24',
 datetime.datetime(2021, 7, 7, 0, 0),
 datetime.datetime(2020, 5, 7, 0, 0),
 datetime.datet

In [118]:
# List every distinct raw value of the allocation date column to find
# entries that were not read in as dates.
allocation["allocation_date"].unique().tolist()

[datetime.datetime(2015, 10, 22, 0, 0),
 datetime.datetime(2016, 5, 19, 0, 0),
 datetime.datetime(2016, 6, 30, 0, 0),
 datetime.datetime(2015, 12, 10, 0, 0),
 datetime.datetime(2015, 8, 27, 0, 0),
 datetime.datetime(2016, 1, 21, 0, 0),
 datetime.datetime(2017, 6, 29, 0, 0),
 datetime.datetime(2016, 10, 20, 0, 0),
 datetime.datetime(2017, 8, 17, 0, 0),
 datetime.datetime(2018, 2, 1, 0, 0),
 datetime.datetime(2016, 3, 16, 0, 0),
 datetime.datetime(2016, 3, 17, 0, 0),
 datetime.datetime(2017, 1, 19, 0, 0),
 datetime.datetime(2018, 8, 16, 0, 0),
 datetime.datetime(2022, 5, 19, 0, 0),
 datetime.datetime(2018, 12, 6, 0, 0),
 datetime.datetime(2019, 10, 9, 0, 0),
 datetime.datetime(2017, 3, 16, 0, 0),
 datetime.datetime(2021, 6, 24, 0, 0),
 datetime.datetime(2020, 6, 25, 0, 0),
 datetime.datetime(2016, 12, 8, 0, 0),
 datetime.datetime(2020, 6, 24, 0, 0),
 datetime.datetime(2019, 12, 5, 0, 0),
 datetime.datetime(2018, 10, 18, 0, 0),
 nan,
 datetime.datetime(2021, 1, 28, 0, 0),
 datetime.dateti

In [119]:
# List every distinct raw value of the `led` column to find entries that
# were not read in as dates.
allocation["led"].unique().tolist()

[datetime.datetime(2022, 3, 31, 0, 0),
 datetime.datetime(2019, 6, 1, 0, 0),
 datetime.datetime(2020, 6, 28, 0, 0),
 datetime.datetime(2021, 6, 30, 0, 0),
 datetime.datetime(2019, 11, 3, 0, 0),
 datetime.datetime(2018, 11, 30, 0, 0),
 datetime.datetime(2020, 6, 30, 0, 0),
 datetime.datetime(2019, 1, 8, 0, 0),
 datetime.datetime(2018, 6, 30, 0, 0),
 datetime.datetime(2020, 6, 29, 0, 0),
 datetime.datetime(2019, 11, 1, 0, 0),
 datetime.datetime(2018, 12, 10, 0, 0),
 datetime.datetime(2021, 2, 2, 0, 0),
 datetime.datetime(2020, 6, 23, 0, 0),
 datetime.datetime(2022, 9, 30, 0, 0),
 'October 15, 2018\nSeptember 30, 2021',
 datetime.datetime(2021, 9, 22, 0, 0),
 datetime.datetime(2020, 5, 16, 0, 0),
 nan,
 datetime.datetime(2023, 6, 30, 0, 0),
 datetime.datetime(2021, 6, 1, 0, 0),
 datetime.datetime(2024, 6, 9, 0, 0),
 datetime.datetime(2024, 6, 24, 0, 0),
 datetime.datetime(2025, 2, 17, 0, 0),
 datetime.datetime(2022, 12, 30, 0, 0),
 datetime.datetime(2024, 9, 22, 0, 0),
 datetime.datetime(

## Check PPNO between Allocation and Project Sheet
* Use the PPNO from the "Project" sheet as the source of truth.

In [120]:
# Distinct PPNOs present in the project sheet.
project_ppno_list = project["ppno"].unique().tolist()

In [121]:
# Distinct PPNOs present in the allocation sheet.
allocation_ppno_list = allocation["ppno"].unique().tolist()

In [122]:
# Do both sheets hold the same number of distinct PPNOs?
len(allocation_ppno_list) == len(project_ppno_list)

False

In [123]:
# PPNOs that appear in the allocation sheet but not in the project sheet.
ppno_unique_to_alloc = list(
    set(allocation_ppno_list).difference(set(project_ppno_list))
)

In [144]:
# Number of PPNOs found only in the allocation sheet.
len(ppno_unique_to_alloc)

9

In [124]:
# Identify the projects behind the PPNOs that exist only in the allocation
# sheet: one row per distinct (award year, grant recipient, PPNO).
projects_found_only_in_alloc = (
    allocation.loc[
        allocation["ppno"].isin(ppno_unique_to_alloc),
        ["award_year", "grant_recipient", "ppno"],
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [125]:
# Display the allocation-only projects for review.
projects_found_only_in_alloc

Unnamed: 0,award_year,grant_recipient,ppno
0,2015.0,Southern California Regional Rail Authority (Metrolink),CP002
1,2016.0,Antelope Valley Transit Authority,CP018
2,2016.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP024
3,2016.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP021
4,2018.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP301
5,2018.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP042
6,2018.0,Sacramento Regional Transit District,CP053
7,2018.0,San Diego Metropolitan Transit System (MTS),CP032
8,2018.0,Transportation Agency for Monterey County,1155N


In [126]:
# Left-merge the allocation-only projects onto a subset of the project sheet,
# matching on grant recipient and award year, to look up the PPNO the project
# sheet uses for the same project. Both frames carry a "ppno" column, so the
# merged columns get the suffixes "_alloc" and "_project".
project_subset = ["award_year", "grant_recipient", "project_title", "ppno"]
m1 = projects_found_only_in_alloc.merge(
    project[project_subset],
    how="left",
    on=["grant_recipient", "award_year"],
    suffixes=("_alloc", "_project"),
)

In [134]:
# Rows that matched a project-sheet entry: for these, the ppno_alloc values
# should be corrected to the ppno_project values.
m1.loc[m1["project_title"].notna()].drop_duplicates()

Unnamed: 0,award_year,grant_recipient,ppno_alloc,project_title,ppno_project
2,2016.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP024,All Aboard: Transforming Southern California Rail Travel,CP043
3,2016.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP021,All Aboard: Transforming Southern California Rail Travel,CP043
4,2018.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP301,All Aboard 2018: Transforming SoCal Rail Travel,CP031
5,2018.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP301,Building Up: LOSSAN North Improvement Program,CP031
6,2018.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP042,All Aboard 2018: Transforming SoCal Rail Travel,CP031
7,2018.0,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN),CP042,Building Up: LOSSAN North Improvement Program,CP031


In [128]:
# Rows with no project-sheet match after the merge; their project-sheet PPNO
# still has to be found by other means.
missing_m1 = m1.loc[m1["project_title"].isna()]
missing_m1

Unnamed: 0,award_year,grant_recipient,ppno_alloc,project_title,ppno_project
0,2015.0,Southern California Regional Rail Authority (Metrolink),CP002,,
1,2016.0,Antelope Valley Transit Authority,CP018,,
8,2018.0,Sacramento Regional Transit District,CP053,,
9,2018.0,San Diego Metropolitan Transit System (MTS),CP032,,
10,2018.0,Transportation Agency for Monterey County,1155N,,


In [129]:
# Award years of the unmatched rows, used to narrow the project sheet.
missing_m1_years = missing_m1["award_year"].unique().tolist()

In [130]:
# Grant recipients of the unmatched rows, used to narrow the project sheet.
missing_m1_agencies = missing_m1["grant_recipient"].unique().tolist()

In [131]:
# Build one regex alternation of the unmatched agency names for string searching.
# * Each name goes through re.escape() because several contain parentheses,
#   e.g. "(Metrolink)" or "(MTS)", which a regex would otherwise read as
#   groups instead of literal text.
# * The alternatives are wrapped in a non-capturing group "(?:...)" so that
#   `str.contains` does not warn about match groups in the pattern.
# * The pattern is rebuilt from `missing_m1` rather than from
#   `missing_m1_agencies` itself, so re-running this cell cannot join an
#   already-joined string character by character.
missing_m1_agencies = "(?:{})".format(
    "|".join(re.escape(agency) for agency in missing_m1.grant_recipient.unique())
)

In [132]:
# Narrow the project sheet to the award years of the unmatched rows,
# keeping only the columns in project_subset.
missing_m1_projects = project.loc[
    project["award_year"].isin(missing_m1_years), project_subset
]

In [133]:
# Of those rows, show the ones whose grant recipient contains one of the
# unmatched agency names, to find their PPNO on the project sheet.
missing_m1_projects.loc[
    missing_m1_projects["grant_recipient"].str.contains(missing_m1_agencies)
]



Unnamed: 0,award_year,grant_recipient,project_title,ppno
0,2015,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environmental Sustability,CP005
6,2015,Sacramento Regional Transit District (SacRT),Sacramento Regional Transit's Refurbishment of 7 Light Rail Vehicles,CP001
14,2016,Antelope Valley Transit Authority (AVTA),"Zero Emission Bus and Vanpool Expansion in the Antelope Valley, Kern County and the Coachella Valley",CP019
23,2016,Sacramento Regional Transit District (SacRT),Downtown/Riverfront Sacramento-West Sacramento Streetcar,CP080
30,2018,Antelope Valley Transit Authority (AVTA) & Long Beach Transit,From the Desert to the Sea: Antelope Valley Transit Authority and Long Beach Transit Zero Emission Bus Initiative,CP028
41,2018,Sacramento Regional Transit District (SacRT),Accelerating Rail Modernization and Expansion in the Capital Region,CP052
55,2018,Transportation Agency for Monterey County (TAMC),Extend rail service to Monterey County,1155A
