## Looking at SmartSheet Columns versus QMRS

In [1]:
import _utils
import _string_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
# Notebook display settings: show up to 200 columns, and never truncate
# rows or cell contents when rendering DataFrames.
pd.set_option("display.max_columns", 200)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
import fuzzywuzzy
from fuzzywuzzy import process

In [4]:
# Folder prefix for the Excel workbooks read in this notebook.
gcs_path = "{}smartsheet_columns/".format(_utils.GCS_FILE_PATH)

### Looking at SmartSheet Columns
* Manually cleaned up Excel sheet to retain only columns drawn from CTIPS/PRSM 

In [5]:
# Load the manually cleaned SmartSheet column inventory.
# to_snakecase presumably normalizes the header names — confirm in calitp.sql.
smartsheet_cols = pd.read_excel(f"{gcs_path}Smartsheet Columns.xlsx").pipe(to_snakecase)

In [6]:
# Turn the "ü" marker into an explicit "yes"; every remaining blank becomes "no".
# NOTE(review): fillna runs on the whole frame, so blanks in non-flag columns
# (e.g. description_notes) are also filled with "no" — confirm that is intended.
smartsheet_cols = (
    smartsheet_cols
    .replace(to_replace={"ü": "yes"})
    .fillna(value="no")
)

In [7]:
# Peek at one row; the fixed seed keeps the displayed row stable across
# Restart & Run All instead of changing on every execution.
smartsheet_cols.sample(n=1, random_state=0)

Unnamed: 0,column_name,description_notes,ctips,prsm
13,Target Opening Year,"Construction contract acceptance milestone – end construction\nDistrict provides updated schedule information. Alternatively, could use CTIPS.",yes,yes


In [8]:
# Normalize the SmartSheet column names: lowercase, spaces -> underscores.
smartsheet_cols["column_name"] = (
    smartsheet_cols["column_name"]
    .str.lower()
    .str.replace(" ", "_")
)

In [9]:
# Sorted, de-duplicated list of the normalized SmartSheet column names.
smartsheet_cols_list = (
    smartsheet_cols["column_name"]
    .sort_values()
    .unique()
    .tolist()
)

In [10]:
# Number of distinct SmartSheet column names after normalization.
len(smartsheet_cols_list)

27

In [38]:
# Load the cleaned ATP/TIRCP workbook.
# NOTE(review): nonshopp_full_df is not referenced again in the visible part of
# this notebook — confirm it is still needed.
nonshopp_full_df = pd.read_excel(
    f"{_utils.GCS_FILE_PATH}cleaned_data_atp_tircp.xlsx"
).pipe(to_snakecase)

In [42]:
# nonshopp_full_df.sample().T.reset_index().sort_values('index')

In [11]:
# Show the full sorted list of normalized SmartSheet column names
# (the left-hand side of the comparison against the report columns).
smartsheet_cols_list

['beg_pm_(pm_back)',
 'caltrans_(ct)_project_id',
 'con_capital_cost_($1,000)',
 'con_existing_source_of_funding',
 'con_start_date_(m500)',
 'con_support_cost_($1,000)',
 'county',
 'current_phase',
 'district',
 'end_pm',
 'expenditure_authorization_(ea)',
 'funding_need_phase',
 'pa&ed_cost_($1,000)',
 'parcel_counts',
 'pid_approval_date_(m010)',
 'project_description',
 'project_name',
 'project_planning_number_(ppno)',
 'ps&e_cost_($1,000)',
 'purpose_&_need',
 'route',
 'row_cost_($1,000)',
 'rtl_date_(m460)',
 'target_opening_year',
 'target_pa&ed_(m200)',
 'total_project_cost_($1,000)',
 'total_unfunded_need_($1,000)']

#### Columns from Capital Improvement Project Report
* https://qmrs.dot.ca.gov/qmrs/f?p=CIPOUTNEW:HOME
* This report has all the current PRSM milestone information and programmed CTIPS funding.
* No additional information in the "about" page. 
* qmrs tutorial: https://pdelearning.ctpass.dot.ca.gov/mod/scorm/view.php?id=1503

In [12]:
# Load the Capital Improvement Project Report export.
# to_snakecase presumably normalizes the header names — confirm in calitp.sql.
prsm_ctips = pd.read_excel(
    f"{gcs_path}Capital Improvement Project Report.xlsx"
).pipe(to_snakecase)



In [35]:
# How many report rows fall under each funding program.
prsm_ctips["program"].value_counts()

other-local          97
stip-rip             20
shopp                12
Planning              6
rte-99                5
other-state-funds     5
cmia                  5
local-asst            4
MINOR-A               4
STIP                  3
partnership           3
maint                 2
MINOR-B               2
stip-iip              1
tcrp                  1
Name: program, dtype: int64

<b>Manual Check</b>
Left: `smartsheet`. Right: `prsm_ctips`
* 'caltrans_(ct)_project_id': 'project_id'
* 'total_project_cost_($1,000)': 'total_capital_estimate'
* 'project_planning_number_(ppno)': 'ctips_ppno'

<b>Unsure</b>
* 'beg_pm_(pm_back)': 'bpm' ???
* 'rtl_date_(m460)': 'rtl_finish_date'

In [37]:
# Inspect one report record transposed, one line per column, sorted by column name.
# The fixed seed keeps the displayed record stable across Restart & Run All
# instead of changing on every execution.
(
    prsm_ctips
    .sample(n=1, random_state=0)
    .T
    .reset_index()
    .sort_values("index")
)

Unnamed: 0,index,173
135,aadd?,Yes
67,ac_finish_date,2032-11-06 17:00:00
68,ac_fiscal_year,2033.0
66,ac_milestone,M500
70,ac_percent_complete,0.0
69,ac_quarter,2.0
57,adv_finish_date,2032-07-05 17:00:00
58,adv_fiscal_year,2033.0
56,adv_milestone,M480
60,adv_percent_complete,0.0


In [14]:
# Sorted list of the report's column names.
prsm_ctips_cols = prsm_ctips.columns.sort_values().tolist()

In [25]:
# Number of columns in the Capital Improvement Project Report.
len(prsm_ctips_cols)

165

In [15]:
# Exact name matches between the two column lists — only 3 overlap for sure.
print(set(smartsheet_cols_list) & set(prsm_ctips_cols))

{'district', 'route', 'county'}


In [17]:
# Try every report column name against the SmartSheet "column_name" values;
# the helper is expected to record matches in "prsm_ctips_1" (inspected below).
# NOTE(review): 70 is presumably the minimum fuzzy-match score — confirm in _string_utils.
for prsm_ctips_col in prsm_ctips_cols:
    _string_utils.replace_matches_set_ratio(
        smartsheet_cols,
        "column_name",
        "prsm_ctips_1",
        prsm_ctips_col,
        70,
    )

In [18]:
# How many SmartSheet columns ended up with a value in "prsm_ctips_1".
len(smartsheet_cols.loc[smartsheet_cols["prsm_ctips_1"].notna()])

6

In [19]:
# Show the SmartSheet rows that have a value in "prsm_ctips_1".
smartsheet_cols.loc[smartsheet_cols["prsm_ctips_1"].notna()]

Unnamed: 0,column_name,description_notes,ctips,prsm,prsm_ctips_1
1,expenditure_authorization_(ea),Unique 5-digit number assigned by Caltrans.,yes,yes,ea
3,project_name,In CTIPS: “Project Title”,yes,yes,project_nickname
4,district,Caltrans HQ DOTP is District 74. DRMT is District 75,yes,yes,district
5,county,Multiple counties = “VAR” in CTIPS,yes,yes,county
6,route,CTIPS is blank for rail projects; OSIP uses “OFF”,yes,yes,route
7,project_description,CTIPS: “Description”. Note: CTIPS description often does not match the project description provided in the intake forms.,yes,yes,work_description


### Statewide Delivery Plan
* Mostly SHOPP projects; the `document` value counts below also show a few STIP rows.

In [20]:
# Load the Statewide Delivery Plan export.
# to_snakecase presumably normalizes the header names — confirm in calitp.sql.
sw_delivery_plans = pd.read_excel(
    f"{gcs_path}Statewide Delivery Plan.xlsx"
).pipe(to_snakecase)



In [34]:
# Row counts per value of the "document" column.
sw_delivery_plans["document"].value_counts()

SHOPP    200
STIP      12
Name: document, dtype: int64