## Looking at SmartSheet Columns versus QMRS Premade Reports

In [1]:
import _utils
import _string_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
# Display settings: show up to 200 columns, and never truncate rows or
# cell contents when a frame is rendered.
pd.set_option("display.max_columns", 200)
for _unbounded_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(_unbounded_option, None)

In [3]:
import fuzzywuzzy
from fuzzywuzzy import process

In [4]:
# Base location of the spreadsheets read below: the "smartsheet_columns/"
# folder under the project-wide `_utils.GCS_FILE_PATH` prefix.
gcs_path = f"{_utils.GCS_FILE_PATH}smartsheet_columns/"

### Looking at SmartSheet Columns
* Manually cleaned up Excel sheet to retain only columns drawn from CTIPS/PRSM 

In [5]:
# Load the manually cleaned Smartsheet column inventory and pass it through
# `to_snakecase`.
smartsheet_cols = pd.read_excel(f"{gcs_path}Smartsheet Columns.xlsx").pipe(
    to_snakecase
)

In [6]:
# Replace checkmarks with "yes" and blanks with "no" -- but only in the
# checkmark columns. A frame-wide `.fillna("no")` would also overwrite
# missing values in the free-text columns (`column_name`,
# `description_notes`) with "no", which misrepresents an empty note.
# "ü" is how a checkmark cell reads once the workbook is loaded
# (presumably a Wingdings checkmark -- confirm against the workbook).
checkmark_cols = ["ctips", "prsm"]
smartsheet_cols[checkmark_cols] = (
    smartsheet_cols[checkmark_cols].replace({"ü": "yes"}).fillna("no")
)

In [7]:
# Normalise the Smartsheet names stored in `column_name`: lowercase them and
# swap spaces for underscores.
# NOTE(review): punctuation such as "(", "&" and "$" is kept, so these names
# may not line up with headers produced by `to_snakecase` -- confirm whether
# that is intended.
smartsheet_cols["column_name"] = (
    smartsheet_cols["column_name"].str.lower().str.replace(" ", "_")
)

In [8]:
# Sorted list of the distinct Smartsheet names.
smartsheet_cols_list = (
    smartsheet_cols["column_name"].drop_duplicates().sort_values().tolist()
)

In [9]:
# Number of distinct Smartsheet column names.
len(smartsheet_cols_list)

27

In [10]:
# Load the full dataset (`cleaned_data_atp_tircp.xlsx`, passed through
# `to_snakecase`) so its columns can be compared with the Smartsheet list.
nonshopp_full_df = pd.read_excel(
    f"{_utils.GCS_FILE_PATH}cleaned_data_atp_tircp.xlsx"
).pipe(to_snakecase)

In [11]:
# nonshopp_full_df.sample().T.reset_index().sort_values('index')

In [12]:
# Show the sorted Smartsheet column names for reference while matching below.
smartsheet_cols_list

['beg_pm_(pm_back)',
 'caltrans_(ct)_project_id',
 'con_capital_cost_($1,000)',
 'con_existing_source_of_funding',
 'con_start_date_(m500)',
 'con_support_cost_($1,000)',
 'county',
 'current_phase',
 'district',
 'end_pm',
 'expenditure_authorization_(ea)',
 'funding_need_phase',
 'pa&ed_cost_($1,000)',
 'parcel_counts',
 'pid_approval_date_(m010)',
 'project_description',
 'project_name',
 'project_planning_number_(ppno)',
 'ps&e_cost_($1,000)',
 'purpose_&_need',
 'route',
 'row_cost_($1,000)',
 'rtl_date_(m460)',
 'target_opening_year',
 'target_pa&ed_(m200)',
 'total_project_cost_($1,000)',
 'total_unfunded_need_($1,000)']

#### Columns from Capital Improvement Project Report
* https://qmrs.dot.ca.gov/qmrs/f?p=CIPOUTNEW:HOME
* Per John, this report has all the current PRSM milestone information and programmed CTIPS funding.
* No additional information in the "about" page. 
* Maybe around 14 columns in here that match Smartsheet?
* QMRS tutorial: https://pdelearning.ctpass.dot.ca.gov/mod/scorm/view.php?id=1503

In [13]:
# Load the Capital Improvement Project Report export and pass it through
# `to_snakecase`.
prsm_ctips = pd.read_excel(
    f"{gcs_path}Capital Improvement Project Report.xlsx"
).pipe(to_snakecase)



<b>Manual Check</b>
* Left: `smartsheet`. Right: `prsm_ctips`.

<b> Somewhat sure</b>
1. 'caltrans_(ct)_project_id': 'project_id'
2. 'total_project_cost_($1,000)': 'total_capital_estimate'
3. 'project_planning_number_(ppno)': 'ctips_ppno'
4. 'con_capital_cost_($1,000)': 'total_capital_estimate'
5. 'con_capital_cost_($1,000)': 'total_con_capital_est'

<b>Unsure</b>
1. 'beg_pm_(pm_back)': 'bpm' ???
2. 'rtl_date__m460': 'rtl_finish_date'
3. 'target_pa&ed_(m200)': 'target_pa_ed__m200'
4. 'current_phase':'project_status'
5.  'rtl_date_(m460)': 'rtl_finish_date' 

In [14]:
# prsm_ctips.sample().T.reset_index().sort_values('index')

In [15]:
# Alphabetically sorted column names of the report, as a plain list.
prsm_ctips_cols = prsm_ctips.columns.sort_values().tolist()

In [16]:
# Number of columns in the Capital Improvement Project Report.
len(prsm_ctips_cols)

165

In [17]:
# Exact-name matches between the two sources: only 3 columns overlap for sure.
print(set(smartsheet_cols_list) & set(prsm_ctips_cols))

{'county', 'route', 'district'}


In [18]:
# Fuzzy-match every report column against the Smartsheet `column_name`
# values, using "prsm_ctips_1" as the target column.
# NOTE(review): presumably the helper writes accepted matches into
# `smartsheet_cols["prsm_ctips_1"]` in place and the trailing 70 is the
# minimum match score -- confirm both in `_string_utils`.
for prsm_col in prsm_ctips_cols:
    _string_utils.replace_matches_set_ratio(
        smartsheet_cols,
        "column_name",
        "prsm_ctips_1",
        prsm_col,
        70,
    )

In [19]:
# How many Smartsheet names have a value in `prsm_ctips_1`.
int(smartsheet_cols["prsm_ctips_1"].notna().sum())

6

In [20]:
# Rows where `prsm_ctips_1` is populated, for manual review.
smartsheet_cols.dropna(subset=["prsm_ctips_1"])

Unnamed: 0,column_name,description_notes,ctips,prsm,prsm_ctips_1
1,expenditure_authorization_(ea),Unique 5-digit number assigned by Caltrans.,yes,yes,ea
3,project_name,In CTIPS: “Project Title”,yes,yes,project_nickname
4,district,Caltrans HQ DOTP is District 74. DRMT is District 75,yes,yes,district
5,county,Multiple counties = “VAR” in CTIPS,yes,yes,county
6,route,CTIPS is blank for rail projects; OSIP uses “OFF”,yes,yes,route
7,project_description,CTIPS: “Description”. Note: CTIPS description often does not match the project description provided in the intake forms.,yes,yes,work_description


### Statewide Delivery Plan
* Mostly SHOPP projects; the `document` column also contains a small number of STIP rows

In [21]:
# Load the Statewide Delivery Plan export and pass it through `to_snakecase`.
sw_delivery_plans = pd.read_excel(
    f"{gcs_path}Statewide Delivery Plan.xlsx"
).pipe(to_snakecase)



In [22]:
# Row count per value of the `document` column.
sw_delivery_plans["document"].value_counts()

SHOPP    200
STIP      12
Name: document, dtype: int64