## Clustering with 2023 data ONLY

Try with only 2023 data

In [None]:
from calitp_data_analysis.sql import query_sql
from utils_transit_peer_groups import *
import sys
import altair as alt
import pandas as pd

sys.path.append("../ntd/monthly_ridership_report")
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# include in clustering
categorical_cols = ["mode", "service", "reporter_type"]

# include in clustering
numerical_cols = [
    "total_upt",
    "total_vrh",
    "total_vrm",
    "opexp_total",
    "total_voms",
    # "uza_population",
    # "uza_area_sq_miles",
]

In [2]:
# alt method to query warehous data, via cal-itp docs
query = f"""
        SELECT
          upt.ntd_id,
          upt.source_agency,
          upt.agency_status,
          upt.city,
          upt.primary_uza_name,
          upt.uza_population,
          upt.uza_area_sq_miles,
          upt.year,
          upt.mode,
          upt.service,
          upt.reporter_type,
          SUM(upt.upt) AS total_upt,
          SUM(voms.voms) AS total_voms,
          SUM(vrh.vrh) AS total_vrh,
          SUM(vrm.vrm) AS total_vrm,
          SUM(opexp_total.opexp_total) AS opexp_total
        FROM
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt AS upt
        INNER JOIN
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_voms AS voms
        ON
          upt.ntd_id = voms.ntd_id
          AND upt.year = voms.year
          AND upt.source_agency = voms.source_agency
          AND upt.agency_status = voms.agency_status
          AND upt.primary_uza_name = voms.primary_uza_name
          AND upt.uza_population = voms.uza_population
          AND upt.uza_area_sq_miles = voms.uza_area_sq_miles
        INNER JOIN
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_vrh AS vrh
        ON
          upt.ntd_id = vrh.ntd_id
          AND upt.year = vrh.year
          AND upt.source_agency = vrh.source_agency
          AND upt.agency_status = vrh.agency_status
          AND upt.primary_uza_name = vrh.primary_uza_name
          AND upt.uza_population = vrh.uza_population
          AND upt.uza_area_sq_miles = vrh.uza_area_sq_miles
        INNER JOIN
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_vrm AS vrm
        ON
          upt.ntd_id = vrm.ntd_id
          AND upt.year = vrm.year
          AND upt.source_agency = vrm.source_agency
          AND upt.agency_status = vrm.agency_status
          AND upt.primary_uza_name = vrm.primary_uza_name
          AND upt.uza_population = vrm.uza_population
          AND upt.uza_area_sq_miles = vrm.uza_area_sq_miles
        INNER JOIN
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_opexp_total AS opexp_total
        ON
          upt.ntd_id = opexp_total.ntd_id
          AND upt.year = opexp_total.year
          AND upt.source_agency = opexp_total.source_agency
          AND upt.agency_status = opexp_total.agency_status
          AND upt.primary_uza_name = opexp_total.primary_uza_name
          AND upt.uza_population = opexp_total.uza_population
          AND upt.uza_area_sq_miles = opexp_total.uza_area_sq_miles
        WHERE
          upt.source_state = "CA"
          AND upt.year = 2023
        GROUP BY
          upt.ntd_id,
          upt.source_agency,
          upt.agency_status,
          upt.city,
          upt.year,
          upt.primary_uza_name,
          upt.uza_population,
          upt.uza_area_sq_miles,
          upt.mode,
          upt.service,
          upt.reporter_type
        """

ntd_2023_data = query_sql(query).fillna(0)

In [6]:
ntd_2023_data[numerical_cols] = ntd_2023_data[numerical_cols].astype("int64", errors="ignore")

### 2023 - explore data

In [None]:
display(
    ntd_2023_data.info(),

    ntd_2023_data.head(),

    ntd_2023_data["service"].value_counts(),

    ntd_2023_data["mode"].value_counts(),
)

### 2023 - correlation matrix

In [None]:
ntd_2023_data[numerical_cols].corr()

### 2023 - Test Hierarchal clustering w/ ward

In [7]:
from sklearn.cluster import AgglomerativeClustering 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [8]:
display(
    numerical_cols,
    categorical_cols
)

['total_upt', 'total_vrh', 'total_vrm', 'opexp_total', 'total_voms']

['mode', 'service', 'reporter_type']

In [45]:
# 1. set up pre-processing steps with column transformer

preprocessor = ColumnTransformer(
    [
        ("ntd_metrics", StandardScaler(), numerical_cols),
        ("categorical", OneHotEncoder(drop="first", sparse_output=False), categorical_cols)
    ]
)
preprocessor

0,1,2
,transformers,"[('ntd_metrics', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [46]:
# 2. set up pipeline. First pre-processing, then clustering
pipeline = Pipeline(
    [
        ("preprocessing", preprocessor),
        ("clustering", AgglomerativeClustering(n_clusters=10, linkage="ward"))
    ]
)

pipeline

0,1,2
,steps,"[('preprocessing', ...), ('clustering', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ntd_metrics', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_clusters,10
,metric,'euclidean'
,memory,
,connectivity,
,compute_full_tree,'auto'
,linkage,'ward'
,distance_threshold,
,compute_distances,False


In [47]:
# 3. use pipeline to fit clustering model. create new column for clustering
ntd_2023_fit = ntd_2023_data.copy() # why do i need to copy/clone?

ntd_2023_fit["cluster_name"] = pipeline.fit_predict(ntd_2023_fit)

In [48]:
display(
    ntd_2023_fit["cluster_name"].value_counts(),
    ntd_2023_fit[["source_agency","total_upt","total_vrh","cluster_name"]].sample(3)
)

0    167
2    135
1    114
3    103
8     84
6     45
7     10
4      7
9      1
5      1
Name: cluster_name, dtype: int64

Unnamed: 0,source_agency,total_upt,total_vrh,cluster_name
219,County of Placer (PCT/TART) - Department of Pu...,1212473344,54167552,1
507,City of Escalon - Transit Services,58077,21330,3
503,City of El Monte - Transportation Services,3348480,2021824,8


In [49]:
ntd_2023_data["service"].unique()

array(['PT', 'DO', 'TX', 'TN'], dtype=object)

In [50]:
## what are the feature columns with onehotencoding drop=first?
feature_names = preprocessor.get_feature_names_out()
display(list(feature_names))


['ntd_metrics__total_upt',
 'ntd_metrics__total_vrh',
 'ntd_metrics__total_vrm',
 'ntd_metrics__opexp_total',
 'ntd_metrics__total_voms',
 'categorical__mode_CC',
 'categorical__mode_CR',
 'categorical__mode_DR',
 'categorical__mode_DT',
 'categorical__mode_FB',
 'categorical__mode_HR',
 'categorical__mode_JT',
 'categorical__mode_LR',
 'categorical__mode_MB',
 'categorical__mode_MG',
 'categorical__mode_OR',
 'categorical__mode_OT',
 'categorical__mode_RB',
 'categorical__mode_SR',
 'categorical__mode_TB',
 'categorical__mode_VP',
 'categorical__mode_YR',
 'categorical__service_PT',
 'categorical__service_TN',
 'categorical__service_TX',
 'categorical__reporter_type_Full Reporter',
 'categorical__reporter_type_Reduced Reporter',
 'categorical__reporter_type_Rural Reporter',
 'categorical__reporter_type_Separate Service']

## what are the feature columns with onehotencoding drop=None?


In [59]:
preprocessor_2 = ColumnTransformer(
    [
        ("ntd_metrics", StandardScaler(), numerical_cols),
        ("categorical", OneHotEncoder(
            # drop="first", 
            sparse_output=False
        ), categorical_cols)
    ]
)

pipeline_2 = Pipeline(
    [
        ("preprocessing", preprocessor_2),
        ("clustering", AgglomerativeClustering(n_clusters=10, linkage="ward"))
    ]
)

ntd_2023_fit_2 = ntd_2023_data.copy()

ntd_2023_fit_2["cluster"] = pipeline_2.fit_predict(ntd_2023_fit_2)

In [60]:
feature_names_2= preprocessor_2.get_feature_names_out()

display(list(feature_names_2)) # service_DO appears

['ntd_metrics__total_upt',
 'ntd_metrics__total_vrh',
 'ntd_metrics__total_vrm',
 'ntd_metrics__opexp_total',
 'ntd_metrics__total_voms',
 'categorical__mode_CB',
 'categorical__mode_CC',
 'categorical__mode_CR',
 'categorical__mode_DR',
 'categorical__mode_DT',
 'categorical__mode_FB',
 'categorical__mode_HR',
 'categorical__mode_JT',
 'categorical__mode_LR',
 'categorical__mode_MB',
 'categorical__mode_MG',
 'categorical__mode_OR',
 'categorical__mode_OT',
 'categorical__mode_RB',
 'categorical__mode_SR',
 'categorical__mode_TB',
 'categorical__mode_VP',
 'categorical__mode_YR',
 'categorical__service_DO',
 'categorical__service_PT',
 'categorical__service_TN',
 'categorical__service_TX',
 'categorical__reporter_type_Building Reporter',
 'categorical__reporter_type_Full Reporter',
 'categorical__reporter_type_Reduced Reporter',
 'categorical__reporter_type_Rural Reporter',
 'categorical__reporter_type_Separate Service']

## Using utils function to cluster

In [None]:
raw_ntd_metrics_merge = raw_ntd_metrics_merge.fillna(0)

In [None]:
categorical_cols

In [None]:
ntd_metrics_yearly.columns

In [None]:
cluster_2023 = make_hierarchal_clustering(
    data = ntd_2023_data,
    num_cols = numerical_cols,
    cat_cols = categorical_cols,
    cluster_num = 10
    
)

# raw_ntd_metrics_merge was toooo much data, need to use the yearly aggregated total 
# using ntd_metrics_yearly doesnt have a lot of categorical columns
cluster_18_23 = make_hierarchal_clustering(
    data = ntd_metrics_yearly,
    num_cols = ['total_upt', 'total_voms', 'total_vrh', 'total_vrm', 'opexp_total'],
    cat_cols = ["reporter_type"],
    cluster_num = 10
    
)

In [None]:
cluster_2023["cluster_name"].value_counts() == ntd_2023_fit["cluster_name"].value_counts()

# confirms the function matches the manual way

In [None]:
display(
    cluster_2023["cluster_name"].value_counts(),
    cluster_18_23["cluster_name"].value_counts()
)