# Clustering with 2023 data ONLY

Try with only 2023 data

In [1]:
!pip install scipy==1.11.4



In [2]:
from calitp_data_analysis.sql import query_sql
from utils_transit_peer_groups import *
import sys
import altair as alt
import pandas as pd

sys.path.append("../ntd/monthly_ridership_report")
from update_vars import GCS_FILE_PATH, NTD_MODES, NTD_TOS

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [3]:
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

In [4]:
fixed_guideway = [
    "SR",
    "HR",
    "YR",
    "CR",
    "CC",
    "LR",
    "MG",
    "TB",
]

nonfixed_guideway = [
    "DR",
    "MB",
    "DT",
    "CB",
    "VP",
    "RB",
]

other_modes = [
    "FB",
    "OT", # no clue what this is, not in NTD glossary
    "OR", # same with this
    
]

mode_dict ={
    "SR":"fixed_guideway",
    "HR":"fixed_guideway",
    "YR":"fixed_guideway",
    "CR":"fixed_guideway",
    "CC":"fixed_guideway",
    "LR":"fixed_guideway",
    "MG":"fixed_guideway",
    "TB":"fixed_guideway",
    "DR":"nonfixed_guideway",
    "MB":"nonfixed_guideway",
    "DT":"nonfixed_guideway",
    "CB":"nonfixed_guideway",
    "VP":"nonfixed_guideway",
    "RB":"nonfixed_guideway",
    "FB":"other_mode",
    "OT":"other_mode", # no clue what this is, not in NTD glossary
    "OR":"other_mode",
}

In [5]:
#ID cols
id_cols = [
    "ntd_id",
    "source_agency",
    "city",
    "primary_uza_name"
]
# include in clustering
categorical_cols = ["mode_cat", "service", "reporter_type"]

# include in clustering
numerical_cols = [
    "total_upt",
    "total_vrh",
    "total_vrm",
    "opexp_total",
    "total_voms",
    # "uza_population",
    # "uza_area_sq_miles",
]

In [6]:
# alt method to query warehous data, via cal-itp docs
query = f"""
        SELECT
          upt.ntd_id,
          upt.source_agency,
          upt.agency_status,
          upt.city,
          upt.primary_uza_name,
          upt.uza_population,
          upt.uza_area_sq_miles,
          upt.year,
          upt.mode,
          upt.service,
          upt.reporter_type,
          SUM(upt.upt) AS total_upt,
          SUM(voms.voms) AS total_voms,
          SUM(vrh.vrh) AS total_vrh,
          SUM(vrm.vrm) AS total_vrm,
          SUM(opexp_total.opexp_total) AS opexp_total
        FROM
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt AS upt
        INNER JOIN
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_voms AS voms
        ON
          upt.ntd_id = voms.ntd_id
          AND upt.year = voms.year
          AND upt.source_agency = voms.source_agency
          AND upt.agency_status = voms.agency_status
          AND upt.primary_uza_name = voms.primary_uza_name
          AND upt.uza_population = voms.uza_population
          AND upt.uza_area_sq_miles = voms.uza_area_sq_miles
        INNER JOIN
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_vrh AS vrh
        ON
          upt.ntd_id = vrh.ntd_id
          AND upt.year = vrh.year
          AND upt.source_agency = vrh.source_agency
          AND upt.agency_status = vrh.agency_status
          AND upt.primary_uza_name = vrh.primary_uza_name
          AND upt.uza_population = vrh.uza_population
          AND upt.uza_area_sq_miles = vrh.uza_area_sq_miles
        INNER JOIN
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_vrm AS vrm
        ON
          upt.ntd_id = vrm.ntd_id
          AND upt.year = vrm.year
          AND upt.source_agency = vrm.source_agency
          AND upt.agency_status = vrm.agency_status
          AND upt.primary_uza_name = vrm.primary_uza_name
          AND upt.uza_population = vrm.uza_population
          AND upt.uza_area_sq_miles = vrm.uza_area_sq_miles
        INNER JOIN
          cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_opexp_total AS opexp_total
        ON
          upt.ntd_id = opexp_total.ntd_id
          AND upt.year = opexp_total.year
          AND upt.source_agency = opexp_total.source_agency
          AND upt.agency_status = opexp_total.agency_status
          AND upt.primary_uza_name = opexp_total.primary_uza_name
          AND upt.uza_population = opexp_total.uza_population
          AND upt.uza_area_sq_miles = opexp_total.uza_area_sq_miles
        WHERE
          upt.source_state = "CA"
          AND upt.year = 2023
        GROUP BY
          upt.ntd_id,
          upt.source_agency,
          upt.agency_status,
          upt.city,
          upt.year,
          upt.primary_uza_name,
          upt.uza_population,
          upt.uza_area_sq_miles,
          upt.mode,
          upt.service,
          upt.reporter_type
        """

ntd_2023_data = query_sql(query).fillna(0)

In [7]:
ntd_2023_data[numerical_cols] = ntd_2023_data[numerical_cols].astype("int64", errors="ignore")

## replace values for mode and tos

In [8]:
service_dict = {
    "TX":"PT",
    "TN":"PT"
}

ntd_2023_data["service"] = ntd_2023_data["service"].replace(service_dict)

In [9]:
ntd_2023_data["service"].unique()

array(['PT', 'DO'], dtype=object)

In [10]:
ntd_2023_data["mode_cat"] = ntd_2023_data["mode"].map(mode_dict)

In [11]:
ntd_2023_data["mode_cat"].unique()

array(['nonfixed_guideway', 'fixed_guideway', 'other_mode', nan],
      dtype=object)

In [20]:
ntd_2023_data.head()

Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat
0,90198,City of Porterville (COLT) - Transit Department,Active,Porterville,"Porterville, CA",69862,16.35,2023,MB,PT,Building Reporter,0,0,0,0,0,nonfixed_guideway
1,90198,City of Porterville (COLT) - Transit Department,Active,Porterville,"Porterville, CA",69862,16.35,2023,DR,PT,Building Reporter,0,0,0,0,0,nonfixed_guideway
2,90036,Orange County Transportation Authority (OCTA),Active,Orange,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,SR,PT,Full Reporter,0,1075448,2764357893,37310649233,428923289696,fixed_guideway
3,90013,Santa Clara Valley Transportation Authority (VTA),Active,San Jose,"San Jose, CA",1837446,285.48,2023,HR,DO,Full Reporter,0,262144,887382528,11150999040,232484337152,fixed_guideway
4,90170,ATC / Vancom,Inactive,0,"San Francisco--Oakland, CA",3515933,513.8,2023,DR,PT,Full Reporter,0,0,0,0,0,nonfixed_guideway


## 2023 - explore data

In [14]:
display(
    ntd_2023_data.info(),

    ntd_2023_data["service"].value_counts(),

    ntd_2023_data["mode_cat"].value_counts(),
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ntd_id             667 non-null    object 
 1   source_agency      667 non-null    object 
 2   agency_status      667 non-null    object 
 3   city               667 non-null    object 
 4   primary_uza_name   667 non-null    object 
 5   uza_population     667 non-null    int64  
 6   uza_area_sq_miles  667 non-null    float64
 7   year               667 non-null    int64  
 8   mode               667 non-null    object 
 9   service            667 non-null    object 
 10  reporter_type      667 non-null    object 
 11  total_upt          667 non-null    int64  
 12  total_voms         667 non-null    int64  
 13  total_vrh          667 non-null    int64  
 14  total_vrm          667 non-null    int64  
 15  opexp_total        667 non-null    int64  
 16  mode_cat           666 non

None

PT    437
DO    230
Name: service, dtype: int64

nonfixed_guideway    622
fixed_guideway        28
other_mode            16
Name: mode_cat, dtype: int64

## 2023 - correlation matrix

In [None]:
ntd_2023_data[numerical_cols].corr()

## 2023 - Manual Hierarchal clustering w/ ward

In [15]:
from sklearn.cluster import AgglomerativeClustering 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [16]:
numerical_cols

['total_upt', 'total_vrh', 'total_vrm', 'opexp_total', 'total_voms']

In [17]:
categorical_cols

['mode_cat', 'service', 'reporter_type']

In [18]:
# 1. set up pre-processing steps with column transformer

preprocessor = ColumnTransformer(
    [
        ("ntd_metrics", StandardScaler(), numerical_cols),
        ("categorical", OneHotEncoder(drop="first", sparse_output=False), categorical_cols)
    ]
)
preprocessor

0,1,2
,transformers,"[('ntd_metrics', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [19]:
# 2. set up pipeline. First pre-processing, then clustering
pipeline = Pipeline(
    [
        ("preprocessing", preprocessor),
        ("clustering", AgglomerativeClustering(n_clusters=10, linkage="ward"))
    ]
)

pipeline

0,1,2
,steps,"[('preprocessing', ...), ('clustering', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ntd_metrics', ...), ('categorical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_clusters,10
,metric,'euclidean'
,memory,
,connectivity,
,compute_full_tree,'auto'
,linkage,'ward'
,distance_threshold,
,compute_distances,False


### Clustering with entire raw data 

In [21]:
# 3. use pipeline to fit clustering model. create new column for clustering
ntd_2023_fit = ntd_2023_data.copy() # why do i need to copy/clone?

ntd_2023_fit["cluster_name"] = pipeline.fit_predict(ntd_2023_fit)

In [22]:
display(
    ntd_2023_fit["cluster_name"].value_counts(),
    ntd_2023_fit[["source_agency","total_upt","total_vrh","cluster_name"]].sample(3)
)

8    183
1    182
2    128
0     83
9     50
4     22
3     10
6      7
5      1
7      1
Name: cluster_name, dtype: int64

Unnamed: 0,source_agency,total_upt,total_vrh,cluster_name
508,Los Angeles County - Department of Public Work...,20089,6116,1
655,Yosemite Area Regional Transportation System (...,991200,138920,2
538,Morongo Basin Transit Authority (MBTA BUS),11810934,860409,2


In [23]:
ntd_2023_data["service"].unique()

array(['PT', 'DO'], dtype=object)

### what are the feature columns with onehotencoding drop=first?

In [24]:
feature_names = preprocessor.get_feature_names_out()
display(list(feature_names))

['ntd_metrics__total_upt',
 'ntd_metrics__total_vrh',
 'ntd_metrics__total_vrm',
 'ntd_metrics__opexp_total',
 'ntd_metrics__total_voms',
 'categorical__mode_cat_nonfixed_guideway',
 'categorical__mode_cat_other_mode',
 'categorical__mode_cat_nan',
 'categorical__service_PT',
 'categorical__reporter_type_Full Reporter',
 'categorical__reporter_type_Reduced Reporter',
 'categorical__reporter_type_Rural Reporter',
 'categorical__reporter_type_Separate Service']

### what are the feature columns with `onehotencoding drop=None`?


In [25]:
preprocessor_2 = ColumnTransformer(
    [
        ("ntd_metrics", StandardScaler(), numerical_cols),
        ("categorical", OneHotEncoder(
            # drop="first", 
            sparse_output=False
        ), categorical_cols)
    ]
)

pipeline_2 = Pipeline(
    [
        ("preprocessing", preprocessor_2),
        ("clustering", AgglomerativeClustering(n_clusters=10, linkage="ward"))
    ]
)

ntd_2023_fit_2 = ntd_2023_data.copy()

ntd_2023_fit_2["cluster"] = pipeline_2.fit_predict(ntd_2023_fit_2)

In [26]:
feature_names_2= preprocessor_2.get_feature_names_out()

display(list(feature_names_2)) # service_DO appears

['ntd_metrics__total_upt',
 'ntd_metrics__total_vrh',
 'ntd_metrics__total_vrm',
 'ntd_metrics__opexp_total',
 'ntd_metrics__total_voms',
 'categorical__mode_cat_fixed_guideway',
 'categorical__mode_cat_nonfixed_guideway',
 'categorical__mode_cat_other_mode',
 'categorical__mode_cat_nan',
 'categorical__service_DO',
 'categorical__service_PT',
 'categorical__reporter_type_Building Reporter',
 'categorical__reporter_type_Full Reporter',
 'categorical__reporter_type_Reduced Reporter',
 'categorical__reporter_type_Rural Reporter',
 'categorical__reporter_type_Separate Service']

## Using utils function to cluster

### 2023 data, all modes, PT/DO service

In [27]:
cluster_2023 = make_hierarchal_clustering(
    data = ntd_2023_data,
    num_cols = numerical_cols,
    cat_cols = categorical_cols,
    cluster_num = 10
    
)

for i in cluster_2023["cluster_name"].unique():
    print(f"""
    \n How many unique agencies are in Cluster {i}: {cluster_2023[cluster_2023['cluster_name']==i]["source_agency"].nunique()}
    Cluster {i} sample:""")
    display(cluster_2023[cluster_2023["cluster_name"]==i].head()),
    print(f"\nCluster {i} summary stats:")
    display(cluster_2023[cluster_2023["cluster_name"]==i].describe())



    
 How many unique agencies are in Cluster 2: 52
    Cluster 2 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
0,90198,City of Porterville (COLT) - Transit Department,Active,Porterville,"Porterville, CA",69862,16.35,2023,MB,PT,Building Reporter,0,0,0,0,0,nonfixed_guideway,2
1,90198,City of Porterville (COLT) - Transit Department,Active,Porterville,"Porterville, CA",69862,16.35,2023,DR,PT,Building Reporter,0,0,0,0,0,nonfixed_guideway,2
9,91063,Calaveras County Department of Public Works,Inactive,0,California Non-UZA,4357850,0.0,2023,MB,PT,Rural Reporter,0,0,0,0,0,nonfixed_guideway,2
15,91018,City of Arcata (A&MRTS),Active,Arcata,California Non-UZA,4357850,0.0,2023,MB,DO,Rural Reporter,0,16,45384,609936,8749840,nonfixed_guideway,2
25,91071,City of Chowchilla (CATX) - Transit Department,Active,Chowchilla,California Non-UZA,4357850,0.0,2023,DR,PT,Rural Reporter,0,54,55350,527958,14011704,nonfixed_guideway,2



Cluster 2 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0
mean,4182396.0,15.39125,2023.0,5340320.0,532.25,794825.0,14108640.0,84993760.0,2.0
std,763558.5,81.773353,0.0,23111910.0,999.613256,1291742.0,24799460.0,127538000.0,0.0
min,69862.0,0.0,2023.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,4357850.0,0.0,2023.0,0.0,14.5,26860.0,436746.0,3131456.0,2.0
50%,4357850.0,0.0,2023.0,63872.0,136.0,190001.5,5315841.0,30603450.0,2.0
75%,4357850.0,0.0,2023.0,1106420.0,486.0,764208.0,16158500.0,121001200.0,2.0
max,4357850.0,467.58,2023.0,216431600.0,4160.0,4746250.0,105933400.0,455201400.0,2.0



    
 How many unique agencies are in Cluster 3: 1
    Cluster 3 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
2,90036,Orange County Transportation Authority (OCTA),Active,Orange,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,SR,PT,Full Reporter,0,1075448,2764357893,37310649233,428923289696,fixed_guideway,3
159,90036,Orange County Transportation Authority (OCTA),Active,Orange,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,CR,PT,Full Reporter,0,1075448,2764357893,37310649233,428923289696,fixed_guideway,3
160,90036,Orange County Transportation Authority (OCTA),Active,Orange,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,DT,PT,Full Reporter,0,1075448,2764357893,37310649233,428923289696,nonfixed_guideway,3
161,90036,Orange County Transportation Authority (OCTA),Active,Orange,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,CB,DO,Full Reporter,0,1075448,2764357893,37310649233,428923289696,nonfixed_guideway,3
162,90036,Orange County Transportation Authority (OCTA),Active,Orange,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,CB,PT,Full Reporter,0,1075448,2764357893,37310649233,428923289696,nonfixed_guideway,3



Cluster 3 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,12237376.0,1636.83,2023.0,14695220000.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,3.0
std,0.0,0.0,0.0,39879800000.0,0.0,0.0,0.0,0.0,0.0
min,12237376.0,1636.83,2023.0,0.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,3.0
25%,12237376.0,1636.83,2023.0,0.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,3.0
50%,12237376.0,1636.83,2023.0,0.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,3.0
75%,12237376.0,1636.83,2023.0,4257365000.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,3.0
max,12237376.0,1636.83,2023.0,127754500000.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,3.0



    
 How many unique agencies are in Cluster 4: 3
    Cluster 4 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
3,90013,Santa Clara Valley Transportation Authority (VTA),Active,San Jose,"San Jose, CA",1837446,285.48,2023,HR,DO,Full Reporter,0,262144,887382528,11150999040,232484337152,fixed_guideway,4
13,90015,City and County of San Francisco (SFMTA) - Tra...,Active,San Francisco,"San Francisco--Oakland, CA",3515933,513.8,2023,DT,PT,Full Reporter,0,265482,1132664890,8302615811,334251644027,nonfixed_guideway,4
188,90026,San Diego Metropolitan Transit System (MTS),Active,San Diego,"San Diego, CA",3070300,674.72,2023,DT,PT,Full Reporter,0,265482,882610687,11694910507,121153121614,nonfixed_guideway,4
210,90013,Santa Clara Valley Transportation Authority (VTA),Active,San Jose,"San Jose, CA",1837446,285.48,2023,VP,PT,Full Reporter,0,262144,887382528,11150999040,232484337152,nonfixed_guideway,4
211,90013,Santa Clara Valley Transportation Authority (VTA),Active,San Jose,"San Jose, CA",1837446,285.48,2023,DR,PT,Full Reporter,0,262144,887382528,11150999040,232484337152,nonfixed_guideway,4



Cluster 4 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
mean,2763782.0,481.976364,2023.0,27102140000.0,264268.181818,963908600.0,10417760000.0,229441300000.0,4.0
std,739452.1,165.61955,0.0,44726120000.0,1643.517588,118012500.0,1496595000.0,87028960000.0,0.0
min,1837446.0,285.48,2023.0,0.0,262144.0,882610700.0,8302616000.0,121153100000.0,4.0
25%,1837446.0,285.48,2023.0,42518110.0,262144.0,882610700.0,8302616000.0,121153100000.0,4.0
50%,3070300.0,513.8,2023.0,949383100.0,265482.0,887382500.0,11151000000.0,232484300000.0,4.0
75%,3515933.0,674.72,2023.0,39106490000.0,265482.0,1132665000.0,11694910000.0,334251600000.0,4.0
max,3515933.0,674.72,2023.0,172516400000.0,265482.0,1132665000.0,11694910000.0,334251600000.0,4.0



    
 How many unique agencies are in Cluster 8: 82
    Cluster 8 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
4,90170,ATC / Vancom,Inactive,0,"San Francisco--Oakland, CA",3515933,513.8,2023,DR,PT,Full Reporter,0,0,0,0,0,nonfixed_guideway,8
5,90157,Access Services (AS),Active,El Monte,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,DT,PT,Full Reporter,0,24840,52087617,863173746,5569456833,nonfixed_guideway,8
7,90182,Altamont Corridor Express (ACE),Active,Stockton,"Stockton, CA",414847,92.46,2023,MB,PT,Full Reporter,0,216,198656,7908240,289204088,nonfixed_guideway,8
8,90121,Antelope Valley Transit Authority (AVTA),Active,Lancaster,"Palmdale--Lancaster, CA",359559,84.78,2023,DT,PT,Full Reporter,0,5504,14106304,225935616,2014672512,nonfixed_guideway,8
10,90127,Chico Area Transit System City of Chico (CATS),Inactive,0,"Chico, CA",111411,33.72,2023,MB,PT,Full Reporter,0,0,0,0,0,nonfixed_guideway,8



Cluster 8 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0
mean,3780447.0,559.030656,2023.0,80817280.0,6984.016393,15221640.0,216453600.0,2493637000.0,8.0
std,5020212.0,655.621215,0.0,669599400.0,11139.007553,24766590.0,329626700.0,3969685000.0,0.0
min,56904.0,14.11,2023.0,0.0,0.0,0.0,0.0,0.0,8.0
25%,175132.0,43.08,2023.0,0.0,164.0,278549.5,6571976.0,48148310.0,8.0
50%,376117.0,132.12,2023.0,756736.0,1215.0,2673351.0,43708520.0,417076200.0,8.0
75%,7876654.0,1155.775,2023.0,14250620.0,7875.0,17140620.0,269549500.0,2329236000.0,8.0
max,12237380.0,1636.83,2023.0,8965112000.0,46250.0,104547000.0,1292360000.0,14032320000.0,8.0



    
 How many unique agencies are in Cluster 9: 7
    Cluster 9 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
6,90014,Alameda-Contra Costa Transit District,Active,Oakland,"San Francisco--Oakland, CA",3515933,513.8,2023,DR,DO,Full Reporter,0,133920,442299312,4827739104,114999842880,nonfixed_guideway,9
154,90030,North County Transit District (NCTD),Active,Oceanside,"San Diego, CA",3070300,674.72,2023,YR,PT,Full Reporter,0,137781,383370894,5980799106,98931414123,fixed_guideway,9
155,90030,North County Transit District (NCTD),Active,Oceanside,"San Diego, CA",3070300,674.72,2023,CR,PT,Full Reporter,0,137781,383370894,5980799106,98931414123,fixed_guideway,9
156,90030,North County Transit District (NCTD),Active,Oceanside,"San Diego, CA",3070300,674.72,2023,MB,DO,Full Reporter,0,137781,383370894,5980799106,98931414123,nonfixed_guideway,9
157,90030,North County Transit District (NCTD),Active,Oceanside,"San Diego, CA",3070300,674.72,2023,LR,PT,Full Reporter,0,137781,383370894,5980799106,98931414123,fixed_guideway,9



Cluster 9 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,2422306.0,465.2932,2023.0,4111379000.0,103417.2,291250300.0,4563309000.0,70675040000.0,9.0
std,1161831.0,200.432996,0.0,8924114000.0,28279.115545,102639400.0,2498462000.0,31129970000.0,0.0
min,414847.0,92.46,2023.0,0.0,71496.0,140797200.0,1683439000.0,31274340000.0,9.0
25%,1946618.0,467.58,2023.0,0.0,72467.25,197664000.0,2612286000.0,41780090000.0,9.0
50%,3070300.0,513.8,2023.0,227378000.0,93000.0,340509200.0,4813493000.0,81757160000.0,9.0
75%,3515933.0,608.56,2023.0,1654991000.0,133920.0,383370900.0,5980799000.0,98931410000.0,9.0
max,3515933.0,674.72,2023.0,38918700000.0,137781.0,442299300.0,10654220000.0,114999800000.0,9.0



    
 How many unique agencies are in Cluster 0: 56
    Cluster 0 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
14,90150,City of Alameda Ferry Services,Inactive,0,"San Francisco--Oakland, CA",3515933,513.8,2023,FB,PT,Full Reporter,0,0,0,0,0,other_mode,0
35,91040,City of Dinuba,Inactive,0,California Non-UZA,4357850,0.0,2023,OT,DO,Rural Reporter,0,0,0,0,0,other_mode,0
44,90027,City of Fresno (FAX) - Department of Transport...,Active,Fresno,"Fresno, CA",717589,159.11,2023,DR,DO,Full Reporter,0,3888,13433715,153100692,1923617079,nonfixed_guideway,0
51,90024,City of La Mirada (LMT) - CS Department,Active,La Mirada,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,DR,DO,Full Reporter,0,40,47984,453536,8111912,nonfixed_guideway,0
53,90119,City of Laguna Beach (CLB) - Transit and Commu...,Active,Laguna Beach,"Mission Viejo--Lake Forest--Laguna Niguel, CA",646843,163.63,2023,JT,DO,Reduced Reporter,0,1536,2868096,27537728,298122368,,0



Cluster 0 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0
mean,4909693.0,627.364217,2023.0,493463700.0,6023.783133,14013010.0,193358600.0,2680982000.0,0.0
std,5168408.0,706.604734,0.0,1547691000.0,9455.342439,21472140.0,286048100.0,3917671000.0,0.0
min,56904.0,0.0,2023.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,297329.0,41.18,2023.0,0.0,60.5,64516.0,756051.0,8509370.0,0.0
50%,3070300.0,163.63,2023.0,0.0,1536.0,2868096.0,46923680.0,651924500.0,0.0
75%,12237380.0,1636.83,2023.0,108300200.0,5125.0,17140620.0,255654400.0,3386562000.0,0.0
max,12237380.0,1636.83,2023.0,10487810000.0,35640.0,82099620.0,983462900.0,14032320000.0,0.0



    
 How many unique agencies are in Cluster 1: 93
    Cluster 1 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
16,90300,City of Artesia - Transportation Division,Active,Artesia,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,DT,PT,Reduced Reporter,0,40,2296,37696,219984,nonfixed_guideway,1
17,90194,City of Atascadero - Public Works,Active,Atascadero,"El Paso de Robles (Paso Robles)--Atascadero, CA",67804,29.96,2023,MB,PT,Reduced Reporter,0,16,23496,272896,3774760,nonfixed_guideway,1
18,90249,City of Avalon,Active,Avalon,California Non-UZA,4357850,0.0,2023,DT,PT,Reduced Reporter,0,108,253935,1122093,25475229,nonfixed_guideway,1
19,90252,City of Bell - Community Services Department,Active,Bell,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,DT,PT,Reduced Reporter,0,640,346560,3175552,23752576,nonfixed_guideway,1
22,90258,City of Carson - Transportation Services Division,Active,Carson,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,DT,PT,Reduced Reporter,0,2375,885000,12466000,106829000,nonfixed_guideway,1



Cluster 1 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,182.0,182.0,182.0,182.0,182.0,182.0,182.0,182.0,182.0
mean,7023436.0,913.759066,2023.0,2780456.0,302.82967,407497.4,5475997.0,53736710.0,1.0
std,5579561.0,767.493136,0.0,12891150.0,472.677256,629177.1,9330211.0,84107760.0,0.0
min,54287.0,0.0,2023.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,362005.0,54.81,2023.0,3890.5,16.0,15805.25,208098.0,1909559.0,1.0
50%,12237380.0,1636.83,2023.0,189580.5,108.0,162824.0,1682824.0,18502630.0,1.0
75%,12237380.0,1636.83,2023.0,1419606.0,432.0,475498.0,5063152.0,60716190.0,1.0
max,12237380.0,1636.83,2023.0,158758700.0,2375.0,2868096.0,52224210.0,356156900.0,1.0



    
 How many unique agencies are in Cluster 6: 1
    Cluster 6 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
139,90154,Los Angeles County Metropolitan Transportation...,Active,Los Angeles,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,DR,PT,Full Reporter,0,1357312,4208721920,56047345152,1124671686144,nonfixed_guideway,6
513,90154,Los Angeles County Metropolitan Transportation...,Active,Los Angeles,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,DR,DO,Full Reporter,2851143680,1357312,4208721920,56047345152,1124671686144,nonfixed_guideway,6
514,90154,Los Angeles County Metropolitan Transportation...,Active,Los Angeles,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,RB,DO,Full Reporter,17907421184,1357312,4208721920,56047345152,1124671686144,nonfixed_guideway,6
515,90154,Los Angeles County Metropolitan Transportation...,Active,Los Angeles,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,VP,PT,Full Reporter,6353235968,1357312,4208721920,56047345152,1124671686144,nonfixed_guideway,6
517,90154,Los Angeles County Metropolitan Transportation...,Active,Los Angeles,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,HR,DO,Full Reporter,109997854720,1357312,4208721920,56047345152,1124671686144,fixed_guideway,6



Cluster 6 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
mean,12237376.0,1636.83,2023.0,46598270000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,6.0
std,0.0,0.0,0.0,59534240000.0,0.0,0.0,0.0,0.0,0.0
min,12237376.0,1636.83,2023.0,0.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,6.0
25%,12237376.0,1636.83,2023.0,4602190000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,6.0
50%,12237376.0,1636.83,2023.0,17907420000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,6.0
75%,12237376.0,1636.83,2023.0,74605880000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,6.0
max,12237376.0,1636.83,2023.0,149864400000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,6.0



    
 How many unique agencies are in Cluster 5: 1
    Cluster 5 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
516,90154,Los Angeles County Metropolitan Transportation...,Active,Los Angeles,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,MB,DO,Full Reporter,805546905600,1357312,4208721920,56047345152,1124671686144,nonfixed_guideway,5



Cluster 5 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,12237376.0,1636.83,2023.0,805546900000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,5.0
std,,,,,,,,,
min,12237376.0,1636.83,2023.0,805546900000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,5.0
25%,12237376.0,1636.83,2023.0,805546900000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,5.0
50%,12237376.0,1636.83,2023.0,805546900000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,5.0
75%,12237376.0,1636.83,2023.0,805546900000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,5.0
max,12237376.0,1636.83,2023.0,805546900000.0,1357312.0,4208722000.0,56047350000.0,1124672000000.0,5.0



    
 How many unique agencies are in Cluster 7: 1
    Cluster 7 sample:


Unnamed: 0,ntd_id,source_agency,agency_status,city,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,service,reporter_type,total_upt,total_voms,total_vrh,total_vrm,opexp_total,mode_cat,cluster_name
554,90036,Orange County Transportation Authority (OCTA),Active,Orange,"Los Angeles--Long Beach--Anaheim, CA",12237376,1636.83,2023,MB,DO,Full Reporter,331437877573,1075448,2764357893,37310649233,428923289696,nonfixed_guideway,7



Cluster 7 summary stats:


Unnamed: 0,uza_population,uza_area_sq_miles,year,total_upt,total_voms,total_vrh,total_vrm,opexp_total,cluster_name
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,12237376.0,1636.83,2023.0,331437900000.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,7.0
std,,,,,,,,,
min,12237376.0,1636.83,2023.0,331437900000.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,7.0
25%,12237376.0,1636.83,2023.0,331437900000.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,7.0
50%,12237376.0,1636.83,2023.0,331437900000.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,7.0
75%,12237376.0,1636.83,2023.0,331437900000.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,7.0
max,12237376.0,1636.83,2023.0,331437900000.0,1075448.0,2764358000.0,37310650000.0,428923300000.0,7.0


### 2023 data, non-fixed route modes , PT/DO service

In [37]:
non_fixed_route_data = ntd_2023_data[
    ntd_2023_data["mode_cat"]=="nonfixed_guideway"
].groupby(id_cols).agg({
    i:"sum" for i in numerical_cols
}).reset_index()

display(
    non_fixed_route_data["ntd_id"].value_counts()[non_fixed_route_data["ntd_id"].value_counts()>1], # empty, there are no duplicate ntd_ids
    non_fixed_route_data.head()
)

Series([], Name: ntd_id, dtype: int64)

Unnamed: 0,ntd_id,source_agency,city,primary_uza_name,total_upt,total_vrh,total_vrm,opexp_total,total_voms
0,90003,San Francisco Bay Area Rapid Transit District ...,Oakland,"San Francisco--Oakland, CA",0,681018500,21308437250,195646239500,145500
1,90004,Golden Empire Transit District (GET),Bakersfield,"Bakersfield, CA",266781033,23436378,317845296,2960905266,7209
2,90006,Santa Cruz Metropolitan Transit District (SCMTD),Santa Cruz,"Santa Cruz, CA",4341633696,278313408,3855763296,62945840016,120528
3,90007,City of Modesto (MAX),0,"Modesto, CA",0,0,0,0,0
4,90008,City of Santa Monica (BBB) - Department of Tra...,Santa Monica,"Los Angeles--Long Beach--Anaheim, CA",4854828125,260590000,2450246875,50485795000,89375


In [None]:
non_fixed_route_2023 = make_hierarchal_clustering(
    data = non_fixed_route_data,
    num_cols = numerical_cols,
    cat_cols = categorical_cols,
    cluster_num = 10
    
)

z_nonfixed_route = make_dendrogram_data(
    data = non_fixed_route_data,
    num_cols = numerical_cols,
    cat_cols = categorical_cols
)


for i in non_fixed_route_2023["cluster_name"].unique():
    print(f"""
    \n How many unique agencies are in Cluster {i}: {non_fixed_route_2023[non_fixed_route_2023['cluster_name']==i]["source_agency"].nunique()}
    Cluster {i} sample:""")
    display(non_fixed_route_2023[non_fixed_route_2023["cluster_name"]==i].head()),
    print(f"\nCluster {i} summary stats:")
    display(non_fixed_route_2023[non_fixed_route_2023["cluster_name"]==i].describe())

    
# fig_nonfixed_route = ff.create_dendrogram(z_nonfixed_route, labels = non_fixed_route_data["source_agency"].tolist())
# fig_nonfixed_route.update_layout(width=800, height=500)
# fig_nonfixed_route.show()

plt.figure(figsize=(10, 5))
dendrogram(z_nonfixed_route, labels=non_fixed_route_data["ntd_id"].tolist(), leaf_rotation=90)
plt.title("Dendrogram (Ward's Method)")
plt.xlabel("ntd_id")
plt.ylabel("Distance")
plt.tight_layout()
plt.show()

### 2023 data, fixed route modes , PT/DO service

In [None]:
fixed_route_data = ntd_2023_data[
    ntd_2023_data["mode"].isin(fixed_guideway)
].groupby(id_cols+categorical_cols).agg({
    i:"sum" for i in numerical_cols
}).reset_index()

fixed_route_data.head()

In [None]:
fixed_route_2023 = make_hierarchal_clustering(
    data = fixed_route_data,
    num_cols = numerical_cols,
    cat_cols = categorical_cols,
    cluster_num = 10
    
)

z_fixed_route = make_dendrogram_data(
    data = fixed_route_data,
    num_cols = numerical_cols,
    cat_cols = categorical_cols
)

for i in fixed_route_2023["cluster_name"].unique():
    print(f"""
    \n How many unique agencies are in Cluster {i}: {fixed_route_2023[fixed_route_2023['cluster_name']==i]["source_agency"].nunique()}
    Cluster {i} sample:""")
    display(fixed_route_2023[fixed_route_2023["cluster_name"]==i].head())

# fig_fixed_route = ff.create_dendrogram(z_fixed_route,)
# fig_fixed_route.update_layout(width=800, height=500)
# fig_fixed_route.show()

plt.figure(figsize=(10, 5))
dendrogram(z_fixed_route, labels=fixed_route_data["ntd_id"].tolist(), leaf_rotation=90)
plt.title("Dendrogram (Ward's Method)")
plt.xlabel("ntd_id")
plt.ylabel("Distance")
plt.tight_layout()
plt.show()

### 2023 data, other modes, PT/DO service

In [None]:
other_modes

In [None]:


other_mode_2023 = make_hierarchal_clustering(
    data = ntd_2023_data[ntd_2023_data["mode"].isin(other_modes)],
    num_cols = numerical_cols,
    cat_cols = categorical_cols,
    cluster_num = 10
    
)

z_other_modes = make_dendrogram_data(
    data = ntd_2023_data[ntd_2023_data["mode"].isin(other_modes)],
    num_cols = numerical_cols,
    cat_cols = categorical_cols
)

for i in other_mode_2023["cluster_name"].unique():
    print(f"""
    \n How many unique agencies are in Cluster {i}: {other_mode_2023[other_mode_2023['cluster_name']==i]["source_agency"].nunique()}
    Cluster {i} sample:""")
    display(other_mode_2023[other_mode_2023["cluster_name"]==i].head()),
    print(f"\nCluster {i} summary stats:")
    display(other_mode_2023[other_mode_2023["cluster_name"]==i].describe())

    
fig_fixed_route = ff.create_dendrogram(z_other_modes)
fig_fixed_route.update_layout(width=800, height=500)
fig_fixed_route.show()

### non-fixed route, just DO


In [None]:
non_fixed_route_do = ntd_2023_data[
    (ntd_2023_data["mode"].isin(nonfixed_guideway))
    & (ntd_2023_data["service"]=="DO")
].groupby(id_cols+categorical_cols).agg({
    i:"sum" for i in numerical_cols
}).reset_index()

display(
    non_fixed_route_do.info(),
    non_fixed_route_do.head()
)



In [None]:
non_fixed_route_do_2023 = make_hierarchal_clustering(
    data = non_fixed_route_do,
    num_cols = numerical_cols,
    cat_cols = categorical_cols,
    cluster_num = 10
    
)

z_nonfixed_route_do = make_dendrogram_data(
    data = non_fixed_route_do,
    num_cols = numerical_cols,
    cat_cols = categorical_cols
)


for i in non_fixed_route_do_2023["cluster_name"].unique():
    print(f"""
    \n How many unique agencies are in Cluster {i}: {non_fixed_route_do_2023[non_fixed_route_do_2023['cluster_name']==i]["source_agency"].nunique()}
    Cluster {i} sample:""")
    display(non_fixed_route_do_2023[non_fixed_route_do_2023["cluster_name"]==i].head()),
    print(f"\nCluster {i} summary stats:")
    display(non_fixed_route_do_2023[non_fixed_route_do_2023["cluster_name"]==i].describe())

    
# fig_nonfixed_route = ff.create_dendrogram(z_nonfixed_route, labels = non_fixed_route_data["source_agency"].tolist())
# fig_nonfixed_route.update_layout(width=800, height=500)
# fig_nonfixed_route.show()

plt.figure(figsize=(10, 5))
dendrogram(z_nonfixed_route_do, labels=non_fixed_route_do["ntd_id"].tolist(), leaf_rotation=90)
plt.title("Dendrogram (Ward's Method)")
plt.xlabel("ntd_id")
plt.ylabel("Distance")
plt.tight_layout()
plt.show()