# [Research Task - Create visuals for PUC 99314.11 leg report](https://github.com/cal-itp/data-analyses/issues/1656)
1. line graph of each metric (UPT, VRM, PMT) by agency
- x-axis is year
- y-axis is metric
- each line is an agency
- dotted line is average metric for all agencies in the year

2. line graph of each metric, by district
- similar to above
- each line is a district
- dotted line is average metrics for all districts the year

3. line graph of each metric, by mode
- similar to above
- each line is a mode
- dotter line is average metric for all modes in the year

Maybe try a box plot to show min/max/average for each metric?

In [1]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
import altair as alt

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [2]:
gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"
ntd_name = "ntd_operator_data_18_23.parquet"

ntd_all_metrics = pd.read_parquet(f"{gcs_path}{ntd_name}")
ntd_all_metrics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1614 entries, 0 to 1613
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ntd_id         1524 non-null   object 
 1   source_agency  1614 non-null   object 
 2   agency_status  1614 non-null   object 
 3   reporter_type  1614 non-null   object 
 4   year           1614 non-null   int64  
 5   total_upt      1291 non-null   float64
 6   total_vrh      1291 non-null   float64
 7   total_pmt      1291 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 113.5+ KB


In [4]:
# melt big DF so all columns are under 1 column.
group_list = ['source_agency', 'year', 'ntd_id']

melt = pd.melt(
    ntd_all_metrics,
    id_vars= group_list,
    # value_vars = ,
    var_name="metric",
    value_name="metric_value",
    ignore_index=True,
)

In [7]:
# inspect melted DF 
display(
    melt.info(),
    melt["year"].value_counts(),
    melt["metric"].value_counts(),
    # melt["source_agency"].value_counts(),
    melt.head(),
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8070 entries, 0 to 8069
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   source_agency  8070 non-null   object
 1   year           8070 non-null   int64 
 2   ntd_id         7620 non-null   object
 3   metric         8070 non-null   object
 4   metric_value   7101 non-null   object
dtypes: int64(1), object(4)
memory usage: 315.4+ KB


None

2018    1345
2022    1345
2020    1345
2023    1345
2019    1345
2021    1345
Name: year, dtype: int64

agency_status    1614
reporter_type    1614
total_upt        1614
total_vrh        1614
total_pmt        1614
Name: metric, dtype: int64

Unnamed: 0,source_agency,year,ntd_id,metric,metric_value
0,City of Porterville (COLT) - Transit Department,2018,90198,agency_status,Active
1,City of Porterville (COLT) - Transit Department,2022,90198,agency_status,Active
2,City of Porterville (COLT) - Transit Department,2020,90198,agency_status,Active
3,City of Porterville (COLT) - Transit Department,2023,90198,agency_status,Active
4,City of Porterville (COLT) - Transit Department,2019,90198,agency_status,Active


In [16]:
# What does group/agg the melted DF look like?
vrh_total = (
    melt[melt["metric"]=="total_vrh"]
    .groupby(group_list)["metric_value"]
    .sum()
    .reset_index()
).rename(columns={"metric_value":"total_vrh"})

upt_total = (
    melt[melt["metric"]=="total_upt"]
    .groupby(group_list)["metric_value"]
    .sum()
    .reset_index()
).rename(columns={"metric_value":"total_upt"})

passenger_total =(
    melt[melt["metric"]=="total_pmt"]
    .groupby(group_list)["metric_value"]
    .sum()
    .reset_index()
).rename(columns={"metric_value":"total_pmt"})

In [17]:
all_totals = [
    vrh_total,
    upt_total,
    passenger_total
]

for df in all_totals:
    display(df[df["source_agency"].str.contains("Sacramento")].head()) # some operators do not have data for some modes. this makes sense.

Unnamed: 0,source_agency,year,ntd_id,total_vrh
810,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2018,90216,21368.0
811,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2019,90216,22367.0
812,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2020,90216,20677.0
813,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2021,90216,16932.0
814,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2022,90216,17302.0


Unnamed: 0,source_agency,year,ntd_id,total_upt
810,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2018,90216,103992.0
811,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2019,90216,105479.0
812,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2020,90216,83585.0
813,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2021,90216,29659.0
814,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2022,90216,35807.0


Unnamed: 0,source_agency,year,ntd_id,total_pmt
810,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2018,90216,0.0
811,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2019,90216,0.0
812,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2020,90216,0.0
813,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2021,90216,0.0
814,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,2022,90216,0.0


In [24]:
all_totals_dict = {
    "total_vrh":vrh_total,
    "total_upt":upt_total,
    "total_pmt":passenger_total
}
for col, df in all_totals_dict.items(): 
    chart = alt.Chart(df).mark_line(point=True).encode(
    x="year:N",
    y=col,
    color="source_name:N"
    ).properties(name = col, width="container")
    display(chart)