## Origin Destination Big Data Analysis

In [1]:
import gcsfs as fs
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis import get_fs, utils
from calitp_data_analysis.sql import to_snakecase
from siuba import *

# Rebinds `fs`: the name was bound to the gcsfs module by `import gcsfs as fs`
# above and from here on refers to whatever `get_fs()` returns instead.
fs = get_fs()

In [2]:
import altair as alt


In [3]:
import replica_utils

In [4]:
# Show every column when a wide frame is displayed (None = no column limit).
pd.options.display.max_columns = None

In [5]:
# Names of the Replica export files used by this notebook.
# Geometry layer (zipped) for the origin areas.
shape_data_name = (
    "replica-hta_transit_calpolyhumboldt-10_22_25-"
    "origin_layer_Humb.zip"
)

# Trip tables: one export with the campus as destination, one with it as origin.
to_cp_file_name = (
    "replica-hta_transit_calpolyhumboldt-10_22_25-"
    "trips_dataset_dest_as_CPH.csv"
)
from_cp_file_name = (
    "replica-hta_transit_calpolyhumboldt-10_22_25-"
    "trips_dataset_origin_as_CPH.csv"
)

In [6]:
# Load both trip tables with the shared helper; each call gets the trip CSV,
# the geometry layer, and a `file_type` tag naming the direction.
to_cp, from_cp = (
    replica_utils.read_in_and_prep_replica_data(
        csv_name, shape_data_name, file_type=direction
    )
    for csv_name, direction in (
        (to_cp_file_name, "to_cp"),
        (from_cp_file_name, "from_cp"),
    )
)

  df = to_snakecase( pd.read_csv(f"{gcs_path}{file_name}"))
  df = to_snakecase( pd.read_csv(f"{gcs_path}{file_name}"))


In [7]:
# Number of rows in the to-campus trip table.
to_cp.shape[0]

16657

In [8]:
# Number of rows in the from-campus trip table.
from_cp.shape[0]

16623

In [9]:
# Collect both trip tables in one list; it is passed to
# replica_utils.return_score_summary in a later cell.
df_list = [to_cp, from_cp]

In [10]:
# Label each table with its direction of travel. The column is added in place,
# so the same objects already held in `df_list` carry the new `trip_type`
# column as well.
to_cp['trip_type'] = 'Traveling To Cal Poly'
from_cp['trip_type'] = 'Traveling From Cal Poly'

In [20]:
# Build the summary table from both trip tables.
# NOTE(review): return_score_summary lives in replica_utils and is not shown
# here; presumably it relies on the `trip_type` column set in the previous
# cell — confirm.
summary = replica_utils.return_score_summary(df_list)

In [26]:
# Display the summary table.
# NOTE(review): this cell's saved execution count (26) is later than those of
# the title-casing (22) and rename (24) cells that follow it, so the saved
# output shows the already-renamed headers. A top-to-bottom run displays the
# headers exactly as return_score_summary produced them.
summary

Unnamed: 0,Trip Type,Total Trips,Number of Auto Trips,Pct Auto Trips,Number of Tranist Trips,Pct Transit Trips,Auto Mean Min,Auto Median Min,Auto Max Min,Auto Mean Miles,Auto Median Miles,Auto Max Miles,Transit Mean Min,Transit Median Min,Transit Max Min,Transit Mean Miles,Transit Median Miles,Transit Max Miles
0,Traveling To Cal Poly,16657,9544,0.572972,37,0.002221,12.67938,5.0,413,9.585425,2.0,393.5,30.054054,22.0,75,5.051351,2.6,14.1
1,Traveling From Cal Poly,16623,9179,0.552187,22,0.001323,14.812834,5.0,530,11.49537,1.9,549.1,25.409091,21.0,68,5.75,4.3,15.7


In [22]:
# Title-case every column label of `summary` in place.
# NOTE(review): not idempotent together with the rename cell below — if this
# cell is re-run after the rename, str.title() turns e.g.
# "Number of Auto Trips" into "Number Of Auto Trips", a label the later melt
# cells do not list.
summary.columns = summary.columns.str.title()

In [27]:
# List the current column labels of `summary`.
summary.columns

Index(['Trip Type', 'Total Trips', 'Number of Auto Trips', 'Pct Auto Trips',
       'Number of Tranist Trips', 'Pct Transit Trips', 'Auto Mean Min',
       'Auto Median Min', 'Auto Max Min', 'Auto Mean Miles',
       'Auto Median Miles', 'Auto Max Miles', 'Transit Mean Min',
       'Transit Median Min', 'Transit Max Min', 'Transit Mean Miles',
       'Transit Median Miles', 'Transit Max Miles'],
      dtype='object')

In [24]:
# Swap the underscore-separated labels for the human-readable headers that the
# reshaping and chart cells refer to. Labels not listed here stay unchanged.
summary = summary.rename(
    {
        # trip counts and shares
        "Trip_Type": "Trip Type",
        "Total_Trips": "Total Trips",
        "N_Auto_Trips": "Number of Auto Trips",
        "Pct_Auto_Trips": "Pct Auto Trips",
        "N_Tranist_Trips": "Number of Tranist Trips",
        "Pct_Transit_Trips": "Pct Transit Trips",
        # trip duration
        "Auto_Mean_Minutes": "Auto Mean Min",
        "Auto_Median_Minutes": "Auto Median Min",
        "Auto_Max_Minutes": "Auto Max Min",
        "Transit_Mean_Minutes": "Transit Mean Min",
        "Transit_Median_Minutes": "Transit Median Min",
        "Transit_Max_Minutes": "Transit Max Min",
        # trip distance
        "Auto_Mean_Miles": "Auto Mean Miles",
        "Auto_Median_Miles": "Auto Median Miles",
        "Auto_Max_Miles": "Auto Max Miles",
        "Transit_Mean_Miles": "Transit Mean Miles",
        "Transit_Median_Miles": "Transit Median Miles",
        "Transit_Max_Miles": "Transit Max Miles",
    },
    axis="columns",
)

In [35]:
# Reshape the wide summary table to long form: one row per
# (Trip Type, Metric) pair, with the measurement in the default "value" column.
#
# "Trip Type" is the identifier column, so it is listed only in `id_vars`.
# Listing it in `value_vars` as well is redundant and leaves the outcome up to
# how the installed pandas version handles a label requested twice.
summary_melt = pd.melt(
    summary,
    id_vars=["Trip Type"],
    value_vars=[
        "Total Trips",
        "Number of Auto Trips",
        "Pct Auto Trips",
        "Number of Tranist Trips",  # spelling matches the column label in `summary`
        "Pct Transit Trips",
        "Auto Mean Min",
        "Auto Median Min",
        "Auto Max Min",
        "Auto Mean Miles",
        "Auto Median Miles",
        "Auto Max Miles",
        "Transit Mean Min",
        "Transit Median Min",
        "Transit Max Min",
        "Transit Mean Miles",
        "Transit Median Miles",
        "Transit Max Miles",
    ],
    var_name="Metric",  # new column holding the original column labels
)

In [63]:
# Long-form table of the distance statistics only: one row per
# (Trip Type, Metric) with the figure in a "Miles" column.
summary_long_all_miles = summary.melt(
    id_vars=["Trip Type"],
    value_vars=[
        "Auto Mean Miles",
        "Auto Median Miles",
        "Auto Max Miles",
        "Transit Mean Miles",
        "Transit Median Miles",
        "Transit Max Miles",
    ],
    var_name="Metric",  # holds the original column labels
    value_name="Miles",
)

In [62]:
# Long-form table of the duration statistics only: one row per
# (Trip Type, Metric) with the figure in the value column.
# The value column label is spelled "Mintues" on purpose here: the chart cell
# that follows reads the field under exactly that name.
summary_long_all_min = summary.melt(
    id_vars=["Trip Type"],
    value_vars=[
        "Auto Mean Min",
        "Auto Median Min",
        "Auto Max Min",
        "Transit Mean Min",
        "Transit Median Min",
        "Transit Max Min",
    ],
    var_name="Metric",  # holds the original column labels
    value_name="Mintues",
)

In [59]:
# Horizontal bars of the duration statistics, one facet row per trip type.
# The field is still read as "Mintues" (the column label produced by the melt
# cell above), but the axis is given an explicit, correctly spelled title.
# The chart title had stray brackets ("Trip[ Type & Mode]"); it now parallels
# the distance chart's title.
alt.Chart(summary_long_all_min).mark_bar().encode(
    x=alt.X("Mintues:Q", title="Minutes"),
    y="Metric:O",
    color="Metric:N",
    row="Trip Type:O",
).properties(title="Travel Time by Trip Type & Mode")

In [52]:
# Horizontal bars of the distance statistics, one facet row per trip type.
(
    alt.Chart(summary_long_all_miles)
    .mark_bar()
    .encode(
        alt.X("Miles:Q"),
        alt.Y("Metric:O"),
        alt.Color("Metric:N"),
        alt.Row("Trip Type:O"),
    )
    .properties(title="Travel Distance by Trip Type & Mode")
)

In [18]:
# Run the mode-split helper on each direction (from-campus first, then
# to-campus), then stack the two results with the to-campus rows on top.
from_cp_mode, to_cp_mode = map(replica_utils.get_mode_split, (from_cp, to_cp))

modes_breakdown = pd.concat([to_cp_mode, from_cp_mode], axis=0)

In [19]:
# Stacked bars: total trips per trip type, coloured by travel mode.
(
    alt.Chart(modes_breakdown)
    .mark_bar(size=150)
    .encode(
        alt.X("df_name:O", title="Trip Type"),
        alt.Y("total_trips:Q", title="Total Number of Trips"),
        alt.Color("mode:N"),
    )
    .properties(width=600, height=300)
)


In [20]:
# Share of trips per trip type, one facet column per travel mode.
(
    alt.Chart(modes_breakdown)
    .mark_bar()
    .encode(
        alt.X("df_name:O", title="Trip Type"),
        alt.Y("pct_trips:Q", title="Pct of Total Trips"),
        alt.Color("df_name:N", legend=alt.Legend(title="Trip Type")),
        alt.Column("mode:N", title="Mode"),
    )
    .properties(width=100, height=400)
)

In [66]:
# Peek at one randomly chosen row. No random_state is passed, so a different
# row is shown on each run.
to_cp.sample()

Unnamed: 0,activity_id,origin_bgrp_2020,origin_trct_2020,origin_cty_2020,origin_st_2020,destination_bgrp_2020,destination_trct_2020,destination_cty_2020,destination_st_2020,primary_mode,trip_purpose,previous_trip_purpose,trip_start_time,trip_end_time,trip_duration_minutes,trip_distance_miles,vehicle_type,vehicle_fuel_type,transit_submode,transit_agency,transit_route,origin_land_use,origin_building_use,destination_land_use,destination_building_use,trip_taker_person_id,trip_taker_household_id,trip_taker_age,trip_taker_sex,trip_taker_race_ethnicity,trip_taker_employment_status,trip_taker_wfh,trip_taker_individual_income,trip_taker_commute_mode,trip_taker_household_size,trip_taker_household_income,trip_taker_available_vehicles,trip_taker_resident_type,trip_taker_industry,trip_taker_building_type,trip_taker_school_grade_attending,trip_taker_education,trip_taker_tenure,trip_taker_language,trip_taker_home_bgrp_2020,trip_taker_home_trct_2020,trip_taker_home_cty_2020,trip_taker_home_st_2020,trip_taker_work_bgrp_2020,trip_taker_work_trct_2020,trip_taker_work_cty_2020,trip_taker_work_st_2020,geometry,trip_type
6330,10867426458158676902,"2 (Tract 10.02, Humboldt, CA)","10.02 (Humboldt, CA)","Humboldt County, CA",California,"2 (Tract 10.02, Humboldt, CA)","10.02 (Humboldt, CA)","Humboldt County, CA",California,other_travel_mode,shop,shop,09:39:00,09:39:00,0,0.0,,unknown_fuel_type,,,,retail,retail,retail,retail,3573634880113782192,14856023507623033788,18.0,male,asian_not_hispanic_or_latino,not_in_labor_force,unemployed_under_16_not_in_labor_force,0.0,other_travel_mode,1,0.0,unknown_num_vehicles,core,not_working,GQ_structure,undergraduate,high_school,GQ,GQ_language,"1 (Tract 12, Humboldt, CA)","12 (Humboldt, CA)","Humboldt County, CA",California,"2 (Tract 10.02, Humboldt, CA)","10.02 (Humboldt, CA)","Humboldt County, CA",California,"POLYGON ((-124.08252 40.87597, -124.08249 40.8...",Traveling To Cal Poly


In [77]:
# Number of rows per `origin_bgrp_2020` value, using siuba's `count` verb
# (the tally lands in a column named `n`).
(to_cp>>count(_.origin_bgrp_2020))

Unnamed: 0,origin_bgrp_2020,n
0,"1 (Tract 1, Humboldt, CA)",429
1,"1 (Tract 1.01, Del Norte, CA)",1
2,"1 (Tract 1.06, Kern, CA)",1
3,"1 (Tract 10.01, Humboldt, CA)",976
4,"1 (Tract 10.02, Humboldt, CA)",546
...,...,...
199,"5 (Tract 3, Humboldt, CA)",43
200,"5 (Tract 6, Humboldt, CA)",49
201,"5 (Tract 8003.38, Los Angeles, CA)",1
202,"6 (Tract 2, Humboldt, CA)",23


In [86]:
# Peek at one randomly chosen row again; unseeded, so the row varies per run.
to_cp.sample()

Unnamed: 0,activity_id,origin_bgrp_2020,origin_trct_2020,origin_cty_2020,origin_st_2020,destination_bgrp_2020,destination_trct_2020,destination_cty_2020,destination_st_2020,primary_mode,trip_purpose,previous_trip_purpose,trip_start_time,trip_end_time,trip_duration_minutes,trip_distance_miles,vehicle_type,vehicle_fuel_type,transit_submode,transit_agency,transit_route,origin_land_use,origin_building_use,destination_land_use,destination_building_use,trip_taker_person_id,trip_taker_household_id,trip_taker_age,trip_taker_sex,trip_taker_race_ethnicity,trip_taker_employment_status,trip_taker_wfh,trip_taker_individual_income,trip_taker_commute_mode,trip_taker_household_size,trip_taker_household_income,trip_taker_available_vehicles,trip_taker_resident_type,trip_taker_industry,trip_taker_building_type,trip_taker_school_grade_attending,trip_taker_education,trip_taker_tenure,trip_taker_language,trip_taker_home_bgrp_2020,trip_taker_home_trct_2020,trip_taker_home_cty_2020,trip_taker_home_st_2020,trip_taker_work_bgrp_2020,trip_taker_work_trct_2020,trip_taker_work_cty_2020,trip_taker_work_st_2020,geometry,trip_type
5661,5896282829291159807,"2 (Tract 10.02, Humboldt, CA)","10.02 (Humboldt, CA)","Humboldt County, CA",California,"2 (Tract 10.02, Humboldt, CA)","10.02 (Humboldt, CA)","Humboldt County, CA",California,walking,eat,home,17:21:00,17:23:03,2,0.1,unknown_vehicle_type,unknown_fuel_type,,,,education,education,retail,retail,7701630238941121016,10739218317667016655,21.0,female,two_races_not_hispanic_or_latino,employed,remote,12234.0,worked_from_home,1,0.0,unknown_num_vehicles,core,naics61,GQ_structure,undergraduate,some_college,GQ,GQ_language,"2 (Tract 10.02, Humboldt, CA)","10.02 (Humboldt, CA)","Humboldt County, CA",California,"2 (Tract 10.02, Humboldt, CA)","10.02 (Humboldt, CA)","Humboldt County, CA",California,"POLYGON ((-124.08252 40.87597, -124.08249 40.8...",Traveling To Cal Poly


In [102]:
# Count of distinct primary modes per origin block group; the geometry column
# is part of the group key so it is carried into the result.
(
    to_cp.groupby(["origin_bgrp_2020", "geometry"])
    .agg(primary_mode=("primary_mode", "nunique"))
    .reset_index()
)

Unnamed: 0,origin_bgrp_2020,geometry,primary_mode
0,"1 (Tract 1, Humboldt, CA)","POLYGON ((-124.19092 40.79074, -124.18803 40.7...",5
1,"1 (Tract 1.01, Del Norte, CA)","POLYGON ((-124.22684 41.74177, -124.21122 41.7...",1
2,"1 (Tract 1.06, Kern, CA)","POLYGON ((-119.08414 35.44114, -119.08171 35.4...",1
3,"1 (Tract 10.01, Humboldt, CA)","POLYGON ((-124.09095 40.87607, -124.09074 40.8...",7
4,"1 (Tract 10.02, Humboldt, CA)","POLYGON ((-124.08294 40.86648, -124.08282 40.8...",5
...,...,...,...
198,"5 (Tract 2, Humboldt, CA)","POLYGON ((-124.17969 40.78902, -124.17821 40.7...",3
199,"5 (Tract 3, Humboldt, CA)","POLYGON ((-124.16696 40.76600, -124.16586 40.7...",4
200,"5 (Tract 6, Humboldt, CA)","POLYGON ((-124.15340 40.78609, -124.15226 40.7...",4
201,"5 (Tract 8003.38, Los Angeles, CA)","POLYGON ((-118.81487 34.15226, -118.81250 34.1...",1


In [103]:
# Distinct trips (by `activity_id`) per origin block group, leaving out rows
# whose primary mode is "other_travel_mode". The geometry column is part of
# the group key and is then registered as the active geometry.
n_trips = (
    (to_cp >> filter(_.primary_mode != "other_travel_mode"))
    .groupby(["origin_bgrp_2020", "geometry"])
    .agg({"activity_id": "nunique"})
    .reset_index()
    .set_geometry("geometry")
)

In [108]:
# Sort the block groups by trip count, largest first. `activity_id` here holds
# the nunique count computed in the previous cell, not an individual id.
n_trips>>arrange(-_.activity_id)

Unnamed: 0,origin_bgrp_2020,geometry,activity_id
62,"2 (Tract 10.02, Humboldt, CA)","POLYGON ((-124.08252 40.87597, -124.08249 40.8...",3377
24,"1 (Tract 12, Humboldt, CA)","POLYGON ((-124.08173 40.87997, -124.08161 40.8...",1287
61,"2 (Tract 10.01, Humboldt, CA)","POLYGON ((-124.09306 40.86176, -124.09297 40.8...",1285
2,"1 (Tract 10.01, Humboldt, CA)","POLYGON ((-124.09095 40.87607, -124.09074 40.8...",976
130,"3 (Tract 11.02, Humboldt, CA)","POLYGON ((-124.09186 40.87901, -124.09184 40.8...",697
...,...,...,...
148,"3 (Tract 7.01, Tehama, CA)","POLYGON ((-122.23368 40.16985, -122.22858 40.1...",1
149,"3 (Tract 73.01, Sacramento, CA)","POLYGON ((-121.42350 38.65131, -121.42350 38.6...",1
151,"3 (Tract 84.03, Sacramento, CA)","POLYGON ((-121.17742 38.66997, -121.17717 38.6...",1
153,"3 (Tract 90.11, Sacramento, CA)","POLYGON ((-121.28590 38.57349, -121.28589 38.5...",1


In [106]:
# Map the origin block groups, coloured by the trip count held in
# `activity_id`.
n_trips.explore(column="activity_id")