# Survey Results: Data Cleaning

In [1]:
import pandas as pd 
import altair as alt

from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis import calitp_color_palette as cp


from IPython.display import Markdown, HTML, display_html, display
from IPython.core.display import display

from siuba import *

import ast
import numpy as np

import _utils

  from IPython.core.display import display


In [2]:
pd.set_option('display.max_columns', None)

In [3]:
## read in the data from google docs where the survey is being collected
sheet_id = "1KwmAzwrl7sKupS8ZX33HzG-SpUSL65sSrW_Ns5a6h3w"
sheet_name = "Replica"
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
df = to_snakecase(pd.read_csv(url))
    
## convert timestamp column in datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])
    
## filter the survey responses to when the survey went live
## this removes the test responses the team sent in
df = df[(df['timestamp'] > '2024-07-10 17:30:00')]

In [4]:
### Read in the Data

In [5]:
both = _utils.read_in_data("Streetlight_and_Replica")
both['survey'] = 'Streetlight_and_Replica'

In [6]:
sl = _utils.read_in_data("Streetlight")
sl['survey'] = 'Streetlight'

In [7]:
rep = _utils.read_in_data("Replica")
rep['survey'] = 'Replica'

In [57]:
len(both)

19

In [58]:
len(sl)

53

In [59]:
both.sample()

Unnamed: 0,timestamp,what_district_division_of_caltrans_are_you_in?,how_often_do_you_use_the_platforms?,"if_you_do_use_both_platforms,_is_there_one_that_you_gravitate_towards_more?","if_you_do_gravitate_towards_one_platform_more,_could_you_briefly_explain_why?","if_you_use_one_platform_more_than_the_other,_what_would_you_say_is_the_breakdown_between_the_two?",how_do_you_like_working_with_streetlight?,any_additional_comments_on_working_with_streetlight?,how_comfortable_are_you_with_using_streetlights_data?,any_additional_comments_on_streetlights_data?,how_do_you_like_working_with_replica?,any_additional_comments_on_working_with_replica?,how_comfortable_are_you_with_using_replicas_data?,any_additional_comments_on_replicas_data?,what_are_the_common_modes_of_transportation_you_are_running?,what_types_of_analyses_are_you_running?,do_you_have_any_success_stories_that_you_would_like_to_share?,are_there_common_challenges_you_encounter_when_running_analyses?,what_sorts_of_challenges_are_you_facing?,have_you_utilized_the_resources_training?,would_you_like_to_be_part_of_the_user_groups?,any_other_comments_or_feedback_you_would_like_to_provide?,survey
22,2024-09-10 11:11:59,"North Region Design, District-3-based",Once every few months,"No, I use both platforms around the same amount.",,50% Streetlight / 50% Replica,Somewhat Disatisfied,,Neutral,Bike Coverage not as extensive as suggested by...,Neutral or No Opinion,,Neutral,,"Bike, Auto",,,Sometimes,Bike traffic coverage does not seem extensive,"Yes, I have gone through the training","No, please do not add me to the User Groups at...",,Streetlight_and_Replica


In [60]:
rep.sample()

Unnamed: 0,timestamp,what_district_division_of_caltrans_are_you_in?,how_often_do_you_use_the_platforms?,how_do_you_like_working_with_replica?,any_additional_comments_on_working_with_replica?,how_comfortable_are_you_with_using_replicas_data?,any_additional_comments_on_replicas_data?,what_are_the_common_modes_of_transportation_you_are_running?,which_types_of_analyses_that_you_typically_use?,what_types_of_analyses_are_you_running?,do_you_have_any_success_stories_that_you_would_like_to_share?,are_there_common_challenges_you_encounter_when_running_analyses?,what_sorts_of_challenges_are_you_facing?,have_you_utilized_the_resources_training?,"are_you_aware_of_streetlight,_the_other_big_data_platform_caltrans_has_a_subscription_to?","if_yes,_is_there_a_particular_reason_why_you_gravitated_towards_replica?",would_you_like_to_be_part_of_the_user_groups?,any_other_comments_or_feedback_you_would_like_to_provide?,survey
5,2024-07-25 14:02:14,12/SSI/Asset Management,On an ad hoc basis,Neutral or No Opinion,,Neutral,Still getting acquainted with it,Auto,AADT Explorer,,,Rarely,,"No, I was not aware of training resources","No, I am not aware",,"No, please do not add me to the User Groups at...",,Replica


In [61]:
all_ = pd.concat([both, sl, rep])

In [62]:
all_ = _utils.add_district_col(all_, "what_district_division_of_caltrans_are_you_in?")


In [63]:
all_ = _utils.add_division_col(all_, "what_district_division_of_caltrans_are_you_in?")

In [64]:
## get list of the columns that have to do with replica. 
## (we are just looking at streetlight)
sl_cols_to_drop = ['how_do_you_like_working_with_replica?',
       'any_additional_comments_on_working_with_replica?',
       'how_comfortable_are_you_with_using_replicas_data?',
       'any_additional_comments_on_replicas_data?']

rep_cols_to_drop = ['how_do_you_like_working_with_streetlight?', 
                   'any_additional_comments_on_working_with_streetlight?',
                   'how_comfortable_are_you_with_using_streetlights_data?',
                   'any_additional_comments_on_streetlights_data?']

In [65]:
## drop replica's columns
all_sl = all_.drop(sl_cols_to_drop, axis=1)

In [66]:
all_sl = all_sl[~(all_sl.survey==("Replica"))]

In [67]:
all_rep = all_.drop(rep_cols_to_drop, axis=1)
all_rep = all_rep[~(all_rep.survey==("Streetlight"))]

In [68]:
all_ = _utils.title_column_names(all_)

In [69]:
all_around_qs = ["District",
    "Division",
    "How Often Do You Use The Platforms?"]

In [70]:
_utils.chart_results(all_, all_around_qs, "Survey")

## Merging the Data for Streetlight

In [71]:
all_sl_title = _utils.title_column_names(all_sl)

In [72]:
# all_sl_title.columns

In [73]:
streetlight_columns_to_chart = [
    "How Do You Like Working With Streetlight?",
    "How Comfortable Are You With Using Streetlights Data?",
    "Are There Common Challenges You Encounter When Running Analyses?",
    "Have You Utilized The Resources Training?",
    "Are You Aware Of Replica, The Other Big Data Platform Caltrans Has A Subscription To?"]


In [74]:
## Charting the responses

In [75]:
_utils.chart_results(all_sl_title, streetlight_columns_to_chart, "Survey")

## Explode a few columns

* 'What Are The Common Modes Of Transportation You Are Running?',
* 'What Types Of Analyses Are You Running?'

In [76]:
### get list of results

In [77]:
analyses = all_>>select(_['What Are The Common Modes Of Transportation You Are Running?'], _.Survey)

In [78]:
analyses = analyses.rename(columns={"What Are The Common Modes Of Transportation You Are Running?":"analyses_types"})

In [79]:
analyses

Unnamed: 0,analyses_types,Survey
4,"Auto, Freight",Streetlight_and_Replica
5,I don't use it at all,Streetlight_and_Replica
7,"Auto, Freight",Streetlight_and_Replica
8,"Auto, Transit",Streetlight_and_Replica
9,Auto,Streetlight_and_Replica
...,...,...
6,"Bike, Pedestrian, Auto, Transit",Replica
7,"Bike, Pedestrian, Auto",Replica
8,"Bike, Pedestrian, Auto, Transit",Replica
9,"Bike, Pedestrian",Replica


In [85]:
analyses.analyses_types.value_counts()

Auto                                                 15
Bike, Pedestrian, Auto                               13
Auto, Freight                                         6
Bike, Pedestrian, Auto, Transit                       6
Bike, Pedestrian                                      6
Bike, Pedestrian, Auto, Freight, Transit              6
Bike, Pedestrian, Auto, Freight                       4
Other                                                 3
Pedestrian, Auto                                      3
Auto, Transit                                         2
Transit                                               2
Freight                                               2
Auto, Freight, Transit                                2
Pedestrian, Freight, Transit                          1
Bike, Pedestrian, Auto, Transit, Other                1
Bike, Auto                                            1
Bike, Auto, Transit                                   1
Bike, Auto, Freight                             

In [81]:
analyses = analyses.replace({'analyses_types': "I don't use it at all"}, np.nan)
analyses = analyses.replace({'analyses_types': "No analysis, I'd use it to evaluate Exist conditions/infrastructure and those impacts on SHOPP planned projects."}, np.nan)


In [82]:
analyses = analyses>>filter(_.analyses_types.notnull())

In [83]:
analyses['analyses_types'] = analyses['analyses_types'].astype("string")


In [84]:
analyses['analyses_types'] = analyses['analyses_types'].replace({"trucks":"Freight", "Truck":"Freight"}, regex=True)
# analyses['analyses_types'] = analyses['analyses_types'].replace({"Truck":"Freight"}, regex=True)
analyses['analyses_types'] = analyses['analyses_types'].replace({"How it relates to housing development and influences value":"Other"}, regex=True)

In [89]:
analyses = _utils.get_dummies_by_type(analyses, "analyses_types")

In [90]:
analyses

Unnamed: 0,analyses_types,Survey,Other,Pedestrian,Transit,Auto,Bike,Freight
4,"Auto, Freight",Streetlight_and_Replica,0,0,0,1,0,1
7,"Auto, Freight",Streetlight_and_Replica,0,0,0,1,0,1
8,"Auto, Transit",Streetlight_and_Replica,0,0,1,1,0,0
9,Auto,Streetlight_and_Replica,0,0,0,1,0,0
10,"Pedestrian, Auto",Streetlight_and_Replica,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...
6,"Bike, Pedestrian, Auto, Transit",Replica,0,1,1,1,1,0
7,"Bike, Pedestrian, Auto",Replica,0,1,0,1,1,0
8,"Bike, Pedestrian, Auto, Transit",Replica,0,1,1,1,1,0
9,"Bike, Pedestrian",Replica,0,1,0,0,1,0


In [91]:
#### this count is giving the wrong answers...

# analyses_ = pd.DataFrame((analyses>>select(_.Transit, _.Freight, _.Other, _.Auto, _.Pedestrian, _.Bike))).transpose().reset_index().rename(columns={'index':'mode_type'})

# ##sum up the number of trips by agency
# analyses_['count'] = analyses_[list(analyses_.columns)].sum(axis=1)
# # analyses_ = analyses_>>select(_.mode_type, _.count)


In [92]:
# analyses_

In [93]:
test = analyses >> select(_.Survey, _.Transit, _.Freight, _.Other, _.Auto, _.Pedestrian, _.Bike)

In [94]:
test

Unnamed: 0,Survey,Transit,Freight,Other,Auto,Pedestrian,Bike
4,Streetlight_and_Replica,0,1,0,1,0,0
7,Streetlight_and_Replica,0,1,0,1,0,0
8,Streetlight_and_Replica,1,0,0,1,0,0
9,Streetlight_and_Replica,0,0,0,1,0,0
10,Streetlight_and_Replica,0,0,0,1,1,0
...,...,...,...,...,...,...,...
6,Replica,1,0,0,1,1,1
7,Replica,0,0,0,1,1,1
8,Replica,1,0,0,1,1,1
9,Replica,0,0,0,0,1,1


In [95]:
test.agg({'Transit': 'sum', 'Freight': 'sum', 'Other': 'sum', 'Auto': 'sum', 'Pedestrian': 'sum', 'Bike': 'sum'})


Transit       23
Freight       23
Other          4
Auto          62
Pedestrian    41
Bike          40
dtype: int64

In [96]:
mode_counts = test.groupby(['Survey'], as_index=False).agg({'Transit': 'sum', 'Freight': 'sum', 'Other': 'sum', 'Auto': 'sum', 'Pedestrian': 'sum', 'Bike': 'sum'})


In [97]:
mode_counts

Unnamed: 0,Survey,Transit,Freight,Other,Auto,Pedestrian,Bike
0,Replica,2,0,0,6,5,5
1,Streetlight,16,15,2,42,27,27
2,Streetlight_and_Replica,5,8,2,14,9,8


In [98]:
mode_df = (mode_counts >> group_by(_.Survey) >> gather('Modes', "Counts", _["Transit":"Bike"]))

In [99]:
type(mode_df)

pandas.core.groupby.generic.DataFrameGroupBy

In [100]:
mode_df= mode_df.apply(pd.DataFrame)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  mode_df= mode_df.apply(pd.DataFrame)


In [101]:
chart = ((alt.Chart(mode_df)
            .mark_bar()
            .encode(
                x=alt.X("Modes:O"),
                 y=alt.Y('Counts:Q'),
                color=alt.Color("Survey", scale=alt.Scale(range = cp.CALITP_SEQUENTIAL_COLORS)
                )
         )
          .properties(title="Types of Modes used in Analyses",
        width=800,
        height=300)))

In [102]:
chart

## Merging the Data for Replica

In [103]:
len(all_rep)

26

In [104]:
all_rep_title = _utils.title_column_names(all_rep)

In [105]:
replica_columns_to_chart = [
    "How Do You Like Working With Replica?",
    "How Comfortable Are You With Using Replicas Data?",
    "Are There Common Challenges You Encounter When Running Analyses?",
    "Have You Utilized The Resources Training?",
    # "Are You Aware Of Streetlight, The Other Big Data Platform Caltrans Has A Subscription To?"
]


In [106]:
all_rep_title>>count(_.Survey)

Unnamed: 0,Survey,n
0,Replica,7
1,Streetlight_and_Replica,19


In [107]:
## Charting the Responses

In [108]:
_utils.chart_results(all_rep_title, replica_columns_to_chart, "Survey")