# Survey Results: Data Cleaning

In [1]:
import pandas as pd 
import altair as alt

from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis import calitp_color_palette as cp


from IPython.display import Markdown, HTML, display_html, display
from IPython.core.display import display

  from IPython.core.display import display


In [2]:
# Show every column when a DataFrame is displayed (the survey frames are wide).
pd.options.display.max_columns = None

In [3]:
def read_in_data(
    sheetname,
    sheet_id="1KwmAzwrl7sKupS8ZX33HzG-SpUSL65sSrW_Ns5a6h3w",
    live_after="2024-07-10 17:30:00",
):
    """Load one tab of the survey Google Sheet and harmonise its column names.

    The tab is downloaded as CSV, its headers are snake-cased with
    ``to_snakecase``, responses at or before ``live_after`` are discarded, the
    respondent e-mail column is removed, and the long question headers are
    renamed to shorter names that are shared across the different tabs.

    Parameters
    ----------
    sheetname : str
        Name of the tab to read. Tab-specific renames exist for
        "Streetlight_and_Replica", "Streetlight" and "Replica"; any other
        name only receives the common renames.
    sheet_id : str, optional
        Identifier of the Google Sheets document. Defaults to the survey
        spreadsheet this notebook was written for.
    live_after : str or datetime-like, optional
        Only responses with a timestamp strictly later than this are kept.
        The default is when the survey went live, which removes the test
        responses the team sent in.

    Returns
    -------
    pandas.DataFrame
        The filtered responses with renamed columns.

    Raises
    ------
    KeyError
        If the sheet has no ``timestamp`` or ``caltrans_email`` column after
        snake-casing.
    """
    # Imported locally so the function brings its own dependency along.
    from urllib.parse import quote

    # Percent-encode the tab name so names with spaces or reserved characters
    # still produce a valid query string.
    encoded_sheet = quote(str(sheetname), safe="")
    url = (
        f"https://docs.google.com/spreadsheets/d/{sheet_id}"
        f"/gviz/tq?tqx=out:csv&sheet={encoded_sheet}"
    )
    df = to_snakecase(pd.read_csv(url))

    # Parse the timestamp column so it can be compared against the cutoff.
    df["timestamp"] = pd.to_datetime(df["timestamp"])

    # Keep only responses submitted after the survey went live.
    df = df[df["timestamp"] > live_after]

    # Remove the e-mail address attached to each response.
    df = df.drop(columns="caltrans_email")

    # Renames attempted for every tab; headers that are absent are ignored by
    # DataFrame.rename.
    common_renames = {
        "what_are_you_typically_using_the_platforms_for?_what_types_of_analyses_are_you_running?_\n_ex__to_gather_speed_data,_downloading_segment_analysis_data_for_additional_analysis_outside_the_platform_": "what_types_of_analyses_are_you_running?",
        "do_you_have_any_success_stories_that_you_would_like_to_share?_or_stories_on_how_you_applied_the_data_to_projects?": "do_you_have_any_success_stories_that_you_would_like_to_share?",
        "what_district_division_of_caltrans_are_you_in?_": "what_district_division_of_caltrans_are_you_in?",
        "what_sorts_of_challenges_are_you_facing?_": "what_sorts_of_challenges_are_you_facing?",
        "what_are_the_common_modes_of_transportation_you_are_running_analyses_for?": "what_are_the_common_modes_of_transportation_you_are_running?",
        "have_you_utilized_the_resources_training_streetlight_and_replica_provide?": "have_you_utilized_the_resources_training?",
    }

    # Additional renames that only apply to one specific tab.
    renames_by_sheet = {
        "Streetlight_and_Replica": {
            "would_you_like_to_be_part_of_the_caltrans_big_data_user_group_for_replica_and_streetlight?\nthis_user_group_would_be_internal_just_for_caltrans_and_would_meet_quarterly__": "would_you_like_to_be_part_of_the_user_groups?",
            "what_sorts_of_challenges_are_you_facing?_\n_can_be_for_just_one_or_both_platforms_": "what_sorts_of_challenges_are_you_facing?",
        },
        "Streetlight": {
            "would_you_like_to_be_part_of_the_caltrans_big_data_user_group_for_streetlight?\nthis_user_group_would_be_internal_just_for_caltrans_and_would_meet_quarterly__": "would_you_like_to_be_part_of_the_user_groups?",
            "what_are_you_typically_using_the_platforms_data_for?\n_ex__to_gather_speed_data,_downloading_segment_analysis_data_for_additional_analysis_outside_the_platform_": "what_types_of_analyses_are_you_running?",
            "have_you_utilized_the_resources_training_streetlight_provides?": "have_you_utilized_the_resources_training?",
            "if_yes,_is_there_a_particular_reason_why_you_gravitated_towards_streetlight?": "if_you_do_gravitate_towards_one_platform_more,_could_you_briefly_explain_why?",
        },
        "Replica": {
            "would_you_like_to_be_part_of_the_caltrans_big_data_user_group_for_replica?\nthis_user_group_would_be_internal_just_for_caltrans_and_would_meet_quarterly__": "would_you_like_to_be_part_of_the_user_groups?",
            "have_you_utilized_the_resources_training_replica_provides?": "have_you_utilized_the_resources_training?",
        },
    }

    # Apply the common renames first, then the tab-specific ones (same order
    # as before); an unknown tab name falls back to no extra renames.
    df = df.rename(columns=common_renames)
    df = df.rename(columns=renames_by_sheet.get(sheetname, {}))

    return df

In [4]:
# Responses from the "Streetlight_and_Replica" tab of the survey sheet.
both = read_in_data("Streetlight_and_Replica")

In [5]:
# Responses from the "Streetlight" tab of the survey sheet.
sl = read_in_data("Streetlight")

In [6]:
# Number of rows returned for the "Streetlight_and_Replica" tab.
len(both)

12

In [7]:
# Number of rows returned for the "Streetlight" tab.
len(sl)

20

In [8]:
## Replica-specific questions from the combined survey.
## This notebook only looks at Streetlight, so these columns get removed.
cols_to_drop = [
    "how_do_you_like_working_with_replica?",
    "any_additional_comments_on_working_with_replica?",
    "how_comfortable_are_you_with_using_replicas_data?",
    "any_additional_comments_on_replicas_data?",
]

In [9]:
## remove the Replica-only columns listed in cols_to_drop
both = both.drop(columns=cols_to_drop)

In [10]:
# Peek at one response; the seed is fixed so a fresh Restart & Run All
# displays the same row every time.
both.sample(n=1, random_state=0)

Unnamed: 0,timestamp,what_district_division_of_caltrans_are_you_in?,how_often_do_you_use_the_streetlight_and_or_replica_platforms?,"if_you_do_use_both_platforms,_is_there_one_that_you_gravitate_towards_more?","if_you_do_gravitate_towards_one_platform_more,_could_you_briefly_explain_why?","if_you_use_one_platform_more_than_the_other,_what_would_you_say_is_the_breakdown_between_the_two?",how_do_you_like_working_with_streetlight?,any_additional_comments_on_working_with_streetlight?,how_comfortable_are_you_with_using_streetlights_data?,any_additional_comments_on_streetlights_data?,what_are_the_common_modes_of_transportation_you_are_running?,what_types_of_analyses_are_you_running?,do_you_have_any_success_stories_that_you_would_like_to_share?,are_there_common_challenges_you_encounter_when_running_analyses?,what_sorts_of_challenges_are_you_facing?,have_you_utilized_the_resources_training?,would_you_like_to_be_part_of_the_user_groups?,any_other_comments_or_feedback_you_would_like_to_provide?
12,2024-07-11 17:08:09,HQ Traffic Ops,On an ad hoc basis,"No, I use both platforms around the same amount.","For large-scale analysis/data request, Replica...",30% Streetlight / 70% Replica,Somewhat Satisfied,Too slow to generate analysis results for larg...,Confident,Turning movement counts data are not reliable.,"Auto, Freight, Transit","OD Vehicular/freight trips, VMT analysis",,Sometimes,Streetlight zones,"Yes, I have gone through the training","No, please do not add me to the User Groups at...",


## Merging the Data

In [11]:
# Outer merge with no explicit keys: pandas joins on every column name the two
# frames have in common and keeps the rows from both sides.
# NOTE(review): if the intent is simply to stack the two response sets,
# pd.concat may express that more directly — confirm before changing, since
# row order and de-duplication would differ.
all_sl = both.merge(sl, how="outer")

In [12]:
def title_column_names(df):
    """Return a copy of ``df`` whose column labels are presentation-ready.

    Each label is title-cased and its underscores are replaced with spaces,
    e.g. ``"how_often?"`` becomes ``"How Often?"``. The frame passed in is
    left unchanged, so the cell calling this can be re-run safely and the
    snake_case frame stays available.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be prettified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and the relabelled columns.
    """
    # str() guards against non-string labels (e.g. integer column names).
    return df.rename(columns=lambda label: str(label).title().replace("_", " "))

In [13]:
# Merged responses with human-readable column labels, used for charting.
all_sl_title = title_column_names(all_sl)

In [14]:
# List the relabelled columns to pick which ones to chart.
all_sl_title.columns

Index(['Timestamp', 'What District Division Of Caltrans Are You In?',
       'How Often Do You Use The Streetlight And Or Replica Platforms?',
       'If You Do Use Both Platforms, Is There One That You Gravitate Towards More?',
       'If You Do Gravitate Towards One Platform More, Could You Briefly Explain Why?',
       'If You Use One Platform More Than The Other, What Would You Say Is The Breakdown Between The Two?',
       'How Do You Like Working With Streetlight?',
       'Any Additional Comments On Working With Streetlight?',
       'How Comfortable Are You With Using Streetlights Data?',
       'Any Additional Comments On Streetlights Data?',
       'What Are The Common Modes Of Transportation You Are Running?',
       'What Types Of Analyses Are You Running?',
       'Do You Have Any Success Stories That You Would Like To Share?',
       'Are There Common Challenges You Encounter When Running Analyses?',
       'What Sorts Of Challenges Are You Facing?',
       'Have You Util

In [19]:
# Survey questions that get one bar chart each (labels must match the
# title-cased column names of all_sl_title exactly).
columns_to_chart = [
    "How Often Do You Use The Streetlight And Or Replica Platforms?",
    "If You Do Gravitate Towards One Platform More, Could You Briefly Explain Why?",
    "How Do You Like Working With Streetlight?",
    "How Comfortable Are You With Using Streetlights Data?",
    "Are There Common Challenges You Encounter When Running Analyses?",
    "Have You Utilized The Resources Training?",
    "Are You Aware Of Replica, The Other Big Data Platform Caltrans Has A Subscription To?",
]


In [20]:
## Charting the responses

In [21]:
# One bar chart per selected question: answers on the x axis, number of
# responses on the y axis, bars coloured by answer.
for col in columns_to_chart:
    chart = (
        # Hand Altair only the column being charted, so the rest of the survey
        # table (timestamps, free-text answers) is not passed along with every
        # chart.
        alt.Chart(all_sl_title[[col]])
        .mark_bar()
        .encode(
            x=alt.X(col),
            y="count()",
            color=alt.Color(
                col,
                scale=alt.Scale(range=cp.CALITP_DIVERGING_COLORS),
            ),
        )
        .properties(title=col, width=800, height=300)
    )
    display(chart)