# Seattle Public School Buses

Daily late buses to and from school are scraped from the website four times per day (2 x for to school, 2 x for from school). This captures initial estimates of late buses, and any corrections tot he late time. This data is added to a database. The current database is incomplete due than a less than robust web scraper. The web scraper has been improved with Regular Expressions and I am to implement data collection with a web server so that we have a complete data set.

The functions below focus on two areas:
* (1) Formatting for the map visualization
* (2) Selecting and cleaning the portion of the database that we are interested in visualizing.

Using these function, a map is generated showing the schools with late buses. The size of the circle marker corresponds to the length of time the buses are late at the particular school. The color of the marker indicates the type of school (elementary, high school, middle school, k-8, other).



In [81]:
# import libraries
import datetime
import pandas as pd
import numpy as np
from datascience import *
import random
from arcgis.gis import GIS

now = datetime.datetime.now()

# import data

# read in schools df
school_df = pd.read_pickle('geocoding_schools/cleaned_school_list.pk1')
# read in late bus df
late_bus_df = pd.read_pickle('scraping_bus_data/bus_data.pk1')

late_bus_df

Unnamed: 0,Month,Day,Year,Bus Number,School,To/From,Time,Unit,Data Taken Month,Data Taken Day,Data Taken Year,Data Taken Hour,Data Taken Minute
0,December,21,2018,3,Adams,from,20,minutes,1,1,2019,20,51
1,December,21,2018,120,Loyal Heights,from,20,minutes,1,1,2019,20,51
2,December,21,2018,630,Queen Anne,from,20,minutes,1,1,2019,20,51
3,December,21,2018,148,Northgate,from,10,minutes,1,1,2019,20,51
4,December,21,2018,799,Gatzert,from,120,minutes,1,1,2019,20,51
5,December,21,2018,462,Washington,from,60,minutes,1,1,2019,20,51
6,December,21,2018,792,Kimball,from,120,minutes,1,1,2019,20,51
7,December,21,2018,402,Cleveland,from,60,minutes,1,1,2019,20,51
8,December,21,2018,624,Fairmount Park,from,120,minutes,1,1,2019,20,51
9,December,21,2018,464,Washington,from,60,minutes,1,1,2019,20,51


## Visualization / Mapping Functions

In [93]:
# color coding function
def color_code_fn(school_type):
    """
    input: school type
    output: color of circle marker
    """
    if school_type == "elementary school":
        color_code = [255,0,125,255]
    elif school_type == "middle school":
        color_code = [255,0,0,255]  
    elif school_type == "high school":
        color_code = [255,125,255,255] 
    elif school_type == 'k-8':
        color_code = [0,0,255,255] 
    else:
        color_code = [0,255,0,255] 
        
    return color_code

# circle diameter function
def circle_diameter_fn(max_diameter, time_amount, late_time_array):
    """
    input: maximum diamter and array of late times of all schools included in visualization
    ouput: scaled diameter of circle markerfor each school
    """
    d = (max_diameter*(time_amount))/(max(late_time_array)/1.5)
    return d

def map_all_schools_fn(school):
    """
    input: school dictionary (including location (geocode), address, school name, school type))
    output: input for map.draw() method -- including geocode, pop_up description, and marker format
    """
    circle_location = school['geocode']['location']
    pop_up_description = {"title" : school['School'], "content" : school['geocode']['address']}
    circle_format = {"type": "esriSMS","style": "esriSMSCircle","color": [0,0,0,125],"size": 4,
                         "angle": 0,"xoffset": 0,"yoffset": 0,
                         "outline": {"color": [0,0,0,155],"width": 0}}
    return [circle_location, pop_up_description, circle_format]

def map_late_schools_fn(school, d, color_code, visualize):
    """
    input: (1) school dictionary (including location (geocode), address, school name, school type))
            (2) marker diameter
            (3) marker color code
    output: input for map.draw() method -- including geocode, pop_up description, and marker format
    """
    
    circle_location = school['geocode']['location']
    pop_up_description = {"title" : school['School'], "content" : "Late: "+str(school[visualize])+" minutes"}
    circle_format = {"type": "esriSMS","style": "esriSMSCircle","color": color_code,
                     "size": d,"angle": 0,"xoffset": 0,"yoffset": 0,
                     "outline": {"color": [0,0,0,155],"width": 1}}
    return [circle_location, pop_up_description, circle_format]

## Functions that select the rows to create late bus DataFrame of interest.

In [83]:
# get df for the buses that are late to school today
def today_late_buses_fn(late_bus_df, to_from):
    """
    input: entire late bus data frame
    output: buses that were late **to** school **today** dataframe 
    """
    # 1/7/19 used as a stand in for today since database isn't up to date
    late_bus_df1 = late_bus_df.loc[late_bus_df['Day'] == str(7)]#str(now.day)]
    late_bus_df2 = late_bus_df1.loc[late_bus_df1['To/From'] == to_from]
    #late_bus_df3 = late_bus_df2.loc[late_bus_df2['Data Taken Hour'] == 7]
    late_bus_df3 = late_bus_df2.drop_duplicates(keep = 'last', subset = ["Bus Number", "Month", "Day", "Year"])
    select_late_bus_df = late_bus_df3
    return select_late_bus_df

def entire_database_fn(late_bus_df, to_from):
    """
    input: entire late bus data frame
    output: buses that were late **to** school dataframe
    """
    #late_bus_df1  = late_bus_df.loc[late_bus_df['Day'] == str(7)]#str(now.day)]
    late_bus_df1 = late_bus_df.loc[late_bus_df['To/From'] == to_from]
    late_bus_df2 = late_bus_df1.drop_duplicates(keep = 'last', subset = ["Bus Number", "Month", "Day", "Year"])
    select_late_bus_df = late_bus_df2
    return select_late_bus_df

## Function that creates DataFrame with one row per school with late buses.

In [84]:
def late_minutes_per_school_fn(select_late_bus_df, school_df):  
    """
    input: select late bus dataframe for school's were interested in dataframe
    output: dataframe with one row per school affected by late bus
        new column includes the total number of minutes of late buses for each school
    """
    # one row per school affected
    late_schools_df = school_df.loc[school_df.get('School').isin(select_late_bus_df.get('School'))]
    
    # number of schools affected
    total_rows = late_schools_df.shape[0]#int(late_schools.size/8)
    print("number of schools with late buses: ", total_rows)
    
    # initialize late_time array
    late_time = (total_rows)*[0]
    count = (total_rows)*[0]
    i = 0
    
    # add minute late column to school df
    for index_schools, row_schools in late_schools_df.iterrows():
        for index_buses, row_buses in select_late_bus_df.iterrows():
            if row_schools['School'] == row_buses['School']:
                count[i] += 1
                late_time[i] += row_buses['Time']
        i+=1
    late_schools_df['Late Time'] = late_time
    late_schools_df['count'] = count
    return late_schools_df

## Function that generates the dataframe for visualization.

In [85]:
def get_important_df_fn(select_late_bus_fn, late_bus_df, school_df, to_from):
    """
    input: (1) the function for narrowing down the database to the rows we care about 
    (ex. entire_database_to_school_fn, today_to_school_late_buses_fn)
        (2) the entire late bus database
        (3) the entire school database
        
    output: the late school dataframe generated by late_minutes_per_school_fn
    """
    # finds the entries in the late_bus_df based on what function is passed (today, all, etc)
    select_late_bus_df = select_late_bus_fn(late_bus_df, to_from)
    
    # finds corresponding late schools 
    # calculates the late time per school and adds to df
    late_schools_df  = late_minutes_per_school_fn(select_late_bus_df, school_df)
    
    return late_schools_df

# Map of schools with late buses.

In [97]:
#visualize = 'Late Time' # show the total number of minutes per school
visualize = 'count' # show the total number of late buses per school

#from arcgis.geocoding import geocoding
my_gis = GIS()
seattle = my_gis.map("Seattle, WA")

# map all schools
for index, school in school_df.iterrows():
    try: 
        [circle_location, pop_up_description, circle_format] = map_all_schools_fn(school)
        seattle.draw(circle_location, pop_up_description, circle_format)
    # some schools don't have an address in the table
    except:
        pass  
    
#----get df with schools that have late buses-----

#df_of_interest_fn = today_late_buses_fn
df_of_interest_fn = entire_database_fn

late_schools_df = get_important_df_fn(df_of_interest_fn, late_bus_df, school_df, "to")

# map schools with late buses, total late time
for index, school in late_schools_df.iterrows():
    color_code = color_code_fn(school['Type'])
    d = circle_diameter_fn(max_diameter = 8, time_amount = school[visualize], 
                           late_time_array = late_schools_df[visualize])
    [circle_location, pop_up_description, circle_format] = map_late_schools_fn(school, d, color_code, visualize)
    seattle.draw(circle_location, pop_up_description, circle_format)
    
seattle

number of schools with late buses:  22


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
