# Exploring the routes

Now that we have pulled all the .json files, we can have a lot of fun analyzing them.

In [76]:
# import all the necessary things 

import json
import dask.bag as db
from itertools import compress
from datetime import datetime
import pytz
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# working directory
# NOTE(review): these are absolute paths on the author's machine; point them at
# your own copy of the data before running. Later cells build file names by
# string concatenation (json_loc + "..."), so both values must stay plain strings.

json_loc =  "//Users//afan//Desktop//Misc//HMW_Transit//prep//int_data//bus_routes//"
cleaned_data = "//Users//afan//Desktop//Misc//HMW_Transit//cleaned_data//"

# time zones!
# Route timestamps are rendered in US Central time further down.

cst = pytz.timezone('US/Central')


Let's try navigating one of these beasts first 

In [30]:
# Load a single arrival-route file so its structure can be explored by hand.
a_str = json_loc + "/'101912002_BELLAIREHS_arr.json"

# Use a context manager so the file handle is closed once the JSON is parsed.
with open(a_str) as f:
    a = json.load(f)

In [None]:
# Top-level keys of the parsed response.
a.keys()

In [None]:
# Check which container type holds the routes.
type(a['routes'])

In [None]:
# Let's just pick the first route and look at it in full.
a['routes'][0]

In [None]:
# Keys available on the first route.
a['routes'][0].keys()

In [None]:
# All legs of the first route.
a['routes'][0]['legs']

In [None]:
# The first leg of the first route (the only leg the later cells read).
a['routes'][0]['legs'][0]

In [None]:
# Keys available on that first leg.
a['routes'][0]['legs'][0].keys()

In [None]:
# The steps that make up the first leg.
a['routes'][0]['legs'][0]['steps']

In [None]:
# Check which container type holds the steps.
type(a['routes'][0]['legs'][0]['steps'])

In [None]:
# Number of steps in the first leg.
len(a['routes'][0]['legs'][0]['steps'])

In [None]:
# The first step of the first leg.
a['routes'][0]['legs'][0]['steps'][0]

In [None]:
# Keys available on that first step.
a['routes'][0]['legs'][0]['steps'][0].keys()

In [None]:
# Travel mode recorded for the first step.
a['routes'][0]['legs'][0]['steps'][0]['travel_mode']

In [None]:
# Steps can nest: the first step carries its own 'steps' list of sub-steps.
a['routes'][0]['legs'][0]['steps'][0]['steps']

In [None]:
# Number of sub-steps inside the first step.
len(a['routes'][0]['legs'][0]['steps'][0]['steps'])

In [None]:
# The third sub-step (index 2) of the first step.
a['routes'][0]['legs'][0]['steps'][0]['steps'][2]

## Figuring out the relevant information

Clearly there is a TON of information here. Let's write out specifically what we want. 

The file we loaded is an arrival (`_arr`) file; going by its name, it covers Bellaire High School.

### 1) What is the latest you can leave to get to the board meeting on time? 

In [None]:
# For every candidate route, show when it departs and how long it takes.
# Only the first leg of each route is read.
for route in a['routes']:
    first_leg = route['legs'][0]
    print(first_leg['departure_time']['text'])
    print(first_leg['duration']['text'])

### 2) What time will you get to the board meeting if you leave as soon as school ends?

In [None]:
# Load the matching departure ("leav") file for the same school.
b_str = json_loc + "/'101912002_BELLAIREHS_leav.json"

# Use a context manager so the file handle is closed once the JSON is parsed.
with open(b_str) as f:
    b = json.load(f)

In [None]:
# For every candidate route, show its arrival timestamp and how long it takes.
# Only the first leg of each route is read.
for route in b['routes']:
    first_leg = route['legs'][0]
    print(first_leg['arrival_time']['value'])
    print(first_leg['duration']['text'])

### 3) How long is the shortest ride? 

In [None]:
# Read the first leg of every route rather than assuming there are exactly four
# routes (the original range(0,4) raised IndexError on files with fewer routes
# and silently ignored any beyond the fourth).
legs = [route['legs'][0] for route in b['routes']]

# Shortest duration as the raw numeric value; None if the file has no routes.
print(min((leg['duration']['value'] for leg in legs), default=None))
# Human-readable durations of all routes, for comparison.
print([leg['duration']['text'] for leg in legs])

### Other potential questions 

4. What is the fewest number of transfers? 
5. Driving distances? Uber prices? 

## DASK TIME!!!

We'll try using a Dask Bag here to parallelize things

In [2]:
# What is the latest time you can leave to arrive at HMW on time?
# Start by reading every arrival file into a Dask bag and checking how many
# records it holds.

filename = json_loc + "*_arr.json"
raw_lines = db.read_text(filename)
arr = raw_lines.map(json.loads)
n_records = arr.count().compute()
print(n_records)

271


In [51]:
def get_value_from_json(temp_file, feature, agg_method):
    """Aggregate one per-route value from a parsed directions response.

    Parameters
    ----------
    temp_file : dict
        Parsed JSON with a 'geocoded_waypoints' list and a 'routes' list.
    feature : str
        Key looked up on the first leg of each route (e.g. 'departure_time');
        its 'value' entry is what gets aggregated.
    agg_method : callable
        Reducer applied to the list of collected values, e.g. ``min`` or ``max``.

    Returns
    -------
    list
        ``[place_id, aggregated_value]`` where ``place_id`` comes from the first
        geocoded waypoint. The value is 0 when the response has no routes, and
        also when no route carries ``feature`` (previously that case raised
        ValueError from min/max on an empty list).
    """
    place_id = temp_file['geocoded_waypoints'][0]['place_id']

    # Only the first leg of each route is inspected. Routes whose first leg
    # lacks the feature are skipped (e.g. walking directions, which have no
    # departure time).
    values = [
        route['legs'][0][feature]['value']
        for route in temp_file['routes']
        if feature in route['legs'][0]
    ]

    # 0 is the existing "nothing usable" sentinel; callers should treat it as
    # missing rather than as a real timestamp.
    if not values:
        return [place_id, 0]

    return [place_id, agg_method(values)]

def get_attribute_from_bag(glob_str, attribute, agg_method):
    """Run ``get_value_from_json`` over every JSON record matched by a glob.

    Parameters
    ----------
    glob_str : str
        Glob pattern passed to ``dask.bag.read_text``; each text line is parsed
        with ``json.loads``.
    attribute : str
        Feature name forwarded to ``get_value_from_json``.
    agg_method : callable
        Reducer forwarded to ``get_value_from_json`` (e.g. ``min`` or ``max``).

    Returns
    -------
    list
        One ``[place_id, value]`` pair per record. The number of records is
        printed as a side effect.
    """
    bag = db.read_text(glob_str).map(json.loads)

    # Materialise the results once and report their count from the list. The
    # mapping is one-to-one, so this prints the same number as a separate
    # bag.count().compute() pass without reading the files twice.
    values = bag.map(get_value_from_json, attribute, agg_method).compute()
    print(len(values))

    return values


In [67]:
# Glob patterns for the arrival and departure route files.
arr_times_str = json_loc + "*_arr.json"
dep_times_str = json_loc + "*_leav.json"

# What's the latest you can leave to get to the meeting on time?
# (max over the departure times of each file's routes)
arr_times = get_attribute_from_bag(arr_times_str, 'departure_time', max)
# NOTE: `arr` was bound to the Dask bag in an earlier cell; from here on it is a DataFrame.
arr = pd.DataFrame(arr_times, columns = ['Place_Id', 'latest_depart_time'])
# NOTE(review): stale debug line. arr_times holds [place_id, time] pairs, so
# iterating it yields lists rather than timestamps and this would fail if re-enabled.
#print([datetime.fromtimestamp(t).astimezone(cst).strftime('%-I:%M %p') for t in arr_times])


271


In [68]:
# What's the earliest you can get to the board meeting if you leave as soon as school ends?
# (min over the arrival times of each file's routes)
dep_times = get_attribute_from_bag(dep_times_str, 'arrival_time', min)
dep = pd.DataFrame(dep_times, columns = ['Place_Id', 'earliest_arrival_time'])

# NOTE(review): stale debug line. dep_times holds [place_id, time] pairs, so
# iterating it yields lists rather than timestamps and this would fail if re-enabled.
#print([datetime.fromtimestamp(t).astimezone(cst).strftime('%-I:%M %p') for t in dep_times])

271


In [69]:
# Combine latest-departure and earliest-arrival times per school. 'Place_Id' is
# the only column the two frames share, so naming it gives the same inner join
# as the implicit default.
# NOTE(review): the displayed result has more rows than the 271 input records,
# which suggests some Place_Id values repeat and multiply rows here; confirm.
test = arr.merge(dep, on='Place_Id', how='inner')

In [70]:
def _format_clock_time(timestamp, tz=cst):
    """Render a POSIX timestamp as a 12-hour clock string (e.g. '3:33 PM') in `tz`."""
    # Passing tz to fromtimestamp converts directly, avoiding the detour through
    # a naive machine-local datetime followed by astimezone().
    # NOTE: '%-I' (hour without zero padding) is a platform-specific strftime
    # extension and is not available on Windows.
    return datetime.fromtimestamp(timestamp, tz).strftime('%-I:%M %p')


# Replace the raw timestamps with display strings.
# NOTE: this overwrites the numeric columns in place, so the cell cannot be
# re-run without rebuilding `test` first.
# NOTE(review): records for which get_value_from_json returned its 0 sentinel
# are formatted as the epoch in Central time rather than flagged as missing.
test['latest_depart_time'] = [_format_clock_time(t) for t in test.latest_depart_time]
test['earliest_arrival_time'] = [_format_clock_time(t) for t in test.earliest_arrival_time]

In [71]:
# Display the merged frame (every row, since display.max_rows was set to None at the top).
test

Unnamed: 0,Place_Id,latest_depart_time,earliest_arrival_time
0,ChIJgaYY2_S-QIYRFmJmKC3LsfU,3:33 PM,4:34 PM
1,ChIJ2fwl0vHBQIYRN_qcsw1UoOc,3:27 PM,5:34 PM
2,ChIJa6zbsNC4QIYRvcP6oBeJ3KM,4:14 PM,4:57 PM
3,ChIJOVxSQWC7QIYRk1trHTtvlCs,3:14 PM,5:34 PM
4,ChIJsxU15YWVQIYR6wn3nxZIJLM,3:18 PM,5:19 PM
...,...,...,...
270,ChIJnzoOyMWwQIYRjnF7i0PDq6A,3:12 PM,4:49 PM
271,ChIJed5dHqPDQIYRn-CBRQWax_A,3:42 PM,4:16 PM
272,ChIJ7Zb80IbBQIYRDNmE11kjQYM,3:33 PM,5:34 PM
273,ChIJv-UZ9b--QIYRF6aziOTP424,3:34 PM,5:04 PM


In [80]:
# Load the school table from the cleaned-data folder, parsing 'End Time' as datetimes on read.
sdg_str = cleaned_data + "school_demo_geo.csv"
sdg = pd.read_csv(sdg_str, parse_dates = ['End Time'])
# Render 'End Time' in the same 'H:MM AM/PM' string form used for the route times.
# NOTE: '%-I' is a platform-specific strftime extension (not available on Windows).
sdg['End Time'] = [t.strftime('%-I:%M %p') for t in sdg['End Time']]

In [81]:
# Attach the route timings to the school table, joining on whatever columns the
# two frames have in common (the pandas default).
# NOTE: the name `all` shadows the Python builtin for the rest of the notebook.
all = pd.merge(sdg, test)

In [175]:
# Show only the schedule-related columns: school end time next to the two route times.
all[['Campus Short Name', 'End Time', 'latest_depart_time', 'earliest_arrival_time']]

Unnamed: 0,Campus Short Name,End Time,latest_depart_time,earliest_arrival_time
0,Austin HS,3:30 PM,3:33 PM,4:34 PM
1,Bellaire HS,4:10 PM,3:27 PM,5:34 PM
2,Northside HS,4:10 PM,4:14 PM,4:57 PM
3,Furr HS,4:10 PM,3:14 PM,5:34 PM
4,Jones HS,3:30 PM,3:18 PM,5:19 PM
5,Kashmere HS,4:10 PM,3:45 PM,5:19 PM
6,Lamar HS,4:10 PM,4:00 PM,5:04 PM
7,Wisdom HS,4:10 PM,3:56 PM,5:05 PM
8,Madison HS,4:10 PM,3:14 PM,5:49 PM
9,Milby HS,3:30 PM,3:02 PM,5:19 PM


In [173]:
# Cut-off the earliest arrival is compared against (presumably the board
# meeting start time; confirm). Parsed once instead of on every iteration.
deadline = datetime.strptime("5:00 PM", '%I:%M %p')

# Parse each arrival string once and reuse it for both derived columns.
arrivals = [datetime.strptime(t, '%I:%M %p') for t in all['earliest_arrival_time']]

# makeable: True when the earliest arrival is strictly before the cut-off.
all['makeable'] = [arrival < deadline for arrival in arrivals]
# gap_makeable: arrival minus cut-off, so negative values mean arriving early.
all['gap_makeable'] = [arrival - deadline for arrival in arrivals]

In [101]:
# NOTE: `datetime` here is the class (from datetime import datetime), so
# `datetime.time` is the instance method and calling it with ints raises the
# TypeError shown below. To build a time of day, use
# `datetime.strptime("5:00 PM", '%I:%M %p').time()` or import `time` from the
# datetime module.
datetime.time(17,00)

TypeError: descriptor 'time' for 'datetime.datetime' objects doesn't apply to a 'int' object

In [102]:
# Sanity check: parsing a clock string with no date fills in 1900-01-01 (see the
# output), so any two times parsed this way can be compared directly.
datetime.strptime("3:33 PM", '%I:%M %p')

datetime.datetime(1900, 1, 1, 15, 33)

In [107]:
# How many rows are makeable vs. not.
all.makeable.value_counts()

True     184
False    100
Name: makeable, dtype: int64

In [111]:
# List the columns of the merged frame. The original referenced a `makeable`
# variable that no cell in the notebook defines (a NameError on a fresh kernel);
# the columns shown in the output are those of `all`.
all.columns

Index(['Unnamed: 0', 'School_Num', 'School_Nam', 'Place_addr', 'Grade_Rang',
       'X', 'Y', 'Campus Short Name', 'End Time', 'CAMPUS', 'CAMPNAME', 'D504',
       'All', 'Asian', 'Attrition', 'Attrition_Denom', 'Black', 'DAEP',
       'Dyslexia', 'Econ_Disadv', 'Female', 'Foster_Care', 'Hispanic',
       'Homeless', 'Immigrant', 'Am_Ind', 'Eng_Learner', 'Male', 'Migrant',
       'Military_Conn', 'Non_Ed_Disadv', 'Pacific_Is', 'At_Risk', 'Title_I',
       'Two_Or_More', 'White', 'DISTNAME', 'DISTRICT', '_merge', 'Place_Id',
       'latest_depart_time', 'earliest_arrival_time', 'makeable'],
      dtype='object')

In [143]:
# For each grade range, compute the share of every numeric column's total that
# falls in the makeable / not-makeable group.
# numeric_only=True keeps string and timedelta columns out of the sums; without
# it, current pandas tries to add those columns too and the division fails.
by_grade_and_makeable = all.groupby(['Grade_Rang', 'makeable']).sum(numeric_only=True)
by_grade = all.groupby('Grade_Rang').sum(numeric_only=True)

# NOTE(review): the ratios for 'X', 'Y' and 'Unnamed: 0' are by-products of
# summing every numeric column and are presumably not meaningful; confirm.
totals = by_grade_and_makeable / by_grade
totals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,All,Am_Ind,Asian,At_Risk,Black,D504,DAEP,Dyslexia,Econ_Disadv,Eng_Learner,Female,Foster_Care,Hispanic,Homeless,Immigrant,Male,Migrant,Military_Conn,Non_Ed_Disadv,Pacific_Is,Title_I,Two_Or_More,Unnamed: 0,White,X,Y,makeable
Grade_Rang,makeable,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
'01-05,False,1.0,,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,
'02-05,True,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,
'03-12,False,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
'04-08,False,1.0,,1.0,1.0,1.0,,,,1.0,1.0,1.0,,1.0,,1.0,1.0,,,1.0,,1.0,,1.0,1.0,1.0,1.0,
'06-08,False,0.758455,0.6875,0.638243,0.82219,0.791493,0.673219,0.824286,0.701493,0.803291,0.853237,0.757939,0.952381,0.781757,0.848611,0.836493,0.758953,0.694444,0.166667,0.558968,0.7,0.78096,0.509524,0.895144,0.545052,0.760905,0.760428,


In [134]:
# Drop the 'makeable' value column (empty in the displayed totals) and turn the
# group keys back into ordinary columns.
totals2 = totals.drop(columns='makeable').reset_index()

In [141]:
# Shares for the '09-12 grade range only. Grade_Rang values carry a leading
# apostrophe (see the displayed output); transpose so each group reads as a column.
totals2[totals2.Grade_Rang == "\'09-12"].T

Unnamed: 0,8,9
Grade_Rang,'09-12,'09-12
makeable,False,True
All,0.725929,0.274071
Am_Ind,0.715686,0.284314
Asian,0.747714,0.252286
At_Risk,0.772172,0.227828
Black,0.796287,0.203713
D504,0.662875,0.337125
DAEP,0.809045,0.190955
Dyslexia,0.72351,0.27649


In [138]:
# Full table of shares for every grade range.
totals2

Unnamed: 0.1,Grade_Rang,makeable,All,Am_Ind,Asian,At_Risk,Black,D504,DAEP,Dyslexia,Econ_Disadv,Eng_Learner,Female,Foster_Care,Hispanic,Homeless,Immigrant,Male,Migrant,Military_Conn,Non_Ed_Disadv,Pacific_Is,Title_I,Two_Or_More,Unnamed: 0,White,X,Y
0,'01-05,False,1.0,,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0
1,'02-05,True,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0
2,'03-12,False,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,'04-08,False,1.0,,1.0,1.0,1.0,,,,1.0,1.0,1.0,,1.0,,1.0,1.0,,,1.0,,1.0,,1.0,1.0,1.0,1.0
4,'06-08,False,0.758455,0.6875,0.638243,0.82219,0.791493,0.673219,0.824286,0.701493,0.803291,0.853237,0.757939,0.952381,0.781757,0.848611,0.836493,0.758953,0.694444,0.166667,0.558968,0.7,0.78096,0.509524,0.895144,0.545052,0.760905,0.760428
5,'06-08,True,0.241545,0.3125,0.361757,0.17781,0.208507,0.326781,0.175714,0.298507,0.196709,0.146763,0.242061,0.047619,0.218243,0.151389,0.163507,0.241047,0.305556,0.833333,0.441032,0.3,0.21904,0.490476,0.104856,0.454948,0.239095,0.239572
6,'06-12,False,0.992219,1.0,1.0,0.983769,0.981081,1.0,0.785714,0.957447,0.991395,0.997866,0.996375,,0.995379,1.0,1.0,0.987805,1.0,,0.997642,1.0,0.992219,1.0,0.559964,0.987342,0.666803,0.66611
7,'06-12,True,0.007781,0.0,0.0,0.016231,0.018919,0.0,0.214286,0.042553,0.008605,0.002134,0.003625,,0.004621,0.0,0.0,0.012195,0.0,,0.002358,0.0,0.007781,0.0,0.440036,0.012658,0.333197,0.33389
8,'09-12,False,0.725929,0.715686,0.747714,0.772172,0.796287,0.662875,0.809045,0.72351,0.745498,0.819768,0.715586,1.0,0.704019,0.872079,0.90958,0.736071,0.735849,0.333333,0.65305,0.702128,0.732423,0.691466,0.701645,0.694215,0.666694,0.666296
9,'09-12,True,0.274071,0.284314,0.252286,0.227828,0.203713,0.337125,0.190955,0.27649,0.254502,0.180232,0.284414,0.0,0.295981,0.127921,0.09042,0.263929,0.264151,0.666667,0.34695,0.297872,0.267577,0.308534,0.298355,0.305785,0.333306,0.333704


In [154]:
# Raw sums (rather than shares) per grade range and makeable flag.
# numeric_only=True keeps string and timedelta columns out of the sums, which
# current pandas would otherwise try to add as well.
temp = all.groupby(['Grade_Rang','makeable']).sum(numeric_only=True).reset_index()
# First 12 fields for the '09-12 grade range, transposed so each group is a column.
temp[temp.Grade_Rang == "\'09-12"].T.head(12)

Unnamed: 0,8,9
Grade_Rang,'09-12,'09-12
makeable,False,True
Unnamed: 0,3029,1288
X,-2671.128545,-1335.398616
Y,832.084348,416.736411
D504,1282.0,652.0
All,36099.0,13629.0
Asian,1390.0,469.0
Black,9049.0,2315.0
DAEP,805.0,190.0


In [156]:
# Hand check of one share for the '09-12 grade range.
# 36099 and 13629 are the 'All' totals for makeable == False / True in the table
# above; 29225 and 9977 are presumably the matching Econ_Disadv totals (not
# visible in the truncated output; confirm).
# The original second line was missing its closing parenthesis (a SyntaxError).
nses_f = (36099.0 - 29225.0)
nses_t = (13629.0 - 9977.0)

In [157]:
# Fraction of the combined total that falls in the makeable (`_t`) group.
nses_t/(nses_t + nses_f)

0.3469504085122554

In [163]:
# Distribution of earliest arrival times among rows in the '09-12 grade range.
all[all.Grade_Rang == "\'09-12"].earliest_arrival_time.value_counts()

5:34 PM    12
4:34 PM     6
5:04 PM     5
5:19 PM     4
5:49 PM     4
4:49 PM     4
4:19 PM     2
6:04 PM     2
4:57 PM     1
5:05 PM     1
4:51 PM     1
Name: earliest_arrival_time, dtype: int64

In [164]:
# First few rows of the schedule-related columns.
all[['Campus Short Name', 'End Time', 'latest_depart_time', 'earliest_arrival_time']].head()

Unnamed: 0,Campus Short Name,End Time,latest_depart_time,earliest_arrival_time
0,Austin HS,3:30 PM,3:33 PM,4:34 PM
1,Bellaire HS,4:10 PM,3:27 PM,5:34 PM
2,Northside HS,4:10 PM,4:14 PM,4:57 PM
3,Furr HS,4:10 PM,3:14 PM,5:34 PM
4,Jones HS,3:30 PM,3:18 PM,5:19 PM


In [174]:
# Distribution of the arrival-minus-5:00-PM gap for the '09-12 grade range.
# Negative gaps (arriving before 5:00 PM) print in pandas' timedelta form, e.g.
# '-1 days +23:34:00' means 26 minutes early.
all[all.Grade_Rang == "\'09-12"].gap_makeable.value_counts()

0 days 00:34:00      12
-1 days +23:34:00     6
0 days 00:04:00       5
0 days 00:19:00       4
0 days 00:49:00       4
-1 days +23:49:00     4
-1 days +23:19:00     2
0 days 01:04:00       2
-1 days +23:57:00     1
0 days 00:05:00       1
-1 days +23:51:00     1
Name: gap_makeable, dtype: int64