## Creating Visualizations

##### The following series of cells will show feature engineering resulting in a heat map and histogram for a given stop over a specific range of time

In [18]:
!pip install mysql.connector;

Processing /root/.cache/pip/wheels/8c/83/a1/f8b6d4bb1bd6208bbde1608bbfa7557504bed9eaf2ecf8c175/mysql_connector-2.2.9-cp36-cp36m-linux_x86_64.whl
Installing collected packages: mysql.connector
Successfully installed mysql.connector


In [0]:
import mysql.connector
from mysql.connector import Error
import requests
import pandas as pd

In [0]:
# Credentials to access the datase should all be provided to you by your TL

connection = mysql.connector.connect(host='',
                                     database='',
                                     user='',
                                     password='')


In [0]:
# Database call 
# The following vars provide the route id (rid) and direction (i_or_o)
# Route id can be found with the following
'''
stops['route_id'].values()
'''

rid = '1'
i_or_o = '%_I_%'

cur = connection.cursor()

# Query to use when the complete new table has been added to the DF
# sql_query = ('''SELECT id, datetime, rid, vid, secs, kph, head, lat, lon, dir, stop_lat, stop_lon, stop_id
#                 FROM historic_location_stops
#                 WHERE rid = %s
#                 AND 
#                 dir LIKE %s;
#                 ''')

# Query to use with temporary DF for single day analysis
sql_query = ('''SELECT id, datetime, rid, vid, secs, kph, head, lat, lon, dir
                FROM historic_location
                WHERE rid = %s
                AND
                dir LIKE %s;
                ''')

cur.execute(sql_query, (rid, i_or_o))

In [0]:
results = cur.fetchall()
df_inbound = pd.DataFrame(results,
                          columns=['id','datetime','rid','vid','secs','kph','head','lat','lon','dir'])

In [7]:
stops = pd.read_csv('https://raw.githubusercontent.com/Lambda-School-Labs/sfmta-data-analysis-ds/master/datasets/route_info.csv')
stops.head()

Unnamed: 0,route_id,lat,lon,stopId,tag,title,dir
0,E,37.80713,-122.41732,15184,5184,Jones St & Beach St,Outbound
1,E,37.80741,-122.41412,13092,3092,Beach St & Mason St,Outbound
2,E,37.80784,-122.41081,13095,3095,Beach St & Stockton St,Outbound
3,E,37.80663,-122.40603,14502,4502,The Embarcadero & Bay St,Outbound
4,E,37.80502,-122.40331,14529,4529,The Embarcadero & Sansome St,Outbound


In [10]:
df_inbound.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281335 entries, 0 to 281334
Data columns (total 10 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   id        281335 non-null  int64         
 1   datetime  281335 non-null  datetime64[ns]
 2   rid       281335 non-null  object        
 3   vid       281335 non-null  int64         
 4   secs      281335 non-null  int64         
 5   kph       281335 non-null  int64         
 6   head      281335 non-null  int64         
 7   lat       281335 non-null  object        
 8   lon       281335 non-null  object        
 9   dir       281335 non-null  object        
dtypes: datetime64[ns](1), int64(5), object(4)
memory usage: 21.5+ MB


### Route 1 Analysis
> In the following analysis we will attempt to find ways to better tackle the question, what and where is bunching and gapping occuring?

>> One way to find the answer is to dig through the data in an incredibly granular fashion.  We will isolate inbound route 1, and within that route 
isolate a single stop, then a single day; and breakdown that day by hours.  This approach should lead us to finding which hours of the day contain the highest density of vehicles for the given stop; which can then be further dissected.  After this thorough dissection we can then scale it to the entirity of route 1, and then to all of the routes containing bus vehicles.







In [12]:
# Selecting Route 1 for analysis

route_1_stops = stops[stops['route_id'] == '1']
print(route_1_stops.shape)
route_1_stops.info()

(96, 7)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 745 to 840
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   route_id  96 non-null     object 
 1   lat       96 non-null     float64
 2   lon       96 non-null     float64
 3   stopId    96 non-null     int64  
 4   tag       96 non-null     int64  
 5   title     96 non-null     object 
 6   dir       96 non-null     object 
dtypes: float64(2), int64(2), object(3)
memory usage: 6.0+ KB


In [27]:
# Isolating inbound stops

route_1_stops_I = route_1_stops[route_1_stops['dir'] == 'Inbound']
route_1_stops_I_array = route_1_stops_I[['lat', 'lon', 'stopId']]
route_1_stops_I_dict = route_1_stops_I_array.to_dict()
route_1_stops_I_dict.keys()

dict_keys(['lat', 'lon', 'stopId'])

In [0]:
# creating a list of dicts to perform proximity analysis

route_1_stops_list_dict = []

for key, values in route_1_stops_I_dict['lon'].items():
  # print(values)
  route_1_stops_list_dict.append({'stop_lat': route_1_stops_I_dict['lat'][key],
                                  'stop_lon': values,
                                  'stopId'  : route_1_stops_I_dict['stopId'][key]})

In [29]:
# cell is simply to confirm values were added as expected

route_1_stops_list_dict

[{'stopId': 13848,
  'stop_lat': 37.787620000000004,
  'stop_lon': -122.44337990000001},
 {'stopId': 13859, 'stop_lat': 37.7879499, 'stop_lon': -122.44072},
 {'stopId': 13885, 'stop_lat': 37.78846, 'stop_lon': -122.43680990000001},
 {'stopId': 16489, 'stop_lat': 37.78933, 'stop_lon': -122.43556000000001},
 {'stopId': 16296, 'stop_lat': 37.7898199, 'stop_lon': -122.43399},
 {'stopId': 16320, 'stop_lat': 37.78999, 'stop_lon': -122.43249990000001},
 {'stopId': 16292, 'stop_lat': 37.790189899999994, 'stop_lon': -122.43085},
 {'stopId': 16306, 'stop_lat': 37.7903599, 'stop_lon': -122.42918},
 {'stopId': 16310, 'stop_lat': 37.79057, 'stop_lon': -122.42760990000002},
 {'stopId': 14905, 'stop_lat': 37.791039899999994, 'stop_lon': -122.42577},
 {'stopId': 14016, 'stop_lat': 37.7919099, 'stop_lon': -122.42446000000001},
 {'stopId': 14031, 'stop_lat': 37.79211, 'stop_lon': -122.42290990000002},
 {'stopId': 14026, 'stop_lat': 37.7923599, 'stop_lon': -122.42101000000001},
 {'stopId': 14022, 'stop_l

In [0]:
# The following formula assigns stops to vehicles for a given route.

# Stops were previously assigned by route, and direction; so the following code
# set the groundwork for analysis of vehicles traveling in relatively similar
# directions, for the same route.

from math import cos, asin, sqrt

# Math functions to calculate closest stop
def distance(lat1, lon1, lat2, lon2, stopId):
  p = 0.017453292519943295
  a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p)*cos(lat2*p) * (1-cos((lon2-lon1)*p)) / 2
  return [12742 * asin(sqrt(a)), stopId]

def closest(data, v):
  return min(data, key=lambda p: distance(v['lat'],v['lon'],p['stop_lat'],p['stop_lon'], p['stopId']))
  return c_temp[0], c_temp[1]

In [0]:
# The following loop takes route 1 stop positions (lat/lon) and compares 
# relative distances between stops and vehicle positions (lat/lon), it then 
# assigns vehicles to stops based on the relative distance; in other words, if 
# a vehicle is considered to be closer to one stop than any other on that route, 
# that vehicle is assigned that stop.

new_values = []

for index, row in df_inbound.iterrows():
    lat = row['lat']
    lon = row['lon']
    v = {'lat': float(lat), 'lon': float(lon)}
    new_values.append(closest(route_1_stops_list_dict, v))

In [37]:
new_values_df = pd.DataFrame(new_values)
print(new_values_df.head())
new_values_df.nunique()

   stop_lat   stop_lon  stopId
0  37.78680 -122.44998   13876
1  37.79336 -122.41274   14030
2  37.78394 -122.48189   13838
3  37.79478 -122.40134   14028
4  37.79211 -122.42291   14031


stop_lat    48
stop_lon    48
stopId      48
dtype: int64

In [38]:
# Reincorporating route names into the dataframe.

stop_titles = route_1_stops_I[['stopId', 'title']]
new_values_df = pd.merge(new_values_df, stop_titles, on='stopId', how='left')
new_values_df.head()

Unnamed: 0,stop_lat,stop_lon,stopId,title
0,37.7868,-122.44998,13876,California St & Laurel St
1,37.79336,-122.41274,14030,Clay St & Taylor St
2,37.78394,-122.48189,13838,California St & 22nd Ave
3,37.79478,-122.40134,14028,Clay St & Sansome St
4,37.79211,-122.42291,14031,Clay St & Van Ness Ave


In [39]:
new_values_df.shape

(281335, 4)

In [40]:
route_1_and_stops_df = pd.concat([df_inbound, new_values_df], axis=1)
route_1_and_stops_df_sorted = route_1_and_stops_df.sort_values(by=['stopId',
                                                                   'datetime'])
route_1_and_stops_df_sorted.head()

Unnamed: 0,id,datetime,rid,vid,secs,kph,head,lat,lon,dir,stop_lat,stop_lon,stopId,title
112,6368,2020-01-20 15:06:53,1,5792,26,10,15,37.783501,-122.492996,1____I_F00,37.78342,-122.49244,13546,32nd Ave & California St
270,15922,2020-01-20 15:31:29,1,5844,19,32,345,37.783001,-122.491997,1____I_F00,37.78342,-122.49244,13546,32nd Ave & California St
418,24743,2020-01-20 15:53:03,1,5860,12,26,90,37.7836,-122.491997,1____I_F00,37.78342,-122.49244,13546,32nd Ave & California St
489,28936,2020-01-20 16:02:19,1,5767,31,39,345,37.782799,-122.491997,1____I_F00,37.78342,-122.49244,13546,32nd Ave & California St
630,38306,2020-02-23 18:45:50,1,5823,59,19,90,37.7836,-122.491997,1____I_E00,37.78342,-122.49244,13546,32nd Ave & California St


# Code used to generate visualizations

##### Feature engineering


*   Create feature that looks at distance between set of lat/lon values between stop and vehicle.
*   Look at vehicle counts per stop.
*   Isolate a stop.
*   Sort by DateTime.
*   Isolate a single span of time (3 minutes on a given day, and apply to entire route).
*   Potentially look at the same 3 minutes over all days for the same stop.

##### A question to ask, is 'How do we know that a vehicle is making the specific stop we are interested in (and thus could be incorporated into analysis of bunching and gapping).  We might need to take diretion into account as well.

*   Something to take into account: The stops dataframes were cleaned to have stops correspond to specific routes, and thus partially answering the above question.
*   Identifying the direction of the vehicles could further solidfy the answer to the question.








In [41]:
import numpy as np

# drop the 'dir' column; completely unnesseary for this analysis because all 
# vehicles are going inbound, and if we want specific orientation, we can use
# the 'head' column to isolate based on orientation.

route_1_and_stops_df_sorted = route_1_and_stops_df_sorted.drop(columns='dir')
route_1_and_stops_df_sorted.shape

(281335, 13)

In [42]:
# Selecting a stop-id that is not a station, or near a station; choosing stop
# id's at or near a stations results in poor analysis

route_1_and_stops_df_sorted['title'].describe()

count                    281335
unique                       48
top       Geary Blvd & 33rd Ave
freq                      54653
Name: title, dtype: object

In [43]:
# Choosing stop w/count at 50th percentile from 'stopId' column.

# Remember to verify that stop is not a/near station.
# Stop 14021 ()
# I noticed there are several stops in this instersection, so I will have to 
# incorporate the 'head' column into the analysis to asure that all vehicles are
# headed in the same direction

stop_14021 = route_1_and_stops_df_sorted[route_1_and_stops_df_sorted['stopId'] == 14021]
stop_14021.head(3)

Unnamed: 0,id,datetime,rid,vid,secs,kph,head,lat,lon,stop_lat,stop_lon,stopId,title
42,2394,2020-01-20 14:58:37,1,5796,38,16,75,37.794601,-122.403999,37.79448,-122.40441,14021,Clay St & Kearny St
77,4502,2020-01-20 15:02:46,1,5828,32,0,80,37.794498,-122.404999,37.79448,-122.40441,14021,Clay St & Kearny St
149,8387,2020-01-20 15:13:59,1,5816,3,0,80,37.794498,-122.404999,37.79448,-122.40441,14021,Clay St & Kearny St


In [44]:
# Looking at direction for vehicles traveling relatively close to stop 14021

# Seems most vehicles (~48.29% for route 1) are traveling in the direction 75 
# when they're near this stop

print(stop_14021['head'].value_counts(sorted).head())


# StopId 14021 for all vehicles in direction 75 and 80; which should encompass
# the vast majority of vehicles in for this stop (~91%)

stop_14021_75_80 = stop_14021[(stop_14021['head'] == 75) | (stop_14021['head'] == 80)]
print(stop_14021_75_80.head(3))


# Values show that all vehicles in and around this location are idle, or
# at a complete stop; but the fluctuation in latitudes implies some are
# relatively closer than others.  
# Area might simlpy be densely populated, and has a high volume of patrons.

stop_14021_75_80.describe(include='all')

75     0.482855
80     0.439212
81     0.053322
60     0.020509
219    0.002953
Name: head, dtype: float64
       id            datetime rid  ...   stop_lon  stopId                title
42   2394 2020-01-20 14:58:37   1  ... -122.40441   14021  Clay St & Kearny St
77   4502 2020-01-20 15:02:46   1  ... -122.40441   14021  Clay St & Kearny St
149  8387 2020-01-20 15:13:59   1  ... -122.40441   14021  Clay St & Kearny St

[3 rows x 13 columns]


Unnamed: 0,id,datetime,rid,vid,secs,kph,head,lat,lon,stop_lat,stop_lon,stopId,title
count,5620.0,5620,5620.0,5620.0,5620.0,5620.0,5620.0,5620.0,5620.0,5620.0,5620.0,5620.0,5620
unique,,5510,1.0,,,,,8.0,2.0,,,,1
top,,2020-03-12 11:27:11,1.0,,,,,37.794399,-122.404999,,,,Clay St & Kearny St
freq,,3,5620.0,,,,,2869.0,4307.0,,,,5620
first,,2020-01-20 14:58:37,,,,,,,,,,,
last,,2020-04-13 09:28:16,,,,,,,,,,,
mean,7919115.0,,,5819.905872,31.193594,8.208185,77.381673,,,37.79448,-122.4044,14021.0,
std,4688268.0,,,38.774879,20.762993,9.874767,2.49742,,,2.941909e-12,5.684848e-12,0.0,
min,2394.0,,,5754.0,0.0,0.0,75.0,,,37.79448,-122.4044,14021.0,
25%,3768335.0,,,5785.0,15.0,0.0,75.0,,,37.79448,-122.4044,14021.0,


In [45]:
# Engineering column containing distance between vehicle and stop using the 
# geopy library distance formula

# Coordinates need to be entered as tuples
import geopy.distance

distance_column = []

for key, p in stop_14021_75_80['lat'].items():
  coords_1 = (stop_14021_75_80['stop_lat'][key], stop_14021_75_80['stop_lon'][key])
  coords_2 = (p, stop_14021_75_80['lon'][key])
  distance_column.append(geopy.distance.vincenty(coords_1, coords_2).km)

distance_column

[0.038602368338070514,
 0.05192461267728309,
 0.05192461267728309,
 0.037290830131244436,
 0.05265932787210973,
 0.03624594385974663,
 0.05265932787210973,
 0.038602368338070514,
 0.05265932787210973,
 0.038602368338070514,
 0.05192461267728309,
 0.05192461267728309,
 0.05192461267728309,
 0.05192461267728309,
 0.05192461267728309,
 0.05265932787210973,
 0.05265932787210973,
 0.05192461267728309,
 0.05265932787210973,
 0.05192461267728309,
 0.05265932787210973,
 0.03624594385974663,
 0.05265932787210973,
 0.05265932787210973,
 0.05265932787210973,
 0.038602368338070514,
 0.05265932787210973,
 0.038602368338070514,
 0.05192461267728309,
 0.05265932787210973,
 0.038602368338070514,
 0.05265932787210973,
 0.03624594385974663,
 0.05265932787210973,
 0.05265932787210973,
 0.05265932787210973,
 0.05265932787210973,
 0.03624594385974663,
 0.05192461267728309,
 0.038602368338070514,
 0.038602368338070514,
 0.05265932787210973,
 0.05265932787210973,
 0.05265932787210973,
 0.05265932787210973,
 

In [46]:
# Incorporating distance_column into dataframe

stop_14021_75_80['dist_km'] = distance_column

stop_14021_75_80['kph'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


count    5620.000000
mean        8.208185
std         9.874767
min         0.000000
25%         0.000000
50%         2.000000
75%        16.000000
max        43.000000
Name: kph, dtype: float64

In [47]:
# Potentially usefull: this code generates time duration to/from stops, 
# calculated using the 'kph' and 'dist_km' columns.  Generated output could be
# used in the future for calculating estimated arrival times for a given vehicle
# food for thought.  Consider kph != 0

stop_14021_75_80['dist_min'] = (stop_14021_75_80['kph']/stop_14021_75_80['dist_km'])/60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [48]:
# generating columns based on a single day

# Sort DF by stopId and datetime
results_sorted = stop_14021_75_80.sort_values(by=['stopId', 'datetime'])

# Isolate data for the requested date
cali1_3_29 = results_sorted.loc[(results_sorted['datetime'].dt.month==3) & (results_sorted['datetime'].dt.day==29)]
cali1_3_29['hours'] = cali1_3_29['datetime'].dt.hour
stop_14021_75_80['hours'] = stop_14021_75_80['datetime'].dt.hour
route_1_and_stops_df_sorted['hours'] = route_1_and_stops_df_sorted['datetime'].dt.hour
route_1_and_stops_df_sorted = route_1_and_stops_df_sorted.loc[(route_1_and_stops_df_sorted['datetime'].dt.month==3) & (route_1_and_stops_df_sorted['datetime'].dt.day==29)]


# df cali1_3_29
print(cali1_3_29.shape)
cali1_3_29.head()

(38, 16)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,datetime,rid,vid,secs,kph,head,lat,lon,stop_lat,stop_lon,stopId,title,dist_km,dist_min,hours
197960,11932114,2020-03-29 14:29:06,1,5834,50,0,80,37.794498,-122.404999,37.79448,-122.40441,14021,Clay St & Kearny St,0.051925,0.0,14
198034,11937591,2020-03-29 14:48:32,1,5789,13,0,80,37.794601,-122.404999,37.79448,-122.40441,14021,Clay St & Kearny St,0.053596,0.0,14
198145,11944160,2020-03-29 15:11:58,1,5767,48,27,75,37.794601,-122.403999,37.79448,-122.40441,14021,Clay St & Kearny St,0.038602,11.657316,15
198345,11953399,2020-03-29 15:44:47,1,5842,28,0,80,37.794399,-122.404999,37.79448,-122.40441,14021,Clay St & Kearny St,0.052659,0.0,15
198353,11953697,2020-03-29 15:45:48,1,5842,21,0,80,37.794399,-122.404999,37.79448,-122.40441,14021,Clay St & Kearny St,0.052659,0.0,15


In [56]:
import pandas as pd
import plotly.express as px

fig = px.density_mapbox(stop_14021_75_80, 
                        lat='lat', 
                        lon='stop_lon', 
                        z='hours', 
                        radius=20,
                        center=dict(lat=37.79448, 
                                    lon=-122.40441), 
                        zoom=16.7,
                        mapbox_style="stamen-terrain")
fig.show()

In [57]:
# Creating distance columns for route_1 dataframe

import geopy.distance

distance_column = []

for key, p in route_1_and_stops_df_sorted['lat'].items():
  coords_1 = (route_1_and_stops_df_sorted['stop_lat'][key], route_1_and_stops_df_sorted['stop_lon'][key])
  coords_2 = (p, route_1_and_stops_df_sorted['lon'][key])
  distance_column.append(geopy.distance.vincenty(coords_1, coords_2).km)

route_1_and_stops_df_sorted['dist_km'] = distance_column
route_1_and_stops_df_sorted['dist_min'] = (route_1_and_stops_df_sorted['kph']/route_1_and_stops_df_sorted['dist_km'])/60

route_1_and_stops_df_sorted.head(3)

Unnamed: 0,id,datetime,rid,vid,secs,kph,head,lat,lon,stop_lat,stop_lon,stopId,title,hours,dist_km,dist_min
197993,11934501,2020-03-29 14:38:19,1,5767,18,10,0,37.783501,-122.491997,37.78342,-122.49244,13546,32nd Ave & California St,14,0.040046,4.161867
198099,11941983,2020-03-29 15:04:49,1,5842,23,6,90,37.7836,-122.491997,37.78342,-122.49244,13546,32nd Ave & California St,15,0.043841,2.280984
198148,11944322,2020-03-29 15:12:59,1,5797,37,11,0,37.783501,-122.492996,37.78342,-122.49244,13546,32nd Ave & California St,15,0.049796,3.681659


In [74]:
# plotting bus_2021 vehicle route against velocity (in kph)

fig = px.density_mapbox(route_1_and_stops_df_sorted, 
                        lat='lat', 
                        lon='stop_lon', 
                        z='hours', 
                        radius=20,
                        center=dict(lat=37.79, 
                                    lon=-122.425), 
                        zoom=12,
                        mapbox_style="stamen-terrain")
fig.show()

In [64]:
# Limiting distance in minutes to <= 7 min allows for a more precise analysis of where the vehicles are relative to a stop

test = route_1_and_stops_df_sorted[route_1_and_stops_df_sorted['dist_min'] <= 7.0]
test.head()

Unnamed: 0,id,datetime,rid,vid,secs,kph,head,lat,lon,stop_lat,stop_lon,stopId,title,hours,dist_km,dist_min
197993,11934501,2020-03-29 14:38:19,1,5767,18,10,0,37.783501,-122.491997,37.78342,-122.49244,13546,32nd Ave & California St,14,0.040046,4.161867
198099,11941983,2020-03-29 15:04:49,1,5842,23,6,90,37.7836,-122.491997,37.78342,-122.49244,13546,32nd Ave & California St,15,0.043841,2.280984
198148,11944322,2020-03-29 15:12:59,1,5797,37,11,0,37.783501,-122.492996,37.78342,-122.49244,13546,32nd Ave & California St,15,0.049796,3.681659
198203,11947128,2020-03-29 15:22:20,1,5812,23,14,345,37.783401,-122.492996,37.78342,-122.49244,13546,32nd Ave & California St,15,0.049024,4.759622
198236,11948741,2020-03-29 15:28:26,1,5824,61,5,345,37.783401,-122.491997,37.78342,-122.49244,13546,32nd Ave & California St,15,0.039081,2.13233


In [65]:
test.shape

(1634, 16)

In [70]:
fig = px.density_mapbox(test, 
                        lat='lat', 
                        lon='stop_lon', 
                        z='dist_min', 
                        radius=20,
                        center=dict(lat=37.79, 
                                    lon=-122.425), 
                        zoom=12,
                        mapbox_style="stamen-terrain")
fig.show()