## Exercise 2.5 Advanced Geospatial Mapping 

In [6]:
#importing libraries
import json
import os

import numpy as np
import pandas as pd
from keplergl import KeplerGl
from matplotlib import pyplot as plt
from pyproj import CRS

In [8]:
# Load the ride/weather subset, using the first CSV column as the row index.
df = pd.read_csv(
    'weather_and_daily_bike_rides_data_smallest_subset.csv',
    index_col=0,
)

In [10]:
# Number of rows and columns in the loaded frame.
df.shape

(149191, 14)

In [12]:
# List the column names available in the loaded frame.
df.columns

Index(['date', 'ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'end_station_name', 'start_lat', 'start_lng',
       'end_lat', 'end_lng', 'member_casual', 'avgTemp', 'daily_bike_rides'],
      dtype='object')

## Data Preprocessing

In [15]:
# Add a constant helper column so rides can later be counted per station pair.
df = df.assign(value=1)

In [17]:
# Count the rides for every (start station, end station) combination and
# flatten the grouped result back into ordinary columns.
station_pair = ['start_station_name', 'end_station_name']
rides_per_pair = df.groupby(station_pair)['value'].count()
df_group = rides_per_pair.reset_index()

In [19]:
# Display the grouped frame: one row per (start station, end station) pair with its count.
df_group

Unnamed: 0,start_station_name,end_station_name,value
0,1 Ave & E 110 St,1 Ave & E 110 St,3
1,1 Ave & E 110 St,1 Ave & E 94 St,1
2,1 Ave & E 110 St,2 Ave & E 104 St,1
3,1 Ave & E 110 St,2 Ave & E 125 St,1
4,1 Ave & E 110 St,2 Ave & E 29 St,2
...,...,...,...
93830,Yankee Ferry Terminal,Hudson St & W 13 St,1
93831,Yankee Ferry Terminal,Picnic Point,13
93832,Yankee Ferry Terminal,Soissons Landing,27
93833,Yankee Ferry Terminal,South St & Whitehall St,2


In [21]:
# Give the count column a descriptive name.
df_group = df_group.rename(columns={'value': 'trips'})

In [25]:
# Display the grouped frame again to confirm the count column is now named 'trips'.
df_group

Unnamed: 0,start_station_name,end_station_name,trips
0,1 Ave & E 110 St,1 Ave & E 110 St,3
1,1 Ave & E 110 St,1 Ave & E 94 St,1
2,1 Ave & E 110 St,2 Ave & E 104 St,1
3,1 Ave & E 110 St,2 Ave & E 125 St,1
4,1 Ave & E 110 St,2 Ave & E 29 St,2
...,...,...,...
93830,Yankee Ferry Terminal,Hudson St & W 13 St,1
93831,Yankee Ferry Terminal,Picnic Point,13
93832,Yankee Ferry Terminal,Soissons Landing,27
93833,Yankee Ferry Terminal,South St & Whitehall St,2


In [27]:
# Distribution of trip totals: how many station pairs share each trip count.
# value_counts() orders the result by frequency, most common first.
trips_per_pair = df_group['trips'].sort_index()
trips_per_pair.value_counts()

trips
1     66250
2     15890
3      5766
4      2647
5      1337
6       721
7       404
8       255
9       147
10      108
11       73
12       51
13       41
14       29
16       26
15       16
17       14
21       11
20       10
18        9
19        6
23        4
27        3
33        2
34        2
40        2
25        2
24        2
22        1
56        1
26        1
31        1
28        1
37        1
29        1
Name: count, dtype: int64

In [29]:
# Write the per-pair trip counts to disk (the frame's index is written as well).
df_group.to_csv(path_or_buf='trip_data.csv')

In [31]:
# Sanity check for the groupby: the 'trips' column should add up to the
# number of rows in df (the first entry of df.shape).
rides_counted = df_group['trips'].sum()
print(rides_counted)
print(df.shape)

148867
(149191, 15)


In [33]:
# The totals differ, so count the missing values in each column to find out why.
df.isna().sum()

date                    0
ride_id                 0
rideable_type           0
started_at              0
ended_at                0
start_station_name      0
end_station_name      324
start_lat               0
start_lng               0
end_lat               168
end_lng               168
member_casual           0
avgTemp                 0
daily_bike_rides        0
value                   0
dtype: int64

In [35]:
# Drop every row that contains a missing value and rebind df to the cleaned frame.
df = df.dropna()

In [37]:
# Row and column counts after dropping the rows with missing values.
df.shape

(148867, 15)

In [39]:
# Repeat the check now that rows with missing values are gone: the summed
# 'trips' column and the row count in df.shape should agree.
trips_total = df_group['trips'].sum()
print(trips_total)
print(df.shape)

148867
(148867, 15)


In [41]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,date,ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,avgTemp,daily_bike_rides,value
0,2022-01-01,69E36EE94F201D31,electric_bike,2022-01-01 16:15:45.492,2022-01-01 16:33:09.229,E 58 St & 3 Ave,W 87 St & Amsterdam Ave,40.760958,-73.967245,40.78839,-73.9747,member,11.6,1028,1
1,2022-01-01,12E108C5D4E30944,electric_bike,2022-01-01 17:18:32.640,2022-01-01 17:20:12.257,Marcus Garvey Blvd & Macon St,Albany Ave & Fulton St,40.682601,-73.938037,40.680011,-73.938475,member,11.6,1028,1
2,2022-01-01,2095EF3D254EB940,classic_bike,2022-01-01 18:21:43.082,2022-01-01 18:29:56.749,W 21 St & 6 Ave,8 Ave & W 27 St,40.74174,-73.994156,40.747968,-73.996637,member,11.6,1028,1
3,2022-01-01,10822F0E35739F43,classic_bike,2022-01-01 09:21:09.002,2022-01-01 09:24:39.216,W 70 St & Amsterdam Ave,West End Ave & W 60 St,40.77748,-73.982886,40.77237,-73.99005,member,11.6,1028,1
4,2022-01-01,A8F8740F4C893FC8,classic_bike,2022-01-01 17:06:59.480,2022-01-01 17:34:11.478,Washington Pl & 6 Ave,E 2 St & Avenue A,40.732241,-74.000264,40.723077,-73.985836,member,11.6,1028,1


In [43]:
# Keep only the station names and coordinates for both ends of each ride.
station_columns = [
    'start_station_name', 'start_lat', 'start_lng',
    'end_station_name', 'end_lat', 'end_lng',
]
df_stations = df[station_columns]

In [45]:
# Preview the first five rows of the station/coordinate subset.
df_stations.head(n=5)

Unnamed: 0,start_station_name,start_lat,start_lng,end_station_name,end_lat,end_lng
0,E 58 St & 3 Ave,40.760958,-73.967245,W 87 St & Amsterdam Ave,40.78839,-73.9747
1,Marcus Garvey Blvd & Macon St,40.682601,-73.938037,Albany Ave & Fulton St,40.680011,-73.938475
2,W 21 St & 6 Ave,40.74174,-73.994156,8 Ave & W 27 St,40.747968,-73.996637
3,W 70 St & Amsterdam Ave,40.77748,-73.982886,West End Ave & W 60 St,40.77237,-73.99005
4,Washington Pl & 6 Ave,40.732241,-74.000264,E 2 St & Avenue A,40.723077,-73.985836


In [47]:
# Keep a single row (the first one encountered) per start/end station pair.
pair_keys = ['start_station_name', 'end_station_name']
df_stations = df_stations.drop_duplicates(subset=pair_keys)

In [49]:
# Preview the first five rows after removing duplicate station pairs.
df_stations.head(n=5)

Unnamed: 0,start_station_name,start_lat,start_lng,end_station_name,end_lat,end_lng
0,E 58 St & 3 Ave,40.760958,-73.967245,W 87 St & Amsterdam Ave,40.78839,-73.9747
1,Marcus Garvey Blvd & Macon St,40.682601,-73.938037,Albany Ave & Fulton St,40.680011,-73.938475
2,W 21 St & 6 Ave,40.74174,-73.994156,8 Ave & W 27 St,40.747968,-73.996637
3,W 70 St & Amsterdam Ave,40.77748,-73.982886,West End Ave & W 60 St,40.77237,-73.99005
4,Washington Pl & 6 Ave,40.732241,-74.000264,E 2 St & Avenue A,40.723077,-73.985836


In [51]:
# Attach station coordinates to the per-pair trip counts.
# validate='one_to_one' makes pandas raise if either frame contains a repeated
# (start, end) key, which would otherwise silently duplicate rows and inflate
# the trip totals. The indicator column (_merge) records whether each row was
# found in both frames or only in one of them.
df_2 = df_group.merge(
    df_stations,
    on=["start_station_name", "end_station_name"],
    how='outer',
    indicator=True,
    validate='one_to_one',
)

In [53]:
# Preview the first five rows of the merged trips-and-coordinates frame.
df_2.head(n=5)

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng,_merge
0,1 Ave & E 110 St,1 Ave & E 110 St,3,40.792327,-73.9383,40.792327,-73.9383,both
1,1 Ave & E 110 St,1 Ave & E 94 St,1,40.792327,-73.9383,40.781721,-73.94594,both
2,1 Ave & E 110 St,2 Ave & E 104 St,1,40.792375,-73.938088,40.789211,-73.943708,both
3,1 Ave & E 110 St,2 Ave & E 125 St,1,40.792327,-73.9383,40.802554,-73.933509,both
4,1 Ave & E 110 St,2 Ave & E 29 St,2,40.792327,-73.9383,40.741724,-73.978093,both


In [55]:
# Tally the merge indicator to see how many pairs were matched in both frames.
merge_status = df_2['_merge']
merge_status.value_counts(dropna=False)

_merge
both          93835
left_only         0
right_only        0
Name: count, dtype: int64

In [57]:
# Save the merged table; the index and the _merge indicator column are written too.
df_2.to_csv(path_or_buf='station_coordinates_and_trips.csv')

In [59]:
# Create the Kepler map widget and register the merged frame under the id "data_1".
kepler_datasets = {"data_1": df_2}
m = KeplerGl(height=700, data=kepler_datasets)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


In [61]:
# Render the interactive Kepler map widget in the notebook.
m

KeplerGl(data={'data_1':           start_station_name         end_station_name  trips  start_lat  \
0         …

At first, the only active layer was the start layer, so only the start stations were visible.  I activated the end layer by clicking on the eye icon so that the end stations were visible as well.  I then changed the colors of the two point layers (purple for the start stations and yellow for the end stations) to make them easier to tell apart; before this, a color scheme had been applied to both layers, which rendered the stations in a variety of colors.  Because this assignment is concerned with trips BETWEEN stations, I also activated the third layer (start --> end arc) and chose purple for the source and yellow for the target.  This resulted in a line for each start-end station pair that had at least one trip.  

As I increase the minimum number of trips in the filter, fewer and fewer of the remaining trips fall outside of Manhattan.  One of the most common trips is between West St. & Chambers St. and Pier 40 Hudson Park.  This makes sense because it would take bike riders down along the Hudson River.  Another common bike trip is between Roosevelt Island Tramway and Motorgate on Roosevelt Island.  This makes sense, as Roosevelt Island has great views of Manhattan but is much less busy.  Unsurprisingly, some of the most active stations, including THE most active one, are located around Central Park - one of New York's most popular attractions.  

In [64]:
# Capture the map's current configuration from the widget.
# NOTE(review): the layer colours and trip filter described in the notes above
# were set by hand in the widget, so this presumably reflects them only if that
# manual styling was done before this cell ran — confirm on a fresh kernel.
config = m.config

In [66]:
# Inspect the captured configuration dictionary.
config

{'version': 'v1',
 'config': {'visState': {'filters': [{'dataId': ['data_1'],
     'id': 'bw4mbbdyh',
     'name': ['trips'],
     'type': 'range',
     'value': [1, 56],
     'plotType': 'histogram',
     'animationWindow': 'free',
     'yAxis': None,
     'view': 'side',
     'speed': 1,
     'enabled': True}],
   'layers': [{'id': 'pzj03q8',
     'type': 'point',
     'config': {'dataId': 'data_1',
      'label': 'start',
      'color': [122, 13, 166],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'lat': 'start_lat', 'lng': 'start_lng'},
      'isVisible': True,
      'visConfig': {'radius': 10,
       'fixedRadius': False,
       'opacity': 0.8,
       'outline': False,
       'thickness': 2,
       'strokeColor': None,
       'colorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'str

In [68]:
# Write the captured map configuration to config.json.
with open("config.json", "w") as outfile:
    json.dump(config, outfile)

In [70]:
# Export the map, with the captured configuration applied and read_only
# disabled, to a standalone HTML file.
m.save_to_html(
    file_name='New_York_Citibike_Trips.html',
    read_only=False,
    config=config,
)

Map saved to New_York_Citibike_Trips.html!
