In [1]:
import pandas as pd
import os
from keplergl import KeplerGl
from pyproj import CRS 
import numpy as np
from matplotlib import pyplot as plt


  from pkg_resources import resource_string


In [2]:
# Load the cleaned 2022 bike trips.
# The pyarrow reader and Arrow-backed dtypes are used to keep RAM usage down.
bike_read_options = {
    "engine": "pyarrow",            # faster & lower memory
    "dtype_backend": "pyarrow",     # compact Arrow dtypes for ints/strings
    "parse_dates": ["started_at"],  # avoids huge object strings
}
df_bike = pd.read_csv("Output/bike_data_cleaned_2022.csv", **bike_read_options)

# Show which columns are available
df_bike.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

In [3]:
# Count the trips for every (start station, end station) pair.
# .size() counts the rows of each group directly, so no helper 'trips' column has to
# be written onto df_bike: the raw frame stays untouched and the cell can be re-run.
# Rows with a missing start or end station name are left out (groupby's default dropna=True).
df_tripcount = (
    df_bike
    .groupby(['start_station_name', 'end_station_name'])
    .size()
    .reset_index(name='trips')
)

# Check the data
print(df_tripcount['trips'].sum())
print(df_tripcount.shape)

29768282
(1013397, 3)


In [4]:
# To display the stations and trips, every station name must be matched to one (lat, lng) pair.
# The dataset contains small variations in the coordinates recorded for a same station,
# so each station is pinned to the coordinates of its first occurrence, and those
# coordinates are then attached to both ends of every pair in df_tripcount.


def _first_seen_coords(trips, side):
    """Build a station -> (lat, lng) table from one side of the trips.

    Parameters
    ----------
    trips : DataFrame with '<side>_station_name', '<side>_lat' and '<side>_lng' columns.
    side : 'start' or 'end'.

    Returns
    -------
    DataFrame indexed by stripped station name ('station') with columns 'lat' and 'lng',
    holding the coordinates of the first row in which each station appears.
    """
    name_col, lat_col, lng_col = f'{side}_station_name', f'{side}_lat', f'{side}_lng'
    return (
        trips[[name_col, lat_col, lng_col]]
        .dropna()
        # de-duplicate on the raw names first so the strip below only touches a small table
        .drop_duplicates(name_col, keep='first')
        .rename(columns={name_col: 'station', lat_col: 'lat', lng_col: 'lng'})
        .assign(station=lambda t: t['station'].str.strip())
        # names differing only by surrounding whitespace now collide: keep the first one,
        # otherwise the index has duplicate labels and the later reindex raises
        .drop_duplicates('station', keep='first')
        .set_index('station')
    )


def _coords_for(names, coords):
    """Look up coordinates for a Series of station names.

    The lookup is done once per distinct name (via the categorical's categories) and
    broadcast back to the rows through the integer codes, instead of once per row.

    Returns a (lat, lng) tuple of float64 numpy arrays aligned with `names`;
    missing or unknown station names yield NaN.
    """
    cat = names.astype('category')
    # Strip the distinct labels only. A plain Index may hold duplicates, so two labels
    # that become equal after stripping are fine here (rename_categories would raise).
    labels = cat.cat.categories.map(lambda x: x.strip() if isinstance(x, str) else x)
    codes = cat.cat.codes.to_numpy()
    known = codes >= 0  # code -1 marks a missing name; it must not index the last label

    columns = []
    for col in ('lat', 'lng'):
        per_label = coords[col].reindex(labels).to_numpy(dtype='float64', na_value=np.nan)
        values = np.full(len(codes), np.nan)
        values[known] = per_label[codes[known]]
        columns.append(values)
    return columns[0], columns[1]


# --- 1) Small station -> (lat, lng) table: prefer start coords, fill the rest from end coords ---
coords = _first_seen_coords(df_bike, 'start').combine_first(_first_seen_coords(df_bike, 'end'))

# --- 2) Attach the coordinates to both ends of every start/end pair ---
df_tripcount['start_lat'], df_tripcount['start_lng'] = _coords_for(
    df_tripcount['start_station_name'], coords)
df_tripcount['end_lat'], df_tripcount['end_lng'] = _coords_for(
    df_tripcount['end_station_name'], coords)

In [5]:
# Save the aggregated trip counts. index=False keeps the RangeIndex out of the file,
# so tools that load the CSV do not get an extra unnamed column.
df_tripcount.to_csv('Output/Trip count by stations - NYC 2022.csv', index=False)

df_tripcount

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng
0,1 Ave & E 110 St,1 Ave & E 110 St,791,40.792337,-73.938240,40.792337,-73.938240
1,1 Ave & E 110 St,1 Ave & E 18 St,2,40.792337,-73.938240,40.733812,-73.980544
2,1 Ave & E 110 St,1 Ave & E 30 St,4,40.792337,-73.938240,40.741549,-73.975329
3,1 Ave & E 110 St,1 Ave & E 39 St,1,40.792337,-73.938240,40.747140,-73.971130
4,1 Ave & E 110 St,1 Ave & E 44 St,12,40.792337,-73.938240,40.750020,-73.969053
...,...,...,...,...,...,...,...
1013392,Yankee Ferry Terminal,Water St & Main St,4,40.687066,-74.016756,40.703212,-73.990409
1013393,Yankee Ferry Terminal,West St & Chambers St,6,40.687066,-74.016756,40.717548,-74.013221
1013394,Yankee Ferry Terminal,West St & Liberty St,4,40.687066,-74.016756,40.711444,-74.014847
1013395,Yankee Ferry Terminal,West Thames St,1,40.687066,-74.016756,40.708347,-74.017134


In [10]:
# Build the Kepler map with the aggregated trip counts as its only dataset
kepler_datasets = {"Trip Count": df_tripcount}
m = KeplerGl(data=kepler_datasets, height=700)

# Export the map as a standalone, editable HTML page that can be opened in a browser
m.save_to_html(read_only=False, file_name="Output/kepler_map.html")

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to kepler_map.html!


# Customizations made to the Kepler map in the HTML file:
- Gave all stations the same color (the density of trips is shown on the arcs connecting the stations instead)
- Added arcs between stations to show the density of trips between them (using a color gradient to make that density easy to read)
- Filtered on Trips > 500 so the map isn't too densely populated and we can make sense of the data
- Similarly, reduced the width of the arcs to make the map less busy

# Analysis of busy areas

By increasing the selectiveness of the filter we can see which areas are the most active (looking at only start/end pairs with more than 1500 trips gives good legibility).

Unsurprisingly, these are mainly concentrated in and around Manhattan.

By looking more closely, we can see a particularly high density of trips around green areas or areas next to the river:
- Central Park
- Governors Island
- Hudson bank in Manhattan
- East River bank in Brooklyn

We can deduce that a pleasant environment shapes how people use the Citi Bikes:
- They may use the bikes for recreational purposes rather than just going from point A to point B (an analysis of the times and days with the highest activity could give more info here)
- They may bend their itineraries around "nice spots"
- They may choose the bike if the itinerary is nice, but take another means of transport if not

# Config file

Despite extensive debugging, I was not able to open the Kepler map in Jupyter, so I couldn't use the usual snippet to export the config:

```python
import json
with open("keplerconfig.json", "w") as outfile:
    json.dump(config, outfile)
```

I had to create an HTML file and open it directly in the browser... but there was no way to get the config file from this HTML file (no UI to export it, and no way to find it in the local files).

So I went to the Kepler.gl web app, imported the dataset and got the config file from there directly (there was a UI for it!)