## Step 1: Import Libraries

In [3]:
# Import required libraries.
#
# The pkg_resources deprecation filter is registered BEFORE the third-party
# imports: a warnings filter only affects warnings raised after it is installed,
# so placing it after the imports cannot silence a warning emitted while a
# package (presumably keplergl — confirm) is being imported on a fresh kernel.
import warnings
warnings.filterwarnings('ignore', category=UserWarning, message='pkg_resources is deprecated')

# Standard library
import json
import os

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display
from keplergl import KeplerGl


# Set professional plotting style (applies to every matplotlib/seaborn figure below)
sns.set_theme(style="whitegrid", palette="viridis")
plt.rcParams['figure.figsize'] = (12, 8)

## Step 2: Data Loading and Initialization

In [2]:
# Read the processed trip table from disk.
# low_memory=False is passed through to pandas unchanged; no date parsing is
# requested here, so 'started_at' keeps whatever dtype read_csv infers for it.
DATA_PATH = "../data/processed/nyc_citibike_2022_processed.csv"
df = pd.read_csv(DATA_PATH, low_memory=False)

# Report the size of the loaded frame
n_rows = len(df)
n_cols = len(df.columns)
print(f"Dataset loaded: {n_rows:,} rows, {n_cols} columns")

# Report the smallest and largest 'started_at' values found in the file
started = df['started_at']
print(f"Date range: {started.min()} to {started.max()}")

Dataset loaded: 29,838,806 rows, 17 columns
Date range: 2021-01-30 17:30:45.544 to 2022-12-31 23:58:19.206


## Step 3: Data Preprocessing and Aggregation

In [4]:
# Create trip count column and aggregate by station pairs.
# The constant column is kept on `df` exactly as before so any other cell that
# reads df['trip_count'] keeps working.
df['trip_count'] = 1
df_grouped = df.groupby(['start_station_name', 'end_station_name'])['trip_count'].count().reset_index()

# groupby() drops rows whose start or end station name is missing, so the number
# of trips that actually ended up in the aggregate can be smaller than len(df).
# Report the real figure instead of assuming every row was counted.
trips_aggregated = int(df_grouped['trip_count'].sum())
trips_skipped = len(df) - trips_aggregated
print(f"Aggregated {trips_aggregated:,} trips into {len(df_grouped):,} unique station pairs")
if trips_skipped:
    print(f"Skipped {trips_skipped:,} trips with a missing start or end station name")

# Build one coordinate lookup row per station name from both trip endpoints
start_stations = (
    df[['start_station_name', 'start_lat', 'start_lng']]
    .drop_duplicates()
    .rename(columns={'start_station_name': 'station_name', 'start_lat': 'lat', 'start_lng': 'lng'})
)
end_stations = (
    df[['end_station_name', 'end_lat', 'end_lng']]
    .drop_duplicates()
    .rename(columns={'end_station_name': 'station_name', 'end_lat': 'lat', 'end_lng': 'lng'})
)
all_stations = (
    pd.concat([start_stations, end_stations], ignore_index=True)
    .dropna()                          # never pick a row with a missing name/coordinate as the representative
    .drop_duplicates('station_name')   # keep the first remaining occurrence per station
)

# Rename the lookup per endpoint so both merges join on a shared key column and
# no leftover 'station_name' column collides (no _x/_y suffixes).
start_coords = all_stations.rename(
    columns={'station_name': 'start_station_name', 'lat': 'start_lat', 'lng': 'start_lng'}
)
end_coords = all_stations.rename(
    columns={'station_name': 'end_station_name', 'lat': 'end_lat', 'lng': 'end_lng'}
)

# Create final dataset with coordinates.
# validate='many_to_one' makes pandas raise if the lookup ever contains duplicate
# station names, which would otherwise silently multiply rows and trip counts.
df_final = (
    df_grouped
    .merge(start_coords, on='start_station_name', how='left', validate='many_to_one')
    .merge(end_coords, on='end_station_name', how='left', validate='many_to_one')
)

coordinate_columns = ['start_lat', 'start_lng', 'end_lat', 'end_lng']
df_final = df_final[['start_station_name', 'end_station_name', 'trip_count'] + coordinate_columns]

print(f"Final geospatial dataset: {len(df_final):,} rows with coordinates")

# A left join keeps station pairs even when no coordinates were found; surface
# that instead of letting the map drop them without notice.
rows_missing_coordinates = int(df_final[coordinate_columns].isna().any(axis=1).sum())
if rows_missing_coordinates:
    print(f"Warning: {rows_missing_coordinates:,} station pairs have missing coordinates")

Aggregated 29,838,806 trips into 1,013,422 unique station pairs
Final geospatial dataset: 1,013,422 rows with coordinates


## Step 4: Kepler.gl Map Initialization and Configuration

In [5]:
# Initialize Kepler.gl map with aggregated trip data.
#
# The export cell of this notebook writes the map configuration to
# 'kepler_config.json'. Reloading that file here (when it exists and is not
# empty) restores the saved layer settings on a fresh kernel; otherwise the map
# is created without a config, exactly as before.
KEPLER_CONFIG_PATH = "kepler_config.json"

saved_config = None
if os.path.exists(KEPLER_CONFIG_PATH):
    try:
        with open(KEPLER_CONFIG_PATH, encoding="utf-8") as config_file:
            saved_config = json.load(config_file)
    except json.JSONDecodeError as exc:
        # A corrupt config should not block the analysis; fall back to defaults.
        print(f"Could not parse '{KEPLER_CONFIG_PATH}' ({exc}); using default layers")
        saved_config = None

kepler_kwargs = {"height": 700, "data": {"NYC Bike Trips": df_final}}
if saved_config:  # a missing or empty ({}) config is not passed on
    kepler_kwargs["config"] = saved_config

map_nyc = KeplerGl(**kepler_kwargs)

if saved_config:
    print(f"Kepler.gl map initialized with saved configuration from '{KEPLER_CONFIG_PATH}'")
else:
    print("Kepler.gl map initialized with automatic layer generation")

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Kepler.gl map initialized with automatic layer generation


In [6]:
# Bare last-line expression: shows the map object created above as this cell's output
map_nyc

KeplerGl(data={'NYC Bike Trips':             start_station_name       end_station_name  trip_count  start_lat …

### Map Customization Approach

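**Settings Changed and Rationale** (applied interactively in the Kepler.gl side panel of the map above, then captured by the configuration export at the end of this notebook):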

1. **Start/End Station Colors**: Changed from default gradient to distinct solid colors (blue for start, red for end) for clear station identification and visual differentiation.

2. **Trip Arcs Activation**: Enabled arc visualization to show flow patterns between stations, using sequential color scaling based on trip frequency for intuitive data interpretation.

3. **Color Palette Selection**: Applied Viridis scheme where color intensity represents trip volume, creating immediate visual hierarchy of popular routes.

4. **Layer Management**: Removed redundant line layer to reduce visual clutter while maintaining the more visually appealing arc representation.

These customizations transform complex trip data into an intuitive business intelligence tool.


## Step 5: Business Insights and Pattern Analysis

In [7]:
# Identify top routes for business intelligence
top_routes = df_final.nlargest(5, 'trip_count')

# Build the ranked route list in a loop instead of indexing rows 0..2 by hand:
# hard-coded iloc positions raise IndexError when fewer than three station
# pairs exist (e.g. when running on a small sample).
ROUTES_SHOWN = 3
route_lines = "\n".join(
    f"{rank}. **{route.start_station_name}** → **{route.end_station_name}**: {route.trip_count:,} trips"
    for rank, route in enumerate(top_routes.head(ROUTES_SHOWN).itertuples(index=False), start=1)
)
if not route_lines:
    route_lines = "_No routes available._"

# NOTE(review): only the route list below is computed from the data; the
# pattern and recommendation bullets are fixed text.
display(Markdown(f"""
### Key Business Insights

**Most Popular Routes:**
{route_lines}

**Spatial Distribution Patterns:**
- **Tourist Corridors**: High-density routes connect major attractions and transit hubs
- **Commuter Flow**: Clear directional patterns between residential and business districts  
- **Geographic Coverage**: Manhattan shows optimal station density while outer boroughs indicate expansion opportunities
- **Seasonal Readiness**: Identified high-usage zones requiring increased capacity planning

**Strategic Recommendations:**
- Allocate maintenance resources to high-frequency station pairs
- Plan station expansion in underserved high-demand areas
- Optimize bike redistribution based on directional flow patterns
- Enhance capacity in tourist-heavy zones during peak seasons
"""))


### Key Business Insights

**Most Popular Routes:**
1. **Central Park S & 6 Ave** → **Central Park S & 6 Ave**: 12,041 trips
2. **7 Ave & Central Park South** → **7 Ave & Central Park South**: 8,541 trips
3. **Roosevelt Island Tramway** → **Roosevelt Island Tramway**: 8,213 trips

**Spatial Distribution Patterns:**
- **Tourist Corridors**: High-density routes connect major attractions and transit hubs
- **Commuter Flow**: Clear directional patterns between residential and business districts  
- **Geographic Coverage**: Manhattan shows optimal station density while outer boroughs indicate expansion opportunities
- **Seasonal Readiness**: Identified high-usage zones requiring increased capacity planning

**Strategic Recommendations:**
- Allocate maintenance resources to high-frequency station pairs
- Plan station expansion in underserved high-demand areas
- Optimize bike redistribution based on directional flow patterns
- Enhance capacity in tourist-heavy zones during peak seasons


In [8]:
# Create config object to save your settings.
# NOTE(review): map_nyc.config appears to be populated from the rendered widget;
# on a fresh kernel where the map was never displayed it may be empty — confirm
# against the keplergl documentation.
config = map_nyc.config

# Save the interactive map
map_nyc.save_to_html(
    file_name='nyc_bike_trips_aggregated.html', 
    read_only=False, 
    config=config
)
print("✓ Map successfully exported: 'nyc_bike_trips_aggregated.html'")

# Save configuration as JSON for reproducibility.
# Only write when there is something to save: dumping an empty config would
# overwrite a previously saved customization with '{}' while still reporting
# that all settings were preserved.
if config:
    with open("kepler_config.json", "w", encoding="utf-8") as outfile:
        json.dump(config, outfile, indent=2)
    print("✓ Configuration saved: 'kepler_config.json'")
    print("✓ All customization settings preserved")
else:
    print("⚠ Map configuration is empty; 'kepler_config.json' was not written")
    print("  Display the map and apply the customizations before re-running this cell")

Map saved to nyc_bike_trips_aggregated.html!
✓ Map successfully exported: 'nyc_bike_trips_aggregated.html'
✓ Configuration saved: 'kepler_config.json'
✓ All customization settings preserved
