## Step 1: Import Required Libraries

In [1]:
# import libraries

# Standard library
import json
import os
import warnings

# The keplergl warning filter must be registered BEFORE keplergl is imported:
# a filter only affects warnings raised after it is installed.
# NOTE(review): the stored output of this cell still shows the source line
# `from pkg_resources import resource_string`, which suggests the warning is
# raised while keplergl is being imported - confirm with Restart & Run All.
warnings.filterwarnings("ignore", category=UserWarning, module="keplergl")

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display
from keplergl import KeplerGl

# set professional plot style
sns.set_theme(style="whitegrid", palette="viridis")
plt.rcParams['figure.figsize'] = (12, 8)

  from pkg_resources import resource_string


## Step 2: Load Data

In [None]:
# Read the processed trip table from disk
DATA_PATH = "../data/processed/nyc_citibike_2022_processed.csv"
df = pd.read_csv(DATA_PATH, low_memory=False)

# Report the table size and the smallest / largest 'started_at' value as loaded
n_rows, n_cols = df.shape
trip_starts = df['started_at']
print(f"Dataset loaded: {n_rows:,} rows, {n_cols} columns")
print(f"Date range: {trip_starts.min()} to {trip_starts.max()}")

## Step 3: Data Preprocessing and Aggregation

In [None]:
# Data Preprocessing and Aggregation
#
# Produces `df_final`: one row per (start station, end station) pair with the
# number of trips between them and the coordinates of both stations, and
# writes it to 'aggregated_station_pairs.csv' in the working directory.

# Constant marker column so trips can be counted per station pair.
# Note: this adds a 'trip_count' column to `df` itself.
df['trip_count'] = 1

# Create aggregated dataframe with starting station, ending station, and trip count
# (groupby drops rows whose start or end station name is missing)
df_grouped = (
    df.groupby(['start_station_name', 'end_station_name'])['trip_count']
    .count()
    .reset_index()
)

print(f"✓ Aggregated {len(df):,} trips into {len(df_grouped):,} unique station pairs")

# Build a station -> coordinates lookup from both trip ends
start_stations = df[['start_station_name', 'start_lat', 'start_lng']].drop_duplicates()
end_stations = df[['end_station_name', 'end_lat', 'end_lng']].drop_duplicates()

start_stations.columns = ['station_name', 'lat', 'lng']
end_stations.columns = ['station_name', 'lat', 'lng']
# Exactly one coordinate pair per station name: the first one encountered,
# with start-side rows taking precedence because they are concatenated first.
all_stations = pd.concat([start_stations, end_stations]).drop_duplicates('station_name')

# Create final dataset with coordinates.
# The lookup is renamed to the join key on each side so no stray
# 'station_name' column is carried along (the second join would otherwise
# collide with it and yield 'station_name_x' / 'station_name_y').
# validate='many_to_one' makes pandas raise if the lookup ever holds a
# station name twice, which would silently duplicate station pairs.
df_final = df_grouped.merge(
    all_stations.rename(columns={
        'station_name': 'start_station_name',
        'lat': 'start_lat',
        'lng': 'start_lng',
    }),
    on='start_station_name',
    how='left',
    validate='many_to_one',
)

df_final = df_final.merge(
    all_stations.rename(columns={
        'station_name': 'end_station_name',
        'lat': 'end_lat',
        'lng': 'end_lng',
    }),
    on='end_station_name',
    how='left',
    validate='many_to_one',
)

df_final = df_final[['start_station_name', 'end_station_name', 'trip_count',
                     'start_lat', 'start_lng', 'end_lat', 'end_lng']]

# Left joins against a unique lookup must keep exactly one row per pair
assert len(df_final) == len(df_grouped), "coordinate merge changed the number of station pairs"

print(f"✓ Final geospatial dataset: {len(df_final):,} rows with coordinates")

# Save the aggregated data for future use
df_final.to_csv('aggregated_station_pairs.csv', index=False)
print("✓ Aggregated data saved as 'aggregated_station_pairs.csv'")

✓ Aggregated 29,838,806 trips into 1,013,422 unique station pairs
✓ Final geospatial dataset: 1,013,422 rows with coordinates
✓ Aggregated data saved as 'aggregated_station_pairs.csv'


## Step 4: Kepler.gl Initialization

In [None]:
# Create the Kepler.gl widget and hand it the aggregated station-pair table
# under the dataset label "NYC Bike Trips"
map_nyc = KeplerGl(
    data={"NYC Bike Trips": df_final},
    height=700,
)
print("✓ Kepler.gl map initialized with automatic layer generation")


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
✓ Kepler.gl map initialized with automatic layer generation


In [None]:
# Display the map for customization
# Leaving the widget as the cell's last expression renders it inline.
# `map_nyc` is created by the Kepler.gl initialization cell, which must have
# been executed in the current kernel session first; otherwise this cell
# fails with a NameError.
map_nyc

NameError: name 'map_nyc' is not defined

**Map Customization Settings:**

1. **Start/End Station Colors**: Changed to solid blue (start) and red (end) for clear differentiation
2. **Arc Layer**: Enabled with Viridis color palette based on trip_count
3. **Filter Added**: trip_count filter set to 1000+ rides to highlight most significant routes
4. **Layer Management**: Removed redundant line layer to reduce clutter

## Step 5: Filter Analysis and Business Insights

In [None]:
# Analyze filtered data
# Minimum number of trips for a station pair to count as a high-frequency
# route; same value as the 1000+ trip_count filter described for the map.
MIN_TRIPS_PER_ROUTE = 1000

high_frequency_routes = df_final[df_final['trip_count'] >= MIN_TRIPS_PER_ROUTE]
top_10_routes = high_frequency_routes.nlargest(10, 'trip_count')

# save filtered data
high_frequency_routes.to_csv('high_frequency_routes.csv', index=False)

# Show the ranking so the route figures quoted in the write-up can be checked
# against the data on every run
top_10_routes

### Filter Analysis: Most Common Trips (1000+ Rides)

**Top High-Frequency Routes Identified:**
1. **Central Park S & 6 Ave** → **Central Park S & 6 Ave**: 12,041 trips
2. **7 Ave & Central Park South** → **7 Ave & Central Park South**: 8,541 trips  
3. **Roosevelt Island Tramway** → **Roosevelt Island Tramway**: 8,213 trips

**Geographic Patterns Revealed by Filter:**

**Manhattan Core Dominance:**
- The filter reveals extremely high concentration in **Midtown and Downtown Manhattan**
- **Central Park perimeter stations** show the highest trip volumes; the two busiest routes are round trips that start and end at the same Central Park South station, indicating heavy tourist/recreational usage
- **Transportation hubs** (Penn Station, Grand Central) serve as major trip origins/destinations

**Tourist Corridor Identification:**
- **Central Park South & 6th Avenue** area emerges as the busiest corridor
- **Waterfront areas** (West St, Hudson River Park) show consistent high usage
- **Theater District/Times Square** stations demonstrate high connectivity

**Commuter Flow Patterns:**
- Clear **directional patterns** from residential areas (Upper East/West Side) to business districts (Midtown)
- **Brooklyn Bridge** approach shows significant cross-borough traffic
- **Limited high-frequency routes** in outer boroughs, indicating potential expansion opportunities

**Business Implications:**
- **Resource Allocation**: Focus maintenance on Manhattan core stations showing 10,000+ annual trips
- **Redistribution Strategy**: Morning flow into business districts, evening reverse flow requires rebalancing
- **Expansion Planning**: Brooklyn/Queens waterfront areas show potential for new station development
- **Seasonal Planning**: Tourist-heavy zones need enhanced capacity during peak seasons

## Step 6: Save and Export Configuration

In [None]:
# Export the map and its configuration.
# The configuration is read from the widget once and reused for both outputs.
# NOTE(review): presumably this only contains the interactive filter/layer
# settings if the map was customized in this session - confirm before sharing.
config = map_nyc.config

# 1) Interactive HTML export (read_only=False)
map_nyc.save_to_html(
    config=config,
    read_only=False,
    file_name='nyc_bike_trips_aggregated.html',
)

# 2) JSON copy of the same configuration
with open("kepler_config.json", "w") as config_file:
    config_file.write(json.dumps(config, indent=2))

print(
    "✓ Map with filter settings exported: 'nyc_bike_trips_aggregated.html'",
    "✓ Configuration with filter preserved: 'kepler_config.json'",
    sep="\n",
)

Map saved to nyc_bike_trips_aggregated.html!
✓ Map with filter settings exported: 'nyc_bike_trips_aggregated.html'
✓ Configuration with filter preserved: 'kepler_config.json'
