# Animated Bike Route Visualization

This notebook creates an animated map showing Citi Bikes moving along actual cycling routes from start to end stations.

**Approach:**
1. Filter data to November 1st, 2024 (for now)
2. Identify unique start/end station pairs
3. Fetch cycling routes from Google Maps API for unique pairs only
4. Map routes to all trips
5. Interpolate bike positions at 30-second intervals
6. Create animated visualization with Plotly

## Setup

In [None]:
import os
import pickle
from datetime import datetime, timedelta
from math import radians, cos, sin, asin, sqrt

import googlemaps
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import polyline
from scipy.interpolate import interp1d
from tqdm.notebook import tqdm

## 1. Data Preparation

Load the filtered dataset and filter to November 1st, 2024.

In [None]:
# Read the pre-filtered Citi Bike trips, parsing both timestamp columns up front
data_path = os.path.join('..', 'data', 'columbia_filtered_citibike.csv')
df = pd.read_csv(
	data_path,
	parse_dates=['started_at', 'ended_at'],
)

# Report how many trips were loaded and the span of their start times
trip_starts = df['started_at']
print(f"Total dataset: {len(df):,} trips")
print(f"Date range: {trip_starts.min()} to {trip_starts.max()}")

In [None]:
# Keep only the trips whose start timestamp falls on the target calendar day
target_date = pd.to_datetime('2024-11-01').date()
started_on_target = df['started_at'].dt.date == target_date
nov1_df = df.loc[started_on_target].copy()

print(f"\nTrips on November 1st, 2024: {len(nov1_df):,}")

if len(nov1_df) > 0:
	print(f"Time range: {nov1_df['started_at'].min()} to {nov1_df['started_at'].max()}")
else:
	# Nothing matched: list the busiest dates so the reader can pick another day
	print("\nWarning: No trips found for November 1st, 2024.")
	print("Available dates:")
	print(df['started_at'].dt.date.value_counts().head(10))

In [None]:
# Trip duration in seconds = end timestamp minus start timestamp
trip_length = nov1_df['ended_at'] - nov1_df['started_at']
nov1_df['duration_seconds'] = trip_length.dt.total_seconds()

# Summarise durations in minutes, one statistic per line
print("\nTrip duration statistics:")
for label, stat_name in (('Mean', 'mean'), ('Median', 'median'), ('Min', 'min'), ('Max', 'max')):
	stat_seconds = getattr(nov1_df['duration_seconds'], stat_name)()
	print(f"{label}: {stat_seconds / 60:.1f} minutes")

# Breakdown of trips by rider type
print("\nUser type distribution:")
print(nov1_df['member_casual'].value_counts())

---

## 2. Identify Unique Routes

Extract unique start/end station pairs to minimize API calls.

In [None]:
# Create route key for each trip: "start_lat,start_lng_end_lat,end_lng" at 6 decimals.
# Zipping the four coordinate columns is much faster than a row-wise DataFrame.apply
# and, unlike apply(axis=1), also works when the frame has no rows.
nov1_df['route_key'] = [
	f"{start_lat:.6f},{start_lng:.6f}_{end_lat:.6f},{end_lng:.6f}"
	for start_lat, start_lng, end_lat, end_lng in zip(
		nov1_df['start_lat'], nov1_df['start_lng'], nov1_df['end_lat'], nov1_df['end_lng']
	)
]

# Identify unique routes (first trip seen for each key supplies the coordinates)
unique_routes = nov1_df[['route_key', 'start_lat', 'start_lng', 'end_lat', 'end_lng']].drop_duplicates(subset='route_key')

print(f"Total trips: {len(nov1_df):,}")
print(f"Unique routes: {len(unique_routes):,}")
if len(unique_routes) > 0:
	print(f"API call reduction: {len(nov1_df) / len(unique_routes):.1f}x")
else:
	# Avoid ZeroDivisionError when the day has no trips at all
	print("API call reduction: n/a (no trips)")

# Identify round trips (same start and end)
round_trips = unique_routes[
	(unique_routes['start_lat'] == unique_routes['end_lat']) &
	(unique_routes['start_lng'] == unique_routes['end_lng'])
]

print(f"\nRound trips (same start/end): {len(round_trips)}")
print(f"Routes needing API call: {len(unique_routes) - len(round_trips)}")

In [None]:
# Rank route keys by how many trips used them and keep the ten busiest
route_counts = nov1_df['route_key'].value_counts().head(10)

print("\nTop 10 most common routes:")
summary_lines = [f"{n_trips} trips: {key}" for key, n_trips in route_counts.items()]
for line in summary_lines:
	print(line)

---

## 3. Google Maps API Integration

Fetch cycling routes for unique start/end pairs.

**Setup Instructions:**
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
2. Create a project or select existing one
3. Enable "Directions API"
4. Create credentials (API Key)
5. Expose your API key to the notebook through the `GOOGLE_MAPS_API_KEY` environment variable (it is read in the cell below)

**Cost Estimate:**
- Directions API: $5 per 1,000 requests after $200 free credit
- For ~50 unique routes: $0.25 (effectively free with credits)

In [None]:
import os

# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so it
# never has to be written into the notebook itself.
GOOGLE_MAPS_API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")

# Start from None so `gmaps` is always defined once this cell has run,
# even when no client could be created.
gmaps = None

if not GOOGLE_MAPS_API_KEY:
	# A missing variable previously surfaced as a bare KeyError; explain the fix instead
	print("Error initializing Google Maps client: GOOGLE_MAPS_API_KEY is not set")
	print("\nPlease set the GOOGLE_MAPS_API_KEY environment variable and re-run this cell.")
else:
	# Initialize Google Maps client
	try:
		gmaps = googlemaps.Client(key=GOOGLE_MAPS_API_KEY)
		print("Google Maps client initialized successfully")
	except Exception as e:
		print(f"Error initializing Google Maps client: {e}")
		print("\nPlease check the GOOGLE_MAPS_API_KEY environment variable and re-run this cell.")

In [None]:
# Helper function: Haversine distance
def haversine(lat1, lon1, lat2, lon2):
	"""
	Calculate the great circle distance between two points
	on the earth (specified in decimal degrees).

	Args:
		lat1, lon1: Latitude/longitude of the first point, in decimal degrees
		lat2, lon2: Latitude/longitude of the second point, in decimal degrees

	Returns:
		Distance in meters (NaN if any input is NaN).
	"""
	# Convert decimal degrees to radians
	phi1, lam1, phi2, lam2 = map(radians, (lat1, lon1, lat2, lon2))

	# Haversine formula
	dlat = phi2 - phi1
	dlon = lam2 - lam1
	a = sin(dlat/2)**2 + cos(phi1) * cos(phi2) * sin(dlon/2)**2

	# Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
	# points, which would make asin() raise "math domain error". The comparison
	# form (rather than min()) keeps NaN inputs propagating as NaN.
	if a > 1.0:
		a = 1.0

	c = 2 * asin(sqrt(a))
	r = 6371000  # Radius of earth in meters

	return c * r

In [None]:
# Check if routes are already cached
cache_file = os.path.join('..', 'data', 'nov1_2024_unique_routes.pkl')

# Default to an empty cache; it is replaced below only if a readable file exists
route_cache = {}

if os.path.exists(cache_file):
	print(f"Loading cached routes from {cache_file}...")
	# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
	# Only load cache files that this notebook wrote itself.
	try:
		with open(cache_file, 'rb') as f:
			route_cache = pickle.load(f)
		print(f"Loaded {len(route_cache)} cached routes")
	except (pickle.UnpicklingError, EOFError) as e:
		# A truncated or corrupt cache (e.g. an interrupted save) should not block the run
		print(f"Could not read cache file ({e}). Will fetch from API.")
		route_cache = {}
else:
	print("No cached routes found. Will fetch from API.")

In [None]:
# Fetch routes for unique start/end pairs
def _straight_line_route(route):
	"""Fallback entry for a route: a two-point straight line and its haversine length in meters."""
	start = (route['start_lat'], route['start_lng'])
	end = (route['end_lat'], route['end_lng'])
	return {
		'coords': [start, end],
		'distance': haversine(start[0], start[1], end[0], end[1]),
		'is_round_trip': False,
		'fallback': True
	}

route_data = {}
api_calls_made = 0
cache_hits = 0  # counted explicitly; deriving it by subtraction miscounts fallbacks and cached round trips
errors = []

for _, route in tqdm(unique_routes.iterrows(), total=len(unique_routes), desc="Fetching routes"):
	route_key = route['route_key']

	# Check if already cached
	if route_key in route_cache:
		route_data[route_key] = route_cache[route_key]
		cache_hits += 1
		continue

	# Handle round trips (same start and end): a single point, no API call needed
	if route['start_lat'] == route['end_lat'] and route['start_lng'] == route['end_lng']:
		route_data[route_key] = {
			'coords': [(route['start_lat'], route['start_lng'])],
			'distance': 0,
			'is_round_trip': True
		}
		continue

	# Fetch route from Google Maps API
	try:
		directions = gmaps.directions(
			origin=(route['start_lat'], route['start_lng']),
			destination=(route['end_lat'], route['end_lng']),
			mode='bicycling'
		)
		# Count every request that completed, including ones that found no route,
		# so the cost estimate derived from this counter is not understated.
		api_calls_made += 1

		if directions:
			# Extract polyline (encoded at precision 5) and the first leg's length
			polyline_str = directions[0]['overview_polyline']['points']
			coords = polyline.decode(polyline_str, 5)
			distance = directions[0]['legs'][0]['distance']['value']  # meters

			route_data[route_key] = {
				'coords': coords,
				'distance': distance,
				'is_round_trip': False
			}
		else:
			# Fallback: straight line
			route_data[route_key] = _straight_line_route(route)
			errors.append(f"No route found for {route_key}")

	except Exception as e:
		# Fallback: straight line. The error is recorded rather than raised so one
		# bad route does not abort the whole batch.
		route_data[route_key] = _straight_line_route(route)
		errors.append(f"Error fetching {route_key}: {e}")

print(f"\nAPI calls made: {api_calls_made}")
print(f"Routes from cache: {cache_hits}")
print(f"Round trips: {len(round_trips)}")
print(f"Errors: {len(errors)}")

if errors:
	print("\nFirst 5 errors:")
	for error in errors[:5]:
		print(f"  - {error}")

In [None]:
# Save routes to cache
# Straight-line fallbacks are placeholders for lookups that failed. Persisting them
# would make the cache check serve them on every later run and never retry the API,
# so only real results (and round trips) are written.
cacheable_routes = {
	key: entry for key, entry in route_data.items()
	if not entry.get('fallback')
}

with open(cache_file, 'wb') as f:
	pickle.dump(cacheable_routes, f)

print(f"Saved {len(cacheable_routes)} routes to {cache_file}")

---

## 4. Map Routes to All Trips

Assign route data to each trip based on its route_key.

In [None]:
# Map route data to trips (one dict lookup per trip, then pull out each field).
# Trips whose key is missing from route_data get an empty path, zero distance, not a round trip.
route_info = nov1_df['route_key'].map(lambda k: route_data.get(k, {}))
nov1_df['route_coords'] = route_info.map(lambda entry: entry.get('coords', []))
nov1_df['route_distance'] = route_info.map(lambda entry: entry.get('distance', 0))
nov1_df['is_round_trip'] = route_info.map(lambda entry: entry.get('is_round_trip', False))

# Calculate straight-line distance for comparison
nov1_df['straight_distance'] = nov1_df.apply(
	lambda row: haversine(row['start_lat'], row['start_lng'], row['end_lat'], row['end_lng']),
	axis=1
)

# Calculate average speed (m/s). Non-positive durations become NaN so they drop
# out of the statistics instead of producing inf and poisoning the mean.
valid_duration = nov1_df['duration_seconds'].where(nov1_df['duration_seconds'] > 0)
nov1_df['avg_speed_mps'] = nov1_df['route_distance'] / valid_duration
nov1_df['avg_speed_mph'] = nov1_df['avg_speed_mps'] * 2.23694  # Convert to mph

# Route/straight ratio, ignoring trips with zero straight-line distance (division by zero)
valid_straight = nov1_df['straight_distance'].where(nov1_df['straight_distance'] > 0)
distance_ratio = nov1_df['route_distance'] / valid_straight

print(f"Trips with routes: {(nov1_df['route_coords'].str.len() > 0).sum()}")
print(f"\nRoute distance vs straight-line distance:")
print(f"Average ratio: {distance_ratio.mean():.2f}x")
print(f"\nAverage speed statistics:")
print(f"Mean: {nov1_df['avg_speed_mph'].mean():.1f} mph")
print(f"Median: {nov1_df['avg_speed_mph'].median():.1f} mph")

In [None]:
# A trip can be animated only if it is not a round trip and its path has at least two points
is_one_way = ~nov1_df['is_round_trip']
has_path = nov1_df['route_coords'].str.len() > 1
animatable_trips = nov1_df.loc[is_one_way & has_path].copy()

skipped_count = len(nov1_df) - len(animatable_trips)
print(f"Trips to animate: {len(animatable_trips)} (excluding {skipped_count} round trips or invalid routes)")

---

## 5. Position Interpolation

Interpolate bike positions along routes at 30-second intervals (the helper defaults to 60 seconds; the loop below passes `time_step=30`).

In [None]:
def interpolate_trip_positions(coords, start_time, duration_seconds, time_step=60):
	"""
	Interpolate positions along route at regular time intervals.

	The bike is assumed to move at constant speed (route length / duration).
	Positions are emitted every `time_step` seconds starting at the trip start;
	the final sample may fall up to one step after the trip end, in which case
	the position is held at the last route point.

	Args:
		coords: List of (lat, lon) tuples
		start_time: Trip start timestamp
		duration_seconds: Trip duration in seconds
		time_step: Seconds between position updates (default: 60)

	Returns:
		List of dicts with 'timestamp', 'lat', 'lon'. Empty when the route has
		fewer than two points, has zero length, or the duration is not a
		positive number.
	"""
	if len(coords) < 2:
		return []

	# `not (x > 0)` rejects zero and negative durations and also NaN, all of which
	# would otherwise cause a division by zero, extrapolation, or int(NaN) error.
	if not duration_seconds > 0:
		return []

	# Walk the polyline accumulating distance. Zero-length segments (repeated
	# vertices) are skipped so the distance axis is strictly increasing, which
	# linear interpolation needs to be well defined.
	lats = [coords[0][0]]
	lons = [coords[0][1]]
	distances = [0.0]
	for i in range(1, len(coords)):
		segment = haversine(coords[i-1][0], coords[i-1][1], coords[i][0], coords[i][1])
		if segment > 0:
			distances.append(distances[-1] + segment)
			lats.append(coords[i][0])
			lons.append(coords[i][1])

	total_distance = distances[-1]

	if total_distance == 0:
		return []

	# Constant speed along the route
	speed = total_distance / duration_seconds  # meters per second

	# Sample times (seconds since start) and the distance covered at each, capped at the route end
	offsets = list(range(0, int(duration_seconds) + time_step, time_step))
	traveled = [min(speed * t, total_distance) for t in offsets]

	# Linear interpolation of latitude and longitude against distance travelled
	lat_at = np.interp(traveled, distances, lats)
	lon_at = np.interp(traveled, distances, lons)

	return [
		{
			'timestamp': start_time + pd.Timedelta(seconds=t),
			'lat': float(lat),
			'lon': float(lon)
		}
		for t, lat, lon in zip(offsets, lat_at, lon_at)
	]

In [None]:
# Build one record per (trip, sampled position); the frame is assembled once at the end
animation_data = []

trip_iter = tqdm(animatable_trips.iterrows(), total=len(animatable_trips), desc="Interpolating positions")
for idx, trip in trip_iter:
	# Sample this trip's path every 30 seconds
	track = interpolate_trip_positions(
		trip['route_coords'],
		trip['started_at'],
		trip['duration_seconds'],
		time_step=30
	)

	# Each sample carries the trip's identifying fields alongside its position
	animation_data.extend(
		{
			'trip_id': idx,
			'timestamp': pos['timestamp'],
			'lat': pos['lat'],
			'lon': pos['lon'],
			'member_casual': trip['member_casual'],
			'start_station': trip.get('start_station_name', 'Unknown'),
			'end_station': trip.get('end_station_name', 'Unknown')
		}
		for pos in track
	)

animation_df = pd.DataFrame(animation_data)

point_count = len(animation_df)
trip_count = len(animatable_trips)
print(f"\nGenerated {point_count:,} position points for {trip_count} trips")
print(f"Average points per trip: {point_count / trip_count:.1f}")
print(f"\nTime range: {animation_df['timestamp'].min()} to {animation_df['timestamp'].max()}")

In [None]:
# Snap every position to the start of its 30-second window; each window is one animation frame
animation_df['time_bin'] = animation_df['timestamp'].dt.floor('30s')

# Number of distinct trips with a position in each window
active_bikes = animation_df.groupby('time_bin')['trip_id'].nunique()

frame_count = len(active_bikes)
print(f"\nAnimation frames: {frame_count}")
print(f"Max concurrent bikes: {active_bikes.max()}")
print(f"Mean concurrent bikes: {active_bikes.mean():.1f}")

---

## 6. Animated Visualization

Create the animated map showing bikes moving along routes.

In [None]:
# Plotly Express builds animation frames in the order the frame values first
# appear in the data. The positions were generated trip by trip, so sort them by
# time bin first to guarantee the frames play chronologically.
frames_df = animation_df.sort_values('time_bin', kind='stable')

# Create animated scatter map
fig = px.scatter_map(
	frames_df,
	lat='lat',
	lon='lon',
	animation_frame='time_bin',
	animation_group='trip_id',
	color='member_casual',
	color_discrete_map={'member': '#1f77b4', 'casual': '#ff7f0e'},
	hover_data={'start_station': True, 'end_station': True, 'lat': ':.5f', 'lon': ':.5f'},
	zoom=12,
	height=800,
	title='Citi Bike Animation - November 1st, 2024 (Columbia Area)'
)

# Update map style and center
fig.update_layout(
	map_style='carto-darkmatter',
	map_center={'lat': 40.78, 'lon': -73.97}
)

# Update marker appearance
fig.update_traces(marker=dict(size=10, opacity=0.8))

# Adjust animation speed (milliseconds per frame / per transition).
# The play button only exists when the figure has animation controls, so guard
# the lookup instead of failing with an IndexError on a single-frame figure.
if fig.layout.updatemenus:
	fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 50
	fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 50

fig.show()

---

## 7. Summary Statistics

In [None]:
# Final recap of the run: data volume, API usage, animation size, distances and rider mix
print("=" * 80)
print("ANIMATED BIKE ROUTE VISUALIZATION - SUMMARY")
print("=" * 80)

print(f"\nDate: November 1st, 2024")
print(f"\nData Summary:")
print(f"  Total trips on Nov 1st: {len(nov1_df):,}")
print(f"  Trips animated: {len(animatable_trips):,}")
print(f"  Round trips excluded: {nov1_df['is_round_trip'].sum():,}")

print(f"\nAPI Efficiency:")
print(f"  Unique routes: {len(unique_routes):,}")
print(f"  API calls made: {api_calls_made:,}")
print(f"  Trips per unique route: {len(nov1_df) / len(unique_routes):.1f}x")
print(f"  Estimated API cost: ${api_calls_made * 0.005:.2f}")  # $5 per 1,000 requests

print(f"\nAnimation Details:")
print(f"  Position points generated: {len(animation_df):,}")
print(f"  Animation frames: {len(active_bikes):,}")
# Positions are floored to 30-second bins, so report that rather than "1 minute"
print(f"  Time bin interval: 30 seconds")
print(f"  Time range: {animation_df['time_bin'].min().strftime('%H:%M')} - {animation_df['time_bin'].max().strftime('%H:%M')}")
print(f"  Max concurrent bikes: {active_bikes.max()}")

# Ignore zero straight-line distances so a division by zero cannot turn the mean into inf
valid_straight = nov1_df['straight_distance'].where(nov1_df['straight_distance'] > 0)
route_ratio = nov1_df['route_distance'] / valid_straight

print(f"\nDistance & Speed:")
print(f"  Avg route distance: {nov1_df['route_distance'].mean():.0f} meters")
print(f"  Avg straight distance: {nov1_df['straight_distance'].mean():.0f} meters")
print(f"  Route/straight ratio: {route_ratio.mean():.2f}x")
print(f"  Avg bike speed: {nov1_df['avg_speed_mph'].mean():.1f} mph")

# Compute each share from its own count; 100 - member_pct would silently fold
# any missing or unexpected rider type into the casual percentage.
member_count = (nov1_df['member_casual'] == 'member').sum()
casual_count = (nov1_df['member_casual'] == 'casual').sum()
member_pct = member_count / len(nov1_df) * 100
casual_pct = casual_count / len(nov1_df) * 100

print(f"\nUser Distribution:")
print(f"  Members: {member_count:,} ({member_pct:.1f}%)")
print(f"  Casual: {casual_count:,} ({casual_pct:.1f}%)")

print("\n" + "=" * 80)