## Step 1: Import Libraries and Set Up

In [4]:
# import necessary libraries

import streamlit as st
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime as dt
import plotly.express as px
from streamlit_keplergl import keplergl_static
import os

## Step 2: Load and Explore Data

In [5]:
# Read the processed Citi Bike trips file into memory and report its basic shape

DATA_PATH = "../data/processed/nyc_citibike_2022_processed.csv"
print(f"Loading data from: {DATA_PATH}")

# low_memory=False: parse each column in one pass instead of chunk-by-chunk type inference
df = pd.read_csv(DATA_PATH, low_memory=False)
row_total, column_total = df.shape
print(f"Dataset loaded: {row_total:,} rows, {column_total} columns")

# Summarise the loaded frame: covered dates, categorical fields, memory footprint
first_date, last_date = df['date'].min(), df['date'].max()
print(f"Date range: {first_date} to {last_date}")
print(f"Available categorical variables: {['member_casual', 'rideable_type']}")
bytes_in_memory = df.memory_usage(deep=True).sum()
print(f"Memory usage: {bytes_in_memory / 1024**2:.2f} MB")

Loading data from: ../data/processed/nyc_citibike_2022_processed.csv
Dataset loaded: 29,838,806 rows, 17 columns
Date range: 2021-01-30 to 2022-12-31
Available categorical variables: ['member_casual', 'rideable_type']
Memory usage: 24161.30 MB


## Step 3: Data Preparation for Visualization

In [6]:
# Give every row a constant counter so trips can be tallied with groupby aggregations
df['trip_count'] = 1

# Parse the start timestamps once, then derive the calendar date from the parsed values
start_times = pd.to_datetime(df['started_at'])
df['started_at'] = start_times
df['date'] = start_times.dt.date

first_day, last_day = df['date'].min(), df['date'].max()
print("✓ Data preparation completed")
print(f"Total trips in dataset: {len(df):,}")
print(f"Date range: {first_day} to {last_day}")

✓ Data preparation completed
Total trips in dataset: 29,838,806
Date range: 2021-01-30 to 2022-12-31


## Step 4: Create Top Stations Bar Chart

### Bar Chart: Top 20 Most Popular Stations
This chart uses the complete processed 2022 dataset (every trip, not a sample) to rank the busiest starting stations, so that resource-allocation decisions are based on full trip counts.

In [7]:
# Count trips per start station over the whole frame, then keep the 20 busiest

station_trips = df.groupby('start_station_name')['trip_count'].count().reset_index()
top_20_stations = station_trips.nlargest(20, 'trip_count')

# nlargest returns rows in descending order, so the first row is the busiest station
busiest_station = top_20_stations.iloc[0]
top_20_total = top_20_stations['trip_count'].sum()

print(f"✓ Top 20 stations identified from {len(station_trips):,} total stations")
print(f"Most popular station: {busiest_station['start_station_name']}")
print(f"Trips at top station: {busiest_station['trip_count']:,}")
print(f"Total trips across top 20 stations: {top_20_total:,}")

# Build the bar trace; bars are shaded by their own trip count
bar_trace = go.Bar(
    x=top_20_stations['start_station_name'],
    y=top_20_stations['trip_count'],
    marker=dict(
        color=top_20_stations['trip_count'],
        colorscale='Blues',
    ),
)
fig_bar = go.Figure(data=[bar_trace])

# Titles, fixed canvas size, and slanted tick labels so long station names stay readable
bar_layout = dict(
    title='Top 20 Most Popular Bike Stations in NYC (Full 2022 Data)',
    xaxis_title='Start Stations',
    yaxis_title='Number of Trips',
    width=1000,
    height=600,
    xaxis_tickangle=-45,
    template='plotly_white',
)
fig_bar.update_layout(**bar_layout)

# Render the figure in the notebook
fig_bar.show()

# Persist the ranking so the dashboard can load it without the full dataset
top_20_stations.to_csv('top_20_stations_full.csv', index=False)
print("✓ Bar chart created and data saved to 'top_20_stations_full.csv'")

✓ Top 20 stations identified from 1,761 total stations
Most popular station: W 21 St & 6 Ave
Trips at top station: 129,018
Total trips across top 20 stations: 1,944,653


✓ Bar chart created and data saved to 'top_20_stations_full.csv'


## Step 5: Create Daily Aggregated Line Chart

In [8]:
# Build one row per calendar day: total trips plus a daily temperature value.
# After this cell, daily_aggregated['date'] is datetime64 on BOTH branches,
# which the summary prints below (and the line chart) rely on.
print(" Preparing daily aggregated data...")

# Aggregate data by date
daily_aggregated = df.groupby('date').agg({
    'trip_count': 'sum'
}).reset_index()

daily_aggregated.columns = ['date', 'daily_trips']

# Check for temperature data in the full dataset
if 'temperature' in df.columns:
    # Real measurements are available: average them per day and attach them.
    temp_data = df.groupby('date')['temperature'].mean().reset_index()
    daily_aggregated = daily_aggregated.merge(temp_data, on='date')
    # Convert only AFTER the merge so both join keys still share the same type.
    # Previously this branch left 'date' as plain datetime.date objects (set via
    # .dt.date during data preparation), and the `.min().date()` call in the
    # summary below raised AttributeError.
    daily_aggregated['date'] = pd.to_datetime(daily_aggregated['date'])
    print(" Using actual temperature data from dataset")
else:
    # No measurements: synthesise temperatures from approximate monthly averages.
    # The fixed seed keeps the generated series identical across runs.
    np.random.seed(42)
    daily_aggregated['date'] = pd.to_datetime(daily_aggregated['date'])
    daily_aggregated['month'] = daily_aggregated['date'].dt.month

    # NYC average temperatures by month (approximate)
    monthly_temps = {1: 32, 2: 35, 3: 42, 4: 53, 5: 63, 6: 72,
                     7: 77, 8: 76, 9: 68, 10: 57, 11: 48, 12: 38}

    # Monthly baseline plus Gaussian noise (mean 0, standard deviation 5)
    daily_aggregated['base_temp'] = daily_aggregated['month'].map(monthly_temps)
    daily_aggregated['temperature'] = daily_aggregated['base_temp'] + np.random.normal(0, 5, len(daily_aggregated))
    # Only the helper baseline is dropped; the 'month' column stays in the frame.
    daily_aggregated = daily_aggregated.drop('base_temp', axis=1)
    print("⚠ Created realistic seasonal temperature data for analysis")

print(f"Daily aggregation completed: {len(daily_aggregated)} days")
print(f" Date range: {daily_aggregated['date'].min().date()} to {daily_aggregated['date'].max().date()}")
print(f" Total trips in period: {daily_aggregated['daily_trips'].sum():,}")
print(f" Temperature range: {daily_aggregated['temperature'].min():.1f}°F to {daily_aggregated['temperature'].max():.1f}°F")

 Preparing daily aggregated data...
⚠ Created realistic seasonal temperature data for analysis
Daily aggregation completed: 402 days
 Date range: 2021-01-30 to 2022-12-31
 Total trips in period: 29,838,806
 Temperature range: 21.9°F to 91.3°F


## Step 6: Create Dual-Axis Line Chart

### Dual-Axis Line Chart: Daily Trips vs Temperature
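This chart plots daily trip counts from the complete processed 2022 dataset against daily temperature to show seasonal patterns in bike usage. Note: when the dataset has no `temperature` column, Step 5 substitutes synthetic temperatures (approximate monthly NYC averages plus random noise), so the temperature line illustrates seasonality rather than measured weather impact.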

In [9]:
# Plot daily trip volume (left axis) against temperature (right axis) on a shared date axis
trips_color = '#1f77b4'
temperature_color = '#ff7f0e'

# Trace for the primary (left) y-axis: trips per day
trips_trace = go.Scatter(
    x=daily_aggregated['date'],
    y=daily_aggregated['daily_trips'],
    name='Daily Bike Trips',
    line={'color': trips_color, 'width': 3},
    hovertemplate='<b>Date: %{x}</b><br>Trips: %{y:,}<extra></extra>',
)

# Trace for the secondary (right) y-axis: temperature per day
temperature_trace = go.Scatter(
    x=daily_aggregated['date'],
    y=daily_aggregated['temperature'],
    name='Average Temperature (°F)',
    line={'color': temperature_color, 'width': 2},
    hovertemplate='<b>Date: %{x}</b><br>Temperature: %{y:.1f}°F<extra></extra>',
)

# A single subplot cell with a secondary y-axis enabled; trips are added first
fig_line = make_subplots(specs=[[{"secondary_y": True}]])
fig_line.add_trace(trips_trace, secondary_y=False)
fig_line.add_trace(temperature_trace, secondary_y=True)

# Figure-level styling; the legend is laid out horizontally above the plot area
legend_style = {
    'orientation': "h",
    'yanchor': "bottom",
    'y': 1.02,
    'xanchor': "right",
    'x': 1,
}
fig_line.update_layout(
    title='Daily Bike Trips vs Temperature in NYC (Full 2022 Data)',
    xaxis_title='Date',
    width=1000,
    height=600,
    template='plotly_white',
    legend=legend_style,
)

# Title each y-axis in the colour of the trace it measures
axis_settings = (
    (False, "Daily Trips", trips_color),
    (True, "Temperature (°F)", temperature_color),
)
for on_secondary, axis_title, axis_color in axis_settings:
    fig_line.update_yaxes(
        title_text=axis_title,
        secondary_y=on_secondary,
        title_font={'color': axis_color},
    )

fig_line.show()

# Persist the per-day table so the dashboard can load it without the full dataset
daily_aggregated.to_csv('daily_aggregated_data_full.csv', index=False)
print("✓ Line chart created and data saved to 'daily_aggregated_data_full.csv'")

✓ Line chart created and data saved to 'daily_aggregated_data_full.csv'


## Step 7: Create Additional Analytics

In [10]:
# Derive two summary tables from the full frame: trips per calendar month and
# per-station activity with coordinates, then write both to CSV.
print(" Creating additional analytics...")

# Trips per month number (rows from different years that share a month number are pooled)
df['month'] = df['started_at'].dt.month
monthly_trips = df.groupby('month', as_index=False)['trip_count'].sum()

# Human-readable month labels, keyed by month number
month_names = dict(zip(
    range(1, 13),
    ['January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'],
))
monthly_trips['month_name'] = monthly_trips['month'].map(month_names)

# Per-station totals plus the first latitude/longitude recorded for each station;
# named aggregation yields the final column names directly
station_activity = (
    df.groupby('start_station_name')
    .agg(
        total_trips=('trip_count', 'count'),
        latitude=('start_lat', 'first'),
        longitude=('start_lng', 'first'),
    )
    .reset_index()
    .rename(columns={'start_station_name': 'station_name'})
)

# Write both tables for the dashboard
monthly_trips.to_csv('monthly_trips_full.csv', index=False)
station_activity.to_csv('station_activity_full.csv', index=False)

# Row label of the month with the highest trip total
peak_row = monthly_trips['trip_count'].idxmax()

print("Additional analytics created from full dataset")
print(f"Total stations in analysis: {len(station_activity):,}")
print(f"Monthly trips data points: {len(monthly_trips)}")
print(f"Peak month: {monthly_trips.loc[peak_row, 'month_name']}")

 Creating additional analytics...
Additional analytics created from full dataset
Total stations in analysis: 1,761
Monthly trips data points: 12
Peak month: August


In [11]:
# NOTE(review): this cell recomputes the same two aggregates as the preceding
# analytics cell and writes to the same two file names, but without the
# 'month_name' column and without renaming the station columns. Because it runs
# later, the CSVs left on disk carry THIS cell's schema
# (month, trip_count / start_station_name, trip_count, start_lat, start_lng).
# Confirm which schema the consumers expect, then remove one of the two cells.

# Trips per calendar-month number
df['month'] = df['started_at'].dt.month
monthly_trips = df.groupby('month', as_index=False)['trip_count'].sum()
monthly_trips.to_csv('monthly_trips_full.csv', index=False)

# Per-station trip count plus the first coordinates recorded for each station
station_activity = (
    df.groupby('start_station_name')
    .agg(
        trip_count=('trip_count', 'count'),
        start_lat=('start_lat', 'first'),
        start_lng=('start_lng', 'first'),
    )
    .reset_index()
)
station_activity.to_csv('station_activity_full.csv', index=False)

In [12]:
# Build a slimmed-down, independent copy holding only the columns kept for export
reduced_columns = [
    'started_at',
    'start_station_name',
    'start_lat',
    'start_lng',
    'member_casual',
    'rideable_type',
]
df_reduced = df.loc[:, reduced_columns].copy()

# Make sure the timestamps are datetimes, then derive the month number
# (the season label is computed from this month column further down)
reduced_start_times = pd.to_datetime(df_reduced['started_at'])
df_reduced['started_at'] = reduced_start_times
df_reduced['month'] = reduced_start_times.dt.month

def get_season(month):
    """Return the season name for a month number.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Spring'; 6-8 -> 'Summer';
    every other value (including 9-11) -> 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not matched above falls through to Fall
    return 'Fall'

# Label each trip with its season and add a per-trip counter for later aggregation
df_reduced['season'] = df_reduced['month'].apply(get_season)
df_reduced['trip_count'] = 1

# Create random sample (8% of data) with seed 32: each row is kept
# independently when its uniform draw is <= 0.08, so the sample size is ~8%
np.random.seed(32)
sample_mask = np.random.rand(len(df_reduced)) <= 0.08
df_small = df_reduced[sample_mask]

print(f"Original: {len(df):,} rows")
print(f"Reduced: {len(df_small):,} rows")
# This ratio is a row fraction, not a file size, so it is labelled as such
print(f"Sample fraction: {len(df_small)/len(df)*100:.1f}% of original")

# Save reduced dataset
reduced_csv_path = 'nyc_citibike_reduced.csv'
df_small.to_csv(reduced_csv_path, index=False)
print("✓ Reduced dataset saved")

# Measure the written file instead of assuming it meets the 25 MB target
size_limit_mb = 25
reduced_size_mb = os.path.getsize(reduced_csv_path) / 1024**2
if reduced_size_mb < size_limit_mb:
    print(f"✓ File size: {reduced_size_mb:.1f} MB (under {size_limit_mb} MB)")
else:
    print(f"⚠ File size: {reduced_size_mb:.1f} MB exceeds {size_limit_mb} MB; lower the sample fraction or drop columns")

Original: 29,838,806 rows
Reduced: 2,388,258 rows
File size will be under 25MB: 8.0% of original
✓ Reduced dataset saved
