In [4]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Read the 2022 CSV (CitiBike trips with weather columns, going by the
# file name) into the working DataFrame and report its (rows, columns) shape.
DATA_PATH = "../temp_storage/data_raw/citibike_weather_2022.csv"
df = pd.read_csv(DATA_PATH)
print("Data loaded:", df.shape)


Data loaded: (895485, 17)


In [10]:
import plotly.express as px

# Top 20 start stations ranked by number of trips.
#
# value_counts() returns a Series indexed by station name and sorted by
# descending count. The labels produced by a bare reset_index() differ between
# pandas 1.x ("index" / series name) and pandas 2.x (series name / "count"),
# so both output columns are named explicitly here instead of being patched
# up after the fact:
#   - rename_axis() names the index, which becomes the "station_name" column
#   - reset_index(name=...) names the counts column "trip_count"
top_stations = (
    df["start_station_name"]
    .value_counts()
    .head(20)
    .rename_axis("station_name")
    .reset_index(name="trip_count")
)

# Quick sanity check of the resulting schema and the leading rows
print(top_stations.columns)
print(top_stations.head())

# Interactive bar chart: one bar per station, bar height and colour both
# driven by the trip count (continuous "Blues" scale).
fig_bar = px.bar(
    data_frame=top_stations,
    x="station_name",
    y="trip_count",
    color="trip_count",
    color_continuous_scale="Blues",
    title="Top 20 Most Popular Start Stations – CitiBike 2022",
)

# Presentation tweaks: readable axis titles, slanted station labels,
# white template, and a horizontally centred title.
bar_layout = dict(
    template="plotly_white",
    title=dict(x=0.5),
    xaxis=dict(title=dict(text="Station Name"), tickangle=45),
    yaxis=dict(title=dict(text="Number of Trips")),
)
fig_bar.update_layout(**bar_layout)

fig_bar.show()


Index(['station_name', 'trip_count'], dtype='object')
                                   station_name  trip_count
0                                 Grove St PATH       42556
1  South Waterfront Walkway - Sinatra Dr & 1 St       34245
2       Hoboken Terminal - River St & Hudson Pl       33020
3      Hoboken Terminal - Hudson St & Hudson Pl       30244
4              City Hall - Washington St & 1 St       23289


In [11]:
import plotly.graph_objects as go

# Parse 'date' into datetime64 so grouping and the time axis use real dates
df["date"] = pd.to_datetime(df["date"])

# A single grouping by date feeds both daily aggregates below
trips_by_date = df.groupby("date")

# Number of trip rows recorded on each date
daily_trips = trips_by_date.size().reset_index(name="trip_count")

# Mean of 'avgTemp' across the rows of each date
daily_temp = trips_by_date["avgTemp"].mean().reset_index()

# Join the two daily tables on 'date': one row per date with both measures
daily_data = daily_trips.merge(daily_temp, on="date")

# Dual-axis line chart: daily trip counts on the left axis, daily mean
# temperature on a second y-axis drawn on the right.

# Left axis (default "y"): trips per day
trips_trace = go.Scatter(
    x=daily_data["date"],
    y=daily_data["trip_count"],
    name="Daily Trips",
    mode="lines",
    line={"color": "royalblue"},
)

# Right axis ("y2"): average temperature per day
# NOTE(review): the labels assume 'avgTemp' is in °C — confirm against the data source.
temp_trace = go.Scatter(
    x=daily_data["date"],
    y=daily_data["avgTemp"],
    name="Average Temperature (°C)",
    mode="lines",
    line={"color": "tomato"},
    yaxis="y2",
)

fig_dual = go.Figure(data=[trips_trace, temp_trace])

# Layout: centred title, labelled axes, and a secondary y-axis that overlays
# the primary one; the legend is pinned near the top-left of the plot area.
fig_dual.update_layout(
    template="plotly_white",
    title={"text": "Daily CitiBike Trips vs Average Temperature (2022)", "x": 0.5},
    xaxis={"title": {"text": "Date"}},
    yaxis={"title": {"text": "Trip Count"}},
    yaxis2={
        "title": {"text": "Temperature (°C)"},
        "overlaying": "y",
        "side": "right",
    },
    legend={"x": 0.02, "y": 0.98},
)

fig_dual.show()
