# In [None]:
## Bike Demand Analysis — NYC CitiBike Jan–Mar 2025
# Analyzes the effect of rain and weekends on Jan–Feb ridership, then tests ML models trained on those months against March data.


# --- SETUP ---
import pandas as pd
import matplotlib.pyplot as plt
import requests
from scipy.stats import ttest_ind

# READ CITIBIKE TRIP DATA
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk (which can yield mixed-type columns and a
# DtypeWarning); this matches how the March file is loaded further down.
bike_df = pd.read_csv("citibike_merged.csv", low_memory=False)
bike_df["started_at"] = pd.to_datetime(bike_df["started_at"])
# Calendar day of each ride start; used as the grouping/join key for daily counts.
bike_df["date"] = bike_df["started_at"].dt.date

# FETCH WEATHER DATA (Jan+Feb 2025)
# Requests daily max/min temperature and precipitation totals for the
# coordinates below from the Open-Meteo archive endpoint.
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 40.71,
    "longitude": -74.01,
    "start_date": "2025-01-01",
    "end_date": "2025-02-28",
    "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
    "timezone": "America/New_York",
}

# Without a timeout requests can block forever on a stalled connection.
# Network errors and non-JSON bodies are routed to the same "no data"
# fallback below instead of aborting the whole script.
try:
    response = requests.get(url, params=params, timeout=30)
    weather_json = response.json()
except (requests.RequestException, ValueError) as exc:
    print(f" Weather request failed: {exc}")
    weather_json = {}

if "daily" in weather_json:
    # One row per day; the API's "time" column becomes the "date" join key.
    weather_df = pd.DataFrame(weather_json["daily"])
    weather_df.rename(columns={"time": "date"}, inplace=True)
    weather_df["date"] = pd.to_datetime(weather_df["date"]).dt.date
    # A day counts as rainy when its precipitation total exceeds 1.0
    # (presumably millimetres, Open-Meteo's default unit — confirm).
    weather_df["rainy"] = weather_df["precipitation_sum"] > 1.0
else:
    print(" Failed to retrieve daily weather data.")
    # Sentinel checked by the analysis section to skip itself.
    weather_df = None

#ANALYSIS
def _plot_group_boxplot(groups, group_names, title, outfile):
    """Draw, save and show a boxplot comparing daily ride counts per group.

    Args:
        groups: Sequence of trip-count series, one per box.
        group_names: Tick label for each box, in the same order as *groups*.
        title: Figure title.
        outfile: Path the figure is saved to before being shown.
    """
    plt.figure(figsize=(8, 6))
    plt.boxplot(groups)
    # Boxes are drawn at x = 1..N by default. Labelling them through xticks
    # avoids the `labels=` keyword, which Matplotlib 3.9 deprecated in favour
    # of `tick_labels=`, so this works on both old and new releases.
    plt.xticks(list(range(1, len(group_names) + 1)), group_names)
    plt.title(title)
    plt.ylabel("Number of Bike Rides")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(outfile)
    plt.show()


def _report_welch_ttest(effect_name, group_a, group_b,
                        significant_msg, not_significant_msg):
    """Run Welch's t-test on two samples, print the verdict, return (t, p).

    Args:
        effect_name: Label inserted into the printed statistic lines.
        group_a: First sample of daily trip counts.
        group_b: Second sample of daily trip counts.
        significant_msg: Line printed when p < 0.05.
        not_significant_msg: Line printed when p >= 0.05.

    Returns:
        Tuple ``(t_statistic, p_value)`` as returned by ``ttest_ind``.
    """
    t_stat, p_val = ttest_ind(group_a, group_b, equal_var=False)
    print(f"\nT-Statistic ({effect_name}): {t_stat:.2f}")
    print(f"P-Value ({effect_name}): {p_val:.4f}")
    if pd.isna(p_val):
        # A NaN p-value (e.g. a group with too few days) compares False
        # against 0.05 and would otherwise be reported as "no difference".
        print(" Test inconclusive: not enough data in one of the groups.")
    elif p_val < 0.05:
        print(significant_msg)
    else:
        print(not_significant_msg)
    return t_stat, p_val


if weather_df is not None:
    # Count rides per day
    daily_trips = bike_df.groupby("date").size().reset_index(name="trip_count")

    # Left-merge onto the weather calendar so that days without any recorded
    # ride are kept and counted as 0 trips.
    merged = pd.merge(weather_df, daily_trips, on="date", how="left")
    merged["trip_count"] = merged["trip_count"].fillna(0).astype(int)

    # Add weekend info
    merged["weekend"] = pd.to_datetime(merged["date"]).dt.weekday >= 5  # Saturday=5, Sunday=6

    # Split the daily counts once; the same series feed the boxplots and the
    # hypothesis tests, and `rainy` / `non_rainy` are read again by the
    # summary at the end of the script.
    rainy = merged.loc[merged["rainy"], "trip_count"]
    non_rainy = merged.loc[~merged["rainy"], "trip_count"]
    weekend_days = merged.loc[merged["weekend"], "trip_count"]
    weekday_days = merged.loc[~merged["weekend"], "trip_count"]

    #PLOTTING

    ## (1) Daily Total Trips (Rainy Days Highlighted)
    plt.figure(figsize=(14, 6))
    colors = ["red" if r else "steelblue" for r in merged["rainy"]]
    plt.bar(merged["date"], merged["trip_count"], color=colors)
    plt.title("Daily Total Bike Rides - Jan + Feb 2025 (Red = Rainy Day)")
    plt.xlabel("Date")
    plt.ylabel("Number of Bike Rides")
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("daily_trip_counts.png")
    plt.show()

    ## (2) Boxplot: Dry vs Rainy Days
    data_rain = [non_rainy, rainy]
    _plot_group_boxplot(
        data_rain,
        ["Dry Days", "Rainy Days"],
        "Bike Rides Distribution - Dry vs Rainy Days",
        "boxplot_rainy_vs_dry.png",
    )

    ## (3) Boxplot: Weekday vs Weekend
    data_weekend = [weekday_days, weekend_days]
    _plot_group_boxplot(
        data_weekend,
        ["Weekday", "Weekend"],
        "Bike Rides Distribution - Weekday vs Weekend",
        "boxplot_weekday_vs_weekend.png",
    )

    #HYPOTHESIS TESTING

    ## (a) Rainy Days vs Dry Days
    t_stat_rain, p_val_rain = _report_welch_ttest(
        "Rain Effect",
        rainy,
        non_rainy,
        " Statistically significant difference: Rain impacts bike rentals.",
        " No significant difference: Rain does not significantly affect rentals.",
    )

    ## (b) Weekend vs Weekday
    t_stat_weekend, p_val_weekend = _report_welch_ttest(
        "Weekend Effect",
        weekend_days,
        weekday_days,
        " Statistically significant difference: Weekends impact bike rentals.",
        " No significant difference: Weekends do not significantly affect rentals.",
    )
else:
    print(" Skipping analysis because weather data could not be retrieved.")


# ML PREDICTIONS

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
import matplotlib.dates as mdates

print("Loading March data...")
# --- LOAD MARCH DATA ---
# Read the March trip file, parse the ride start timestamps once, and derive
# the calendar day of each ride from the parsed values.
march_df = pd.read_csv("citibike_march.csv", low_memory=False)
march_start_times = pd.to_datetime(march_df["started_at"])
march_df["started_at"] = march_start_times
march_df["date"] = march_start_times.dt.date

# --- FETCH MARCH WEATHER DATA ---
# Same Open-Meteo archive query as for Jan+Feb, restricted to March 2025.
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 40.71,
    "longitude": -74.01,
    "start_date": "2025-03-01",
    "end_date": "2025-03-31",
    "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum",
    "timezone": "America/New_York",
}

# Without a timeout requests can block forever on a stalled connection.
# Network errors and non-JSON bodies fall through to the "could not fetch"
# branch below instead of aborting the script.
try:
    response = requests.get(url, params=params, timeout=30)
    weather_march = response.json()
except (requests.RequestException, ValueError) as exc:
    print(f" March weather request failed: {exc}")
    weather_march = {}

if "daily" not in weather_march:
    print(" Could not fetch weather data for March.")
elif weather_df is None:
    # The training frame `merged` is only built when the Jan+Feb weather
    # download succeeded; without it there is nothing to fit the models on.
    print(" Skipping ML predictions because the Jan+Feb training data is unavailable.")
else:
    weather_df_march = pd.DataFrame(weather_march["daily"])
    weather_df_march.rename(columns={"time": "date"}, inplace=True)
    weather_df_march["date"] = pd.to_datetime(weather_df_march["date"]).dt.date

    # --- MERGE WEATHER + BIKE TRIPS ---
    # Left-merge onto the weather calendar so days without rides count as 0.
    trip_counts_march = march_df.groupby("date").size().reset_index(name="trip_count")
    merged_march = pd.merge(weather_df_march, trip_counts_march, on="date", how="left")
    merged_march["trip_count"] = merged_march["trip_count"].fillna(0).astype(int)

    # --- FEATURE ---
    def add_features(df):
        """Return a copy of *df* with calendar and temperature features added.

        Expects the columns ``date``, ``temperature_2m_max`` and
        ``temperature_2m_min``. Adds ``dayofweek`` (Monday=0 ... Sunday=6),
        ``is_weekend`` (Saturday/Sunday) and ``temp_range`` (max - min).
        The input frame is left untouched.
        """
        df = df.copy()
        df['dayofweek'] = pd.to_datetime(df['date']).dt.weekday
        df['is_weekend'] = df['dayofweek'] >= 5
        df['temp_range'] = df['temperature_2m_max'] - df['temperature_2m_min']
        return df

    merged = add_features(merged)
    merged_march = add_features(merged_march)

    feature_cols = [
        'precipitation_sum',
        'temperature_2m_max',
        'temperature_2m_min',
        'temp_range',
        'dayofweek',
        'is_weekend',
    ]

    # Train on the Jan+Feb days, evaluate on the March days.
    X_train = merged[feature_cols]
    y_train = merged["trip_count"]
    X_test = merged_march[feature_cols]
    y_test = merged_march["trip_count"]

    # --- RANDOM FOREST REGRESSION ---
    # Fixed random_state keeps the forest reproducible between runs.
    rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    r2_rf = r2_score(y_test, y_pred_rf)
    mae_rf = mean_absolute_error(y_test, y_pred_rf)

    # --- LINEAR REGRESSION ---
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_test)
    r2_lr = r2_score(y_test, y_pred_lr)
    mae_lr = mean_absolute_error(y_test, y_pred_lr)

    # --- EVALUATION ---
    print(f"\nRandom Forest: R² = {r2_rf:.3f}, MAE = {mae_rf:.1f} trips")
    print("Feature importances (Random Forest):")
    for name, imp in zip(feature_cols, rf_model.feature_importances_):
        print(f"  {name}: {imp:.3f}")

    print(f"\nLinear Regression: R² = {r2_lr:.3f}, MAE = {mae_lr:.1f} trips")
    print("Coefficients (Linear Regression):")
    for name, coef in zip(feature_cols, lr_model.coef_):
        print(f"  {name}: {coef:.2f}")
    print(f"  Intercept: {lr_model.intercept_:.2f}")

    # --- VISUALIZATION ---
    # Actual March counts against both models' predictions, one point per day.
    plt.figure(figsize=(14, 6))
    plt.plot(merged_march["date"], y_test, label="Actual Trips", marker="o")
    plt.plot(merged_march["date"], y_pred_rf, label="Random Forest Prediction", linestyle="--", marker="x")
    plt.plot(merged_march["date"], y_pred_lr, label="Linear Regression Prediction", linestyle=":", marker="s")
    plt.title("Bike Trip Prediction for March 2025\n(Random Forest & Linear Regression)")
    plt.xlabel("Date")
    plt.ylabel("Number of Trips")
    plt.xticks(rotation=45)
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.savefig("march_prediction_vs_actual_rf_lr.png")
    plt.show()

    # --- OUTLIER ANALYSIS ---
    # Attach predictions and absolute errors, then list the worst days per model.
    merged_march['rf_predicted'] = y_pred_rf
    merged_march['lr_predicted'] = y_pred_lr
    merged_march['abs_error_rf'] = np.abs(merged_march['trip_count'] - merged_march['rf_predicted'])
    merged_march['abs_error_lr'] = np.abs(merged_march['trip_count'] - merged_march['lr_predicted'])
    print("\nTop 5 days with largest absolute errors (Random Forest):")
    print(merged_march.sort_values('abs_error_rf', ascending=False)[['date','trip_count','rf_predicted','abs_error_rf']].head())
    print("\nTop 5 days with largest absolute errors (Linear Regression):")
    print(merged_march.sort_values('abs_error_lr', ascending=False)[['date','trip_count','lr_predicted','abs_error_lr']].head())




# --- FINAL SUMMARY ---
# The model predictions only exist when both weather downloads succeeded
# (Jan+Feb for training, March for testing); otherwise reading them would
# raise a NameError.
if weather_df is not None and "daily" in weather_march:
    print("\n Model Evaluation Metrics:")
    print(f" Random Forest Regression:\n  R² Score: {r2_score(y_test, y_pred_rf):.4f}\n  MAE: {mean_absolute_error(y_test, y_pred_rf):.1f} trips")
    print(f" Linear Regression:\n  R² Score: {r2_score(y_test, y_pred_lr):.4f}\n  MAE: {mean_absolute_error(y_test, y_pred_lr):.1f} trips")
else:
    print("\n Model evaluation metrics unavailable: the ML section did not run.")

# `rainy` / `non_rainy` are only defined when the Jan+Feb analysis ran.
if weather_df is not None:
    avg_rainy = rainy.mean()
    avg_dry = non_rainy.mean()

    print(f"\n Average rides on dry days: {avg_dry:.1f}")
    print(f" Average rides on rainy days: {avg_rainy:.1f}")
    if avg_dry > 0:
        # Relative drop of the rainy-day mean versus the dry-day mean.
        drop_pct = ((avg_dry - avg_rainy) / avg_dry) * 100
        print(f" Estimated drop due to rain: {drop_pct:.1f}%")
    else:
        # A zero or NaN dry-day mean (no dry days / no rides) makes the
        # percentage undefined; report that instead of printing nan/inf.
        drop_pct = float("nan")
        print(" Estimated drop due to rain: n/a (no rides recorded on dry days)")
else:
    print("\n Rain summary unavailable: weather data could not be retrieved.")