# Weather impact on ride demand

This notebook studies whether weather conditions significantly affect ride demand in New York City. Weather data is joined with the zone-hour demand dataset and formal statistical hypothesis tests are applied.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

AGG_PATH = "../../data/processed/nyc_demand_zone_hour_2019_q1.parquet"

# Show all columns
pd.set_option("display.max_columns", None)

# Load aggregated dataset
df = pd.read_parquet(AGG_PATH)
df.head()

Unnamed: 0,zone_id,pickup_hour_ts,demand,avg_fare,avg_distance,hour,day_of_week,is_weekend,day,month
0,1,2019-01-01 10:00:00,2,61.25,16.9,10,1,0,1,1
1,1,2019-01-01 12:00:00,1,135.0,19.3,12,1,0,1,1
2,1,2019-01-01 15:00:00,1,106.0,41.28,15,1,0,1,1
3,1,2019-01-02 02:00:00,1,30.0,1.27,2,2,0,2,1
4,1,2019-01-02 03:00:00,1,15.0,12.65,3,2,0,2,1


## Hypothesis

Test whether adverse weather conditions are associated with statistically significant changes in ride demand.

H0: Mean zone-hour demand is the same under normal and adverse weather.

H1: Mean zone-hour demand differs under adverse weather conditions.

In [6]:
# Load weather dataset
weather_df = pd.read_csv("../../data/raw/72505394728.csv", low_memory=False)

# Filter columns
weather_df = weather_df[["DATE", "TMP", "AA1"]]
weather_df.head(35)

Unnamed: 0,DATE,TMP,AA1
0,2019-01-01T00:10:00,675,1001031.0
1,2019-01-01T00:51:00,675,1004195.0
2,2019-01-01T01:02:00,675,1000531.0
3,2019-01-01T01:31:00,725,1001231.0
4,2019-01-01T01:51:00,725,1003895.0
5,2019-01-01T02:15:00,725,1002731.0
6,2019-01-01T02:27:00,725,1004031.0
7,2019-01-01T02:41:00,725,1005331.0
8,2019-01-01T02:49:00,705,1005531.0
9,2019-01-01T02:51:00,725,1004895.0


In [11]:
# Extract hour from timestamp
weather_df["timestamp"] = pd.to_datetime(weather_df["DATE"])
weather_df["hour_ts"] = weather_df["timestamp"].dt.floor("h")
weather_df = weather_df.drop(columns=["DATE"])
weather_df.head(15)

Unnamed: 0,TMP,AA1,timestamp,hour_ts
0,675,1001031.0,2019-01-01 00:10:00,2019-01-01 00:00:00
1,675,1004195.0,2019-01-01 00:51:00,2019-01-01 00:00:00
2,675,1000531.0,2019-01-01 01:02:00,2019-01-01 01:00:00
3,725,1001231.0,2019-01-01 01:31:00,2019-01-01 01:00:00
4,725,1003895.0,2019-01-01 01:51:00,2019-01-01 01:00:00
5,725,1002731.0,2019-01-01 02:15:00,2019-01-01 02:00:00
6,725,1004031.0,2019-01-01 02:27:00,2019-01-01 02:00:00
7,725,1005331.0,2019-01-01 02:41:00,2019-01-01 02:00:00
8,705,1005531.0,2019-01-01 02:49:00,2019-01-01 02:00:00
9,725,1004895.0,2019-01-01 02:51:00,2019-01-01 02:00:00


In [12]:
weather_df.tail(10)

Unnamed: 0,TMP,AA1,timestamp,hour_ts
11988,335,,2019-12-31 15:27:00,2019-12-31 15:00:00
11989,335,1000095.0,2019-12-31 15:51:00,2019-12-31 15:00:00
11990,445,1000095.0,2019-12-31 16:51:00,2019-12-31 16:00:00
11991,505,1000095.0,2019-12-31 17:51:00,2019-12-31 17:00:00
11992,615,1000095.0,2019-12-31 18:51:00,2019-12-31 18:00:00
11993,615,1000095.0,2019-12-31 19:51:00,2019-12-31 19:00:00
11994,675,1000095.0,2019-12-31 20:51:00,2019-12-31 20:00:00
11995,675,1000095.0,2019-12-31 21:51:00,2019-12-31 21:00:00
11996,615,1000095.0,2019-12-31 22:51:00,2019-12-31 22:00:00
11997,675,1000095.0,2019-12-31 23:51:00,2019-12-31 23:00:00


In [27]:
# Get data only for first quarter
mask = (weather_df["timestamp"] >= "2019-01-01") & (weather_df["timestamp"] <= "2019-03-31")

weather_df = weather_df.loc[mask]
weather_df.tail()

Unnamed: 0,TMP,AA1,timestamp,hour_ts
2822,1945,1000095,2019-03-30 19:51:00,2019-03-30 19:00:00
2823,1835,1000095,2019-03-30 20:51:00,2019-03-30 20:00:00
2824,1785,1000095,2019-03-30 21:51:00,2019-03-30 21:00:00
2825,1565,1000095,2019-03-30 22:51:00,2019-03-30 22:00:00
2826,1505,1000095,2019-03-30 23:51:00,2019-03-30 23:00:00


In [28]:
weather_df.head()

Unnamed: 0,TMP,AA1,timestamp,hour_ts
0,675,1001031,2019-01-01 00:10:00,2019-01-01 00:00:00
1,675,1004195,2019-01-01 00:51:00,2019-01-01 00:00:00
2,675,1000531,2019-01-01 01:02:00,2019-01-01 01:00:00
3,725,1001231,2019-01-01 01:31:00,2019-01-01 01:00:00
4,725,1003895,2019-01-01 01:51:00,2019-01-01 01:00:00


In [31]:
# Extract temperature
weather_df["temp_c"] = (
    weather_df["TMP"].str.slice(0,5).astype(int) / 10
)
weather_df.head()

Unnamed: 0,TMP,AA1,timestamp,hour_ts,temp_c
0,675,1001031,2019-01-01 00:10:00,2019-01-01 00:00:00,6.7
1,675,1004195,2019-01-01 00:51:00,2019-01-01 00:00:00,6.7
2,675,1000531,2019-01-01 01:02:00,2019-01-01 01:00:00,6.7
3,725,1001231,2019-01-01 01:31:00,2019-01-01 01:00:00,7.2
4,725,1003895,2019-01-01 01:51:00,2019-01-01 01:00:00,7.2


In [32]:
# Column TMP is not needed anymore
weather_df = weather_df.drop(columns=["TMP"])
weather_df.head()

Unnamed: 0,AA1,timestamp,hour_ts,temp_c
0,1001031,2019-01-01 00:10:00,2019-01-01 00:00:00,6.7
1,1004195,2019-01-01 00:51:00,2019-01-01 00:00:00,6.7
2,1000531,2019-01-01 01:02:00,2019-01-01 01:00:00,6.7
3,1001231,2019-01-01 01:31:00,2019-01-01 01:00:00,7.2
4,1003895,2019-01-01 01:51:00,2019-01-01 01:00:00,7.2


In [35]:
# Extract precipitation 
def parse_precip(val):
    if pd.isna(val):
        return 0.0
    
    parts = val.split(",")
    
    if len(parts) < 2:
        return 0.0
    
    try:
        return int(parts[1]) / 10
    except:
        return 0.0

weather_df["precip_mm"] = weather_df["AA1"].apply(parse_precip)
weather_df.head()

Unnamed: 0,AA1,timestamp,hour_ts,temp_c,precip_mm
0,1001031,2019-01-01 00:10:00,2019-01-01 00:00:00,6.7,1.0
1,1004195,2019-01-01 00:51:00,2019-01-01 00:00:00,6.7,4.1
2,1000531,2019-01-01 01:02:00,2019-01-01 01:00:00,6.7,0.5
3,1001231,2019-01-01 01:31:00,2019-01-01 01:00:00,7.2,1.2
4,1003895,2019-01-01 01:51:00,2019-01-01 01:00:00,7.2,3.8


In [38]:
# Column AA1 is not needed anymore - removed
weather_df.head(30)

Unnamed: 0,timestamp,hour_ts,temp_c,precip_mm
0,2019-01-01 00:10:00,2019-01-01 00:00:00,6.7,1.0
1,2019-01-01 00:51:00,2019-01-01 00:00:00,6.7,4.1
2,2019-01-01 01:02:00,2019-01-01 01:00:00,6.7,0.5
3,2019-01-01 01:31:00,2019-01-01 01:00:00,7.2,1.2
4,2019-01-01 01:51:00,2019-01-01 01:00:00,7.2,3.8
5,2019-01-01 02:15:00,2019-01-01 02:00:00,7.2,2.7
6,2019-01-01 02:27:00,2019-01-01 02:00:00,7.2,4.0
7,2019-01-01 02:41:00,2019-01-01 02:00:00,7.2,5.3
8,2019-01-01 02:49:00,2019-01-01 02:00:00,7.0,5.5
9,2019-01-01 02:51:00,2019-01-01 02:00:00,7.2,4.8


Multiple weather observations can occur within the same hour.
Weather variables are therefore aggregated to hourly resolution before being joined with zone-hour demand data.

In [39]:
# Aggregate to hourly
hourly_weather = (
    weather_df
    .groupby("hour_ts")
    .agg(
        precip_mm=("precip_mm", "sum"),
        temperature_c=("temp_c", "mean")
    )
    .reset_index()
)

In [41]:
hourly_weather.head(10)

Unnamed: 0,hour_ts,precip_mm,temperature_c
0,2019-01-01 00:00:00,5.1,6.7
1,2019-01-01 01:00:00,5.5,7.033333
2,2019-01-01 02:00:00,22.3,7.16
3,2019-01-01 03:00:00,2.5,7.8
4,2019-01-01 04:00:00,31.9,504.1
5,2019-01-01 05:00:00,3.3,8.375
6,2019-01-01 06:00:00,0.5,10.0
7,2019-01-01 07:00:00,0.0,11.1
8,2019-01-01 08:00:00,0.5,10.15
9,2019-01-01 09:00:00,0.0,11.15


In [44]:
# Rain vs No rain
hourly_weather["bad_weather"] = hourly_weather["precip_mm"] > 0

In [43]:
hourly_weather.head(10)

Unnamed: 0,hour_ts,precip_mm,temperature_c,bad_weather
0,2019-01-01 00:00:00,5.1,6.7,True
1,2019-01-01 01:00:00,5.5,7.033333,True
2,2019-01-01 02:00:00,22.3,7.16,True
3,2019-01-01 03:00:00,2.5,7.8,True
4,2019-01-01 04:00:00,31.9,504.1,True
5,2019-01-01 05:00:00,3.3,8.375,True
6,2019-01-01 06:00:00,0.5,10.0,True
7,2019-01-01 07:00:00,0.0,11.1,False
8,2019-01-01 08:00:00,0.5,10.15,True
9,2019-01-01 09:00:00,0.0,11.15,False


In [49]:
# Join weather with demand data
merged_df = df.merge(
    hourly_weather,
    left_on="pickup_hour_ts",
    right_on="hour_ts",
    how="left"
)

In [50]:
merged_df.head()

Unnamed: 0,zone_id,pickup_hour_ts,demand,avg_fare,avg_distance,hour,day_of_week,is_weekend,day,month,hour_ts,precip_mm,temperature_c,bad_weather
0,1,2019-01-01 10:00:00,2,61.25,16.9,10,1,0,1,1,2019-01-01 10:00:00,0.0,12.133333,False
1,1,2019-01-01 12:00:00,1,135.0,19.3,12,1,0,1,1,2019-01-01 12:00:00,0.0,13.9,False
2,1,2019-01-01 15:00:00,1,106.0,41.28,15,1,0,1,1,2019-01-01 15:00:00,0.0,14.4,False
3,1,2019-01-02 02:00:00,1,30.0,1.27,2,2,0,2,1,2019-01-02 02:00:00,0.0,5.6,False
4,1,2019-01-02 03:00:00,1,15.0,12.65,3,2,0,2,1,2019-01-02 03:00:00,0.0,5.0,False


In [51]:
merged_df["bad_weather"].isna().mean()

np.float64(0.011320487724346124)

Weather observations are merged with the zoneâ€“hour demand dataset using hourly timestamps. Each zone-hour record receives the corresponding
city-level weather conditions.


In [52]:
merged_df[["precip_mm", "temperature_c", "bad_weather"]].describe()

Unnamed: 0,precip_mm,temperature_c
count,293447.0,293447.0
mean,0.392877,20.277785
std,2.143115,95.389645
min,0.0,-16.7
25%,0.0,-1.05
50%,0.0,2.8
75%,0.0,6.7
max,36.7,999.9


In [54]:
merged_df["bad_weather"].value_counts(normalize=True)

bad_weather
False    0.894567
True     0.105433
Name: proportion, dtype: float64

In [55]:
# Filter unrealistic weather conditions
merged_df = merged_df[(merged_df["temperature_c"] > -30) & (merged_df["temperature_c"] < 50)]

In [56]:
merged_df.head(30)

Unnamed: 0,zone_id,pickup_hour_ts,demand,avg_fare,avg_distance,hour,day_of_week,is_weekend,day,month,hour_ts,precip_mm,temperature_c,bad_weather
0,1,2019-01-01 10:00:00,2,61.25,16.9,10,1,0,1,1,2019-01-01 10:00:00,0.0,12.133333,False
1,1,2019-01-01 12:00:00,1,135.0,19.3,12,1,0,1,1,2019-01-01 12:00:00,0.0,13.9,False
2,1,2019-01-01 15:00:00,1,106.0,41.28,15,1,0,1,1,2019-01-01 15:00:00,0.0,14.4,False
3,1,2019-01-02 02:00:00,1,30.0,1.27,2,2,0,2,1,2019-01-02 02:00:00,0.0,5.6,False
4,1,2019-01-02 03:00:00,1,15.0,12.65,3,2,0,2,1,2019-01-02 03:00:00,0.0,5.0,False
5,1,2019-01-02 13:00:00,1,70.5,18.73,13,2,0,2,1,2019-01-02 13:00:00,0.0,2.2,False
6,1,2019-01-02 14:00:00,2,38.25,4.035,14,2,0,2,1,2019-01-02 14:00:00,0.0,2.2,False
7,1,2019-01-02 17:00:00,1,40.0,0.01,17,2,0,2,1,2019-01-02 17:00:00,0.0,3.3,False
8,1,2019-01-02 18:00:00,2,87.5,2.85,18,2,0,2,1,2019-01-02 18:00:00,0.0,3.9,False
9,1,2019-01-03 13:00:00,2,90.0,0.685,13,3,0,3,1,2019-01-03 13:00:00,0.0,6.066667,False


## Create two samples - normal and bad weather
Two samples are created representing zone-hours with adverse and normal weather conditions.

In [57]:
bad = merged_df.loc[merged_df["bad_weather"] == True, "demand"]
normal = merged_df.loc[merged_df["bad_weather"] == False, "demand"]

In [58]:
len(bad), len(normal)

(27244, 255558)

## Perform statistical test
A Welch two-sample t-test is applied to test whether mean demand differs between normal and adverse weather conditions.

In [59]:
from scipy.stats import ttest_ind

t_stat, p_value = ttest_ind(bad, normal, equal_var=False)
t_stat, p_value

(np.float64(-10.673149280119201), np.float64(1.4949054792904727e-26))

The difference in average demand between weather conditions is reported to quantify the practical magnitude of the effect.


In [61]:
bad_mean = bad.mean()
normal_mean = normal.mean()
mean_diff = bad_mean - normal_mean

bad_mean, normal_mean, mean_diff

(np.float64(67.7878431948319),
 np.float64(76.77667300573647),
 np.float64(-8.988829810904576))

## Summary
This analysis identifies associations between weather conditions and ride demand, but does not establish causality.

Weather effects may be confounded by:
- time of day
- weekday vs weekend
- seasonal effects
- location-specific characteristics

In [63]:
# Save processed weather data
hourly_weather.to_csv("../../data/processed/weather_nyc_2019_q1.csv", index=False)