# Create Weather Variables

In [1]:
import pandas as pd
import os
from pathlib import Path
from glob import glob
import matplotlib.pyplot as plt
import numpy as np

In [4]:
path_weather = "../data/weather_data/"

In [6]:
# Weather data
weather = pd.read_csv(path_weather + "weather.csv", usecols = ["timestamp",
                                                                  "site_id",
                                                                  "airTemperature"
                                                                  ])

def reduce_mem_usage(df):
    """Downcast numeric columns to reduce memory usage."""
    start_mem = df.memory_usage(deep=True).sum()
    for col in df.columns:
        col_type = df[col].dtype
        if pd.api.types.is_numeric_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.api.types.is_float_dtype(col_type):
                # Downcast float
                df[col] = pd.to_numeric(df[col], downcast='float')
            else:
                # Downcast int
                df[col] = pd.to_numeric(df[col], downcast='integer')
        # leave object columns unchanged
    end_mem = df.memory_usage(deep=True).sum()
    return df

weather = reduce_mem_usage(weather)
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331166 entries, 0 to 331165
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   timestamp       331166 non-null  object 
 1   site_id         331166 non-null  object 
 2   airTemperature  331038 non-null  float32
dtypes: float32(1), object(2)
memory usage: 6.3+ MB


In [7]:
weather.head()

Unnamed: 0,timestamp,site_id,airTemperature
0,2016-01-01 00:00:00,Panther,19.4
1,2016-01-01 01:00:00,Panther,21.1
2,2016-01-01 02:00:00,Panther,21.1
3,2016-01-01 03:00:00,Panther,20.6
4,2016-01-01 04:00:00,Panther,21.1


In [8]:
weather.isna().sum()*100/len(weather)

timestamp         0.000000
site_id           0.000000
airTemperature    0.038651
dtype: float64

In [9]:
# Fill NaN with interpolation
weather.airTemperature.interpolate(method="linear",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  weather.airTemperature.interpolate(method="linear",inplace=True)


In [10]:
weather.isna().sum()*100/len(weather)

timestamp         0.0
site_id           0.0
airTemperature    0.0
dtype: float64

In [12]:
start = pd.to_datetime(weather["timestamp"]).min()
end = pd.to_datetime(weather["timestamp"]).max()
print(f"Timestamp date range: {start} to {end}")

Timestamp date range: 2016-01-01 00:00:00 to 2017-12-31 23:00:00


In [16]:
import pandas as pd
import numpy as np

def calculate_degree_days(temperature_series, base_temp, is_heating=True):
    """
    Calculates hourly Heating Degree Days (HDD) or Cooling Degree Days (CDD).
    """
    if is_heating:
        # HDD: max(0, Base_Temp - Actual_Temp)
        return np.maximum(0, base_temp - temperature_series)
    else:
        # CDD: max(0, Actual_Temp - Base_Temp)
        return np.maximum(0, temperature_series - base_temp)


# ---------------------------------------------------------------------

# Ensure 'timestamp' is a datetime object
weather['timestamp'] = pd.to_datetime(weather['timestamp'])

# Set the base temperature for degree day calculations (e.g., 20°C or 68°F)
BASE_TEMP_HDD = 18.0
BASE_TEMP_CDD = 24.0

# Calculate hourly HDD and CDD
weather['HDD_hourly'] = calculate_degree_days(weather['airTemperature'], BASE_TEMP_HDD, is_heating=True)
weather['CDD_hourly'] = calculate_degree_days(weather['airTemperature'], BASE_TEMP_CDD, is_heating=False)
# Extract the year from the timestamp
weather['year'] = weather['timestamp'].dt.year
# Aggregate to yearly metrics per site
yearly_weather_aggregates = weather.groupby(['site_id', 'year']).agg(
    Avg_AirTemp_Annual=('airTemperature', 'mean'),
    Total_HDD_Annual=('HDD_hourly', 'sum'),
    Total_CDD_Annual=('CDD_hourly', 'sum')
).reset_index()

print(yearly_weather_aggregates)


     site_id  year  Avg_AirTemp_Annual  Total_HDD_Annual  Total_CDD_Annual
0       Bear  2016           15.238575      29345.798828        459.000031
1       Bear  2017           14.966572      32817.199219        880.500000
2     Bobcat  2016           11.663602      74400.507812       6934.350098
3     Bobcat  2017           11.465086      77074.000000       7608.950195
4       Bull  2016           21.369135      17007.199219      17155.900391
5       Bull  2017           21.787615      16196.263672      18436.414062
6   Cockatoo  2016            9.357398      84526.796875       2571.400146
7   Cockatoo  2017            9.180041      83449.750000       1233.300049
8       Crow  2016            7.884769      98418.601562       2405.600098
9       Crow  2017            7.707470      92370.703125       1066.800049
10     Eagle  2016           13.260192      61877.601562       5882.700195
11     Eagle  2017           13.082863      60794.500000       4211.600098
12       Fox  2016       

**Variables**
- **HDD** stands for **Heating Degree Days**. It’s a simple proxy for heating demand: when the outside temperature is below a chosen base temperature, the difference indicates how many "degrees" heating is needed that day.
- **CDD** stands for **Cooling Degree Days**. It’s a simple proxy for cooling demand: when the outside temperature goes above a chosen base temperature, the difference indicates how many "degrees" cooling is needed in theory that day.

Of course, the choice of BASE_TEMP matters and influence the outcomes of HDD and CDD, respectively.

In [25]:
import pandas as pd
import numpy as np


# Ensure 'timestamp' is a datetime object
weather['timestamp'] = pd.to_datetime(weather['timestamp'])

# Set the base temperature for degree day calculations (e.g., 20°C)
BASE_TEMP_HDD = 15.0
BASE_TEMP_CDD = 24.0

# --- Step 1: Aggregate to daily averages first ---
daily_weather_avg = weather.groupby([
    'site_id', 
    weather['timestamp'].dt.year.rename('year'),
    weather['timestamp'].dt.date.rename('date') # Use date for daily grouping
])['airTemperature'].mean().reset_index()

# --- Step 2: Determine if each day was a 'heating day' or 'cooling day' ---
# Create binary flags (1 if true, 0 otherwise)
daily_weather_avg['is_heating_day'] = (daily_weather_avg['airTemperature'] < BASE_TEMP_HDD).astype(int)
daily_weather_avg['is_cooling_day'] = (daily_weather_avg['airTemperature'] > BASE_TEMP_CDD).astype(int)

# --- Step 3: Recalculate HDD/CDD daily and sum annually ---
daily_weather_avg['HDD_daily'] = np.maximum(0, BASE_TEMP_HDD - daily_weather_avg['airTemperature'])
daily_weather_avg['CDD_daily'] = np.maximum(0, daily_weather_avg['airTemperature'] - BASE_TEMP_CDD)


# --- Step 4: Aggregate daily metrics to yearly metrics per site ---
yearly_weather_aggregates = daily_weather_avg.groupby(['site_id', 'year']).agg(
    Avg_AirTemp_Annual=('airTemperature', 'mean'),
    Total_HDD_Annual=('HDD_daily', 'sum'),
    Total_CDD_Annual=('CDD_daily', 'sum'),
    Count_Heating_Days_Annual=('is_heating_day', 'sum'),
    Count_Cooling_Days_Annual=('is_cooling_day', 'sum')
).reset_index()

yearly_weather_aggregates.head(50)


Unnamed: 0,site_id,year,Avg_AirTemp_Annual,Total_HDD_Annual,Total_CDD_Annual,Count_Heating_Days_Annual,Count_Cooling_Days_Annual
0,Bear,2016,15.237956,388.216492,0.025,146,1
1,Bear,2017,14.965894,516.453064,8.72917,160,4
2,Bobcat,2016,11.659762,2285.002197,160.281265,225,63
3,Bobcat,2017,11.485664,2419.551025,193.476837,235,68
4,Bull,2016,21.368526,337.325012,611.795837,71,146
5,Bull,2017,21.778797,310.833313,629.99292,65,155
6,Cockatoo,2016,9.377469,2768.982422,18.607815,240,14
7,Cockatoo,2017,9.133105,2717.791016,2.746799,239,5
8,Crow,2016,7.637231,3422.980225,21.502167,240,14
9,Crow,2017,7.531336,3189.713867,7.706705,234,6


In [23]:
# Generate the cleaned file for metadata analysis
yearly_weather_aggregates.to_csv(path_weather + "weather_data.csv", index=False)

In [27]:
# Load the cleaned file to verify
df_weather = pd.read_csv(path_weather + "weather_data.csv")
df_weather.head()

Unnamed: 0,site_id,year,Avg_AirTemp_Annual,Total_HDD_Annual,Total_CDD_Annual,Count_Heating_Days_Annual,Count_Cooling_Days_Annual
0,Bear,2016,15.237956,388.2165,0.025,146,1
1,Bear,2017,14.965894,516.45306,8.72917,160,4
2,Bobcat,2016,11.659762,2285.0022,160.28127,225,63
3,Bobcat,2017,11.485664,2419.551,193.47684,235,68
4,Bull,2016,21.368526,337.325,611.79584,71,146


In [28]:
df_weather['site_id'].nunique()

19