In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
import jpholiday
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [7]:
demand_df = pd.read_csv('demand.csv')
demand_df['datetime'] = pd.to_datetime(demand_data['datetime'] , format='%m/%d/%Y %H:%M')
demand_df.rename(columns={'actual_performance(10000 kW)': 'demand'}, inplace=True)

In [8]:
weather_files = {
    'osaka': 'osaka.csv',
    'hikone': 'hikone.csv',
    'kyoto': 'kyoto.csv',
    'kobe': 'kobe.csv',
    'shionomisaki': 'shionomisaki.csv',
    'toyooka': 'toyooka.csv',
    'wakayama': 'wakayama.csv'
}

weather_weights = {
    'osaka': 0.25,
    'hikone': 0.15,
    'kyoto': 0.2,
    'kobe': 0.15,
    'shionomisaki': 0.1,
    'toyooka': 0.1,
    'wakayama': 0.05
}

In [9]:
# Load each weather file into a dictionary.
weather_dfs = {}
for loc, file in weather_files.items():
    df = pd.read_csv(file)
    df['datetime'] = pd.to_datetime(df['datetime'])
    weather_dfs[loc] = df

In [10]:
# Combine all wind_direction values from each location for a common LabelEncoder.
all_wind = pd.concat([df['wind_direction'] for df in weather_dfs.values()])
le = LabelEncoder()
le.fit(all_wind)

In [11]:
# Transform wind_direction in each dataframe.
for loc in weather_dfs:
    weather_dfs[loc]['wind_direction'] = le.transform(weather_dfs[loc]['wind_direction'])

In [12]:
# List of numeric weather variables to aggregate.
weather_vars = ['precipitation', 'temperature', 'dew_point_temperature', 
                'humidity', 'wind_speed', 'snowfall']

In [13]:
# Create an aggregated dataframe using the datetime column from one of the files.
aggregated_weather = weather_dfs[next(iter(weather_dfs))][['datetime']].copy()
for var in weather_vars:
    aggregated_weather[var] = 0.0
# For wind_direction, we’ll compute a weighted average of the encoded values.
aggregated_weather['wind_direction'] = 0.0

In [15]:
# Loop through each location and accumulate weighted values.
for loc, df in weather_dfs.items():
    w = weather_weights[loc]
    for var in weather_vars:
        aggregated_weather[var] += w * df[var]
    aggregated_weather['wind_direction'] += w * df['wind_direction']

# Optionally, round wind_direction to the nearest integer.
aggregated_weather['wind_direction'] = aggregated_weather['wind_direction'].round().astype(int)

In [16]:
merged_df = pd.merge(demand_df, aggregated_weather, on='datetime', how='inner')

In [18]:
# Extract time-based features.
merged_df['hour'] = merged_df['datetime'].dt.hour
merged_df['day'] = merged_df['datetime'].dt.day
merged_df['month'] = merged_df['datetime'].dt.month
merged_df['weekday'] = merged_df['datetime'].dt.weekday

merged_df.info()
print(merged_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26280 entries, 0 to 26279
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   datetime               26280 non-null  datetime64[ns]
 1   demand                 26280 non-null  int64         
 2   precipitation          26280 non-null  float64       
 3   temperature            26280 non-null  float64       
 4   dew_point_temperature  26280 non-null  float64       
 5   humidity               26280 non-null  float64       
 6   wind_speed             26280 non-null  float64       
 7   snowfall               26280 non-null  float64       
 8   wind_direction         26280 non-null  int32         
 9   hour                   26280 non-null  int32         
 10  day                    26280 non-null  int32         
 11  month                  26280 non-null  int32         
 12  weekday                26280 non-null  int32         
dtypes

In [19]:
# Create a holiday indicator using jpholiday.
merged_df['is_holiday'] = merged_df['datetime'].dt.date.apply(lambda d: 1 if jpholiday.is_holiday(d) else 0)

# List of features to use in the model.
features = [
    'hour', 'day', 'month', 'weekday', 'is_holiday',
    'precipitation', 'temperature', 'dew_point_temperature',
    'humidity', 'wind_speed', 'wind_direction', 'snowfall'
]

In [20]:
# Forecast target period: January 1, 2023, 00:00:00 to December 31, 2023, 23:00:00
forecast_start = pd.to_datetime('2023-01-01')
forecast_end   = pd.to_datetime('2023-12-31 23:00:00')

In [21]:
# For training, use data up to December 31, 2022.
train_data = merged_df[merged_df['datetime'] < forecast_start]

# For forecasting (or evaluation), select data in 2023.
test_data = merged_df[(merged_df['datetime'] >= forecast_start) & (merged_df['datetime'] <= forecast_end)]

X_train = train_data[features]
y_train = train_data['demand']
X_test  = test_data[features]
y_test  = test_data['demand']  

In [22]:
# Initialize XGBoost Regressor (adjust hyperparameters as needed).
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train, y_train)

In [23]:
# Forecast electricity demand for 2023.
forecast = model.predict(X_test)

In [24]:
# Evaluate forecast performance if actual values are available.
mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error on 2023 forecast:", mae)

Mean Absolute Error on 2023 forecast: 58.624667400203336


In [26]:
# Append forecast to the test data and save to CSV.
test_data['forecast_demand'] = forecast
test_data[['datetime', 'forecast_demand']].to_csv('forecast_2023.csv', index=False)
print("Forecast saved to 'forecast_2023.csv'.")

Forecast saved to 'forecast_2023.csv'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['forecast_demand'] = forecast
