# RTE - Forecast energy consumption in French areas


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load data

In [2]:
train = pd.read_csv("../input/rte-forecast-energy-consumption-in-french-areas/train.csv")
test = pd.read_csv("../input/rte-forecast-energy-consumption-in-french-areas/test.csv")

train.shape, test.shape

((717414, 3), (60870, 3))

# Understanding Data

In [3]:
train.head()

Unnamed: 0,metropolitan_area_code,date,energy_consumption
0,3,2017-09-11 02:15:00,477.0
1,3,2017-09-11 02:30:00,454.0
2,3,2017-09-11 03:45:00,398.0
3,3,2017-09-11 04:45:00,398.0
4,3,2017-09-11 05:15:00,409.0


In [4]:
test.head()

Unnamed: 0,id,metropolitan_area_code,date
0,0,3,2022-01-01 00:45:00
1,1,3,2022-01-01 01:15:00
2,2,3,2022-01-01 01:30:00
3,3,3,2022-01-01 02:00:00
4,4,3,2022-01-01 02:15:00


- Is Null sum

In [5]:
train.isna().sum().any(), test.isna().sum().any()

(False, False)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 717414 entries, 0 to 717413
Data columns (total 3 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   metropolitan_area_code  717414 non-null  int64  
 1   date                    717414 non-null  object 
 2   energy_consumption      717414 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 16.4+ MB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60870 entries, 0 to 60869
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      60870 non-null  int64 
 1   metropolitan_area_code  60870 non-null  int64 
 2   date                    60870 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.4+ MB


In [8]:
print(len(train['metropolitan_area_code'].unique()))

train['metropolitan_area_code'].value_counts()

17


2     78477
16    65256
3     62417
11    59587
0     55893
4     54914
14    54364
9     52229
1     49754
13    38024
15    29976
12    26392
8     24992
6     23541
10    18971
7     18654
5      3973
Name: metropolitan_area_code, dtype: int64

In [9]:
print(len(test['metropolitan_area_code'].unique()))

test['metropolitan_area_code'].value_counts()

17


2     6858
7     4696
10    4644
8     4581
0     4553
3     4533
4     4433
6     4271
5     3559
13    2974
9     2742
14    2741
11    2732
1     2279
16    2262
15    2244
12     768
Name: metropolitan_area_code, dtype: int64

# Preprocessing Data

In [10]:
train["date"] = pd.to_datetime(train["date"])
test["date"] = pd.to_datetime(test["date"])

train.head()

Unnamed: 0,metropolitan_area_code,date,energy_consumption
0,3,2017-09-11 02:15:00,477.0
1,3,2017-09-11 02:30:00,454.0
2,3,2017-09-11 03:45:00,398.0
3,3,2017-09-11 04:45:00,398.0
4,3,2017-09-11 05:15:00,409.0


# Feature Engineering

In [11]:
!pip install fast_ml

Collecting fast_ml
  Downloading fast_ml-3.68-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 kB[0m [31m184.1 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fast_ml
Successfully installed fast_ml-3.68
[0m

In [12]:
from fast_ml.feature_engineering import FeatureEngineering_DateTime
dt_fe = FeatureEngineering_DateTime()

dt_fe.fit(train, datetime_variables=['date'])
train = dt_fe.transform(train)

train.head(3)

  df[pfx+f] = getattr(df[var].dt, f)


Unnamed: 0,metropolitan_area_code,date,energy_consumption,date:year,date:quarter,date:month,date:day,date:day_of_week,date:day_of_year,date:weekofyear,...,date:is_quarter_end,date:is_quarter_start,date:is_year_end,date:is_year_start,date:time,date:hour,date:minute,date:second,date:is_weekend,date:day_part
0,3,2017-09-11 02:15:00,477.0,2017,3,9,11,0,254,37,...,False,False,False,False,02:15:00,2,15,0,0,midnight
1,3,2017-09-11 02:30:00,454.0,2017,3,9,11,0,254,37,...,False,False,False,False,02:30:00,2,30,0,0,midnight
2,3,2017-09-11 03:45:00,398.0,2017,3,9,11,0,254,37,...,False,False,False,False,03:45:00,3,45,0,0,midnight


#### FE : Test Data

In [13]:
dt_fe.fit(test, datetime_variables=['date'])
test = dt_fe.transform(test)

test.head(3)

  df[pfx+f] = getattr(df[var].dt, f)


Unnamed: 0,id,metropolitan_area_code,date,date:year,date:quarter,date:month,date:day,date:day_of_week,date:day_of_year,date:weekofyear,...,date:is_quarter_end,date:is_quarter_start,date:is_year_end,date:is_year_start,date:time,date:hour,date:minute,date:second,date:is_weekend,date:day_part
0,0,3,2022-01-01 00:45:00,2022,1,1,1,5,1,52,...,False,True,False,True,00:45:00,0,45,0,1,
1,1,3,2022-01-01 01:15:00,2022,1,1,1,5,1,52,...,False,True,False,True,01:15:00,1,15,0,1,midnight
2,2,3,2022-01-01 01:30:00,2022,1,1,1,5,1,52,...,False,True,False,True,01:30:00,1,30,0,1,midnight


### Drop unique columns

In [14]:
nunique_train=train.nunique().reset_index()
remove_col=nunique_train[(nunique_train[0]==len(train)) | (nunique_train[0]==0) | (nunique_train[0]==1) ]['index'].tolist()
remove_col

['date:second']

In [15]:
print("Before :", train.shape)
train = train.drop(remove_col,axis=1)
print("After :", train.shape)

train.head(3)

Before : (717414, 22)
After : (717414, 21)


Unnamed: 0,metropolitan_area_code,date,energy_consumption,date:year,date:quarter,date:month,date:day,date:day_of_week,date:day_of_year,date:weekofyear,...,date:is_month_start,date:is_quarter_end,date:is_quarter_start,date:is_year_end,date:is_year_start,date:time,date:hour,date:minute,date:is_weekend,date:day_part
0,3,2017-09-11 02:15:00,477.0,2017,3,9,11,0,254,37,...,False,False,False,False,False,02:15:00,2,15,0,midnight
1,3,2017-09-11 02:30:00,454.0,2017,3,9,11,0,254,37,...,False,False,False,False,False,02:30:00,2,30,0,midnight
2,3,2017-09-11 03:45:00,398.0,2017,3,9,11,0,254,37,...,False,False,False,False,False,03:45:00,3,45,0,midnight


In [16]:
print("Before :", test.shape)
test = test.drop(remove_col,axis=1)
print("After :", test.shape)

test.head(3)

Before : (60870, 22)
After : (60870, 21)


Unnamed: 0,id,metropolitan_area_code,date,date:year,date:quarter,date:month,date:day,date:day_of_week,date:day_of_year,date:weekofyear,...,date:is_month_start,date:is_quarter_end,date:is_quarter_start,date:is_year_end,date:is_year_start,date:time,date:hour,date:minute,date:is_weekend,date:day_part
0,0,3,2022-01-01 00:45:00,2022,1,1,1,5,1,52,...,True,False,True,False,True,00:45:00,0,45,1,
1,1,3,2022-01-01 01:15:00,2022,1,1,1,5,1,52,...,True,False,True,False,True,01:15:00,1,15,1,midnight
2,2,3,2022-01-01 01:30:00,2022,1,1,1,5,1,52,...,True,False,True,False,True,01:30:00,1,30,1,midnight


In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 717414 entries, 0 to 717413
Data columns (total 21 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   metropolitan_area_code  717414 non-null  int64         
 1   date                    717414 non-null  datetime64[ns]
 2   energy_consumption      717414 non-null  float64       
 3   date:year               717414 non-null  int64         
 4   date:quarter            717414 non-null  int64         
 5   date:month              717414 non-null  int64         
 6   date:day                717414 non-null  int64         
 7   date:day_of_week        717414 non-null  int64         
 8   date:day_of_year        717414 non-null  int64         
 9   date:weekofyear         717414 non-null  int64         
 10  date:is_month_end       717414 non-null  bool          
 11  date:is_month_start     717414 non-null  bool          
 12  date:is_quarter_end     717414

In [18]:
import re

train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

train.head(3)

Unnamed: 0,metropolitan_area_code,date,energy_consumption,dateyear,datequarter,datemonth,dateday,dateday_of_week,dateday_of_year,dateweekofyear,...,dateis_month_start,dateis_quarter_end,dateis_quarter_start,dateis_year_end,dateis_year_start,datetime,datehour,dateminute,dateis_weekend,dateday_part
0,3,2017-09-11 02:15:00,477.0,2017,3,9,11,0,254,37,...,False,False,False,False,False,02:15:00,2,15,0,midnight
1,3,2017-09-11 02:30:00,454.0,2017,3,9,11,0,254,37,...,False,False,False,False,False,02:30:00,2,30,0,midnight
2,3,2017-09-11 03:45:00,398.0,2017,3,9,11,0,254,37,...,False,False,False,False,False,03:45:00,3,45,0,midnight


In [19]:
bool_cols = [col for col in train.columns if train[col].dtypes == 'bool']

for col in bool_cols:
    train[col] = train[col].map({1:True, 0:False})
    test[col] = test[col].map({1:True, 0:False})

In [20]:
test.isna().sum()

id                            0
metropolitan_area_code        0
date                          0
dateyear                      0
datequarter                   0
datemonth                     0
dateday                       0
dateday_of_week               0
dateday_of_year               0
dateweekofyear                0
dateis_month_end          60870
dateis_month_start        60870
dateis_quarter_end        60870
dateis_quarter_start      60870
dateis_year_end           60870
dateis_year_start         60870
datetime                      0
datehour                      0
dateminute                    0
dateis_weekend                0
dateday_part               2505
dtype: int64

In [21]:
train.isna().sum()

metropolitan_area_code         0
date                           0
energy_consumption             0
dateyear                       0
datequarter                    0
datemonth                      0
dateday                        0
dateday_of_week                0
dateday_of_year                0
dateweekofyear                 0
dateis_month_end          717414
dateis_month_start        717414
dateis_quarter_end        717414
dateis_quarter_start      717414
dateis_year_end           717414
dateis_year_start         717414
datetime                       0
datehour                       0
dateminute                     0
dateis_weekend                 0
dateday_part               30234
dtype: int64

In [22]:
drop_cols = ['dateis_month_end','dateis_month_start','dateis_quarter_end','dateis_quarter_start',
            'dateis_year_end','dateis_year_start','dateday_part','datetime','date']

train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

train.shape, test.shape

((717414, 12), (60870, 12))

In [23]:
test = test.drop(['id'],axis=1)

#train.shape, test.shape

In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 717414 entries, 0 to 717413
Data columns (total 12 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   metropolitan_area_code  717414 non-null  int64  
 1   energy_consumption      717414 non-null  float64
 2   dateyear                717414 non-null  int64  
 3   datequarter             717414 non-null  int64  
 4   datemonth               717414 non-null  int64  
 5   dateday                 717414 non-null  int64  
 6   dateday_of_week         717414 non-null  int64  
 7   dateday_of_year         717414 non-null  int64  
 8   dateweekofyear          717414 non-null  int64  
 9   datehour                717414 non-null  int64  
 10  dateminute              717414 non-null  int64  
 11  dateis_weekend          717414 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 65.7 MB


In [25]:
test.head(5)

Unnamed: 0,metropolitan_area_code,dateyear,datequarter,datemonth,dateday,dateday_of_week,dateday_of_year,dateweekofyear,datehour,dateminute,dateis_weekend
0,3,2022,1,1,1,5,1,52,0,45,1
1,3,2022,1,1,1,5,1,52,1,15,1
2,3,2022,1,1,1,5,1,52,1,30,1
3,3,2022,1,1,1,5,1,52,2,0,1
4,3,2022,1,1,1,5,1,52,2,15,1


# Splitting Data

In [26]:
X = train.copy()
y = X.pop('energy_consumption')

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.1, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((645672, 11), (71742, 11), (645672,), (71742,))

# Modeling : FLAML

In [28]:
!pip install flaml

Collecting flaml
  Downloading FLAML-1.0.7-py3-none-any.whl (196 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m438.3 kB/s[0m eta [36m0:00:00[0m
Collecting xgboost<=1.3.3,>=0.90
  Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.5/157.5 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost, flaml
  Attempting uninstall: xgboost
    Found existing installation: xgboost 1.6.1
    Uninstalling xgboost-1.6.1:
      Successfully uninstalled xgboost-1.6.1
Successfully installed flaml-1.0.7 xgboost-1.3.3
[0m

In [29]:
from flaml import AutoML
automl = AutoML()

automl.fit(X_train, y_train, task="regression", metric='rmse', time_budget=1200) # 20 min

[flaml.automl: 06-22 09:03:41] {2390} INFO - task = regression
[flaml.automl: 06-22 09:03:41] {2392} INFO - Data split method: uniform
[flaml.automl: 06-22 09:03:41] {2396} INFO - Evaluation method: holdout
[flaml.automl: 06-22 09:03:41] {2465} INFO - Minimizing error metric: rmse
[flaml.automl: 06-22 09:03:41] {2605} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl: 06-22 09:03:41] {2897} INFO - iteration 0, current learner lgbm
[flaml.automl: 06-22 09:03:41] {3026} INFO - Estimated sufficient time budget=77463s. Estimated necessary time budget=662s.
[flaml.automl: 06-22 09:03:41] {3078} INFO -  at 1.9s,	estimator lgbm's best error=807.0028,	best estimator lgbm's best error=807.0028
[flaml.automl: 06-22 09:03:41] {2897} INFO - iteration 1, current learner lgbm
[flaml.automl: 06-22 09:03:41] {3078} INFO -  at 1.9s,	estimator lgbm's best error=807.0028,	best estimator lgbm's best error=807.0028
[flaml.automl: 0

In [30]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best log_loss on validation data: {0:.4g}'.format(automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 12442, 'num_leaves': 740, 'min_child_samples': 3, 'learning_rate': 0.1458986385528343, 'log_max_bin': 8, 'colsample_bytree': 0.4932508865405488, 'reg_alpha': 0.0012603170529968141, 'reg_lambda': 0.04549584170854636, 'FLAML_sample_size': 581104}
Best log_loss on validation data: 27.99
Training duration of best run: 579.9 s


In [31]:
from flaml.ml import sklearn_metric_loss_score
print('train r2', '=', 1-sklearn_metric_loss_score('r2', automl.predict(X_train), y_train))
print(' test r2', '=', 1-sklearn_metric_loss_score('r2', automl.predict(X_test), y_test))

train r2 = 0.9999194577802002
 test r2 = 0.999516866441933


In [32]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

pred = automl.predict(X_test)

mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)

print("MAE: %.9f" % mae)
print("MSE: %.9f" % mse)
print("RMSE: %.9f" % math.sqrt(mse))

MAE: 15.364128452
MSE: 635.995087531
RMSE: 25.218943030


# Test Prediction

In [33]:
pred = automl.predict(test)
len(pred), test.shape

(60870, (60870, 11))

# Submission

In [34]:
sub = pd.read_csv("../input/rte-forecast-energy-consumption-in-french-areas/sample_submission.csv")
sub.shape

(60870, 2)

In [35]:
sub['energy_consumption'] = pred
sub.to_csv('submission.csv', index=False)

In [36]:
sub

Unnamed: 0,id,energy_consumption
0,0,761.153632
1,1,794.889488
2,2,787.449973
3,3,786.264886
4,4,781.208862
...,...,...
60865,60865,395.413645
60866,60866,402.421471
60867,60867,448.521418
60868,60868,422.101238
