In [163]:
# standard libs
import os
import sys
import logging

# project lib
PROJECT_SRC_PATH = os.path.join(os.path.abspath(''), '..', 'src')
sys.path.append(PROJECT_SRC_PATH)

import utils
import dataset
import visualizations
import energy_modeling
from prediction_age import AgePredictor, AgeClassifier, AgeClassifierComparison
from preprocessing import *

# external libs
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely import wkt

from xgboost import XGBRegressor, XGBClassifier


In [79]:
# Emit timestamped log records at INFO level and above for the whole session.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s : %(message)s',
)

In [80]:
# Enable IPython's autoreload extension in mode 2, so edited project modules
# are re-imported automatically before a cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data

In [128]:
# Load the 'france' dataset with geo=False, then draw a sample from it.
# presumably 0.1 is the share of cities kept by the sampler — confirm against utils.sample_cities
df_fr = dataset.load('france', geo=False)
df_fr_sample = utils.sample_cities(df_fr, 0.1)


In [127]:
# Load the 'netherlands' dataset with geo=False, then draw a sample from it.
# presumably 0.1 is the share of cities kept by the sampler — confirm against utils.sample_cities
df_nl = dataset.load('netherlands', geo=False)
df_nl_sample = utils.sample_cities(df_nl, 0.1)

## Preprocessing

#### Extract heating demand from `TABULA_parameters_harmonized.csv` and save it as a formatted CSV

In [122]:
# tabula_params = tabula_params[tabula_params['country'].isin(['Netherlands', 'Spain', 'France'])]

def set_label(df):
    """Attach generated age-bin labels to one group of the TABULA table.

    The distinct values found in the ``age_min`` and ``age_max`` columns are
    sorted into a list of bin edges and handed to ``utils.generate_labels``;
    its result becomes the ``age_bin`` column.

    Args:
        df: DataFrame with ``age_min`` and ``age_max`` columns (one group when
            used with ``groupby(...).apply``).

    Returns:
        A new DataFrame with ``age_bin`` set. The input frame is not modified,
        because mutating the object passed in by ``groupby.apply`` is not
        supported by pandas.
    """
    bins = sorted(np.unique(df[['age_min', 'age_max']].values))
    # the generated labels must contain exactly one entry per row of df,
    # otherwise the column assignment raises
    return df.assign(age_bin=utils.generate_labels(bins))

# NOTE(review): tabula_params is first created by the loading cell further down,
# so this cell fails on a fresh kernel — confirm whether this scratch cell is still needed.
# group_keys=False keeps the original row index instead of prepending the group keys.
tabula_params = tabula_params.groupby(['country', 'residential_type'], group_keys=False).apply(set_label)

[0, 1945, 1946, 1971, 1991, 2006, 2012, 2051]
    age_min  age_max
37        0     1945
38     1946     1971
39     1971     1991
40     1991     2006
41     2006     2012
42     2012     2051
[0, 1945, 1946, 1971, 1991, 2006, 2012, 2051]
    age_min  age_max
43        0     1945
44     1946     1971
45     1971     1991
46     1991     2006
47     2006     2012
48     2012     2051
[0, 1945, 1946, 1971, 1991, 2006, 2012, 2051]
    age_min  age_max
49        0     1945
50     1946     1971
51     1971     1991
52     1991     2006
53     2006     2012
54     2012     2051
[0, 1918, 1919, 1930, 1960, 1999, 2009, 2051]
    age_min  age_max
59        0     1918
60     1919     1930
61     1930     1960
62     1960     1999
63     1999     2009
64     2009     2051
[0, 1918, 1919, 1930, 1960, 1999, 2009, 2051]
    age_min  age_max
65        0     1918
66     1919     1930
67     1930     1960
68     1960     1999
69     1999     2009
70     2009     2051
[0, 1918, 1919, 1930, 1960, 1999, 2

In [123]:
def set_label(df):
    """Attach generated age-bin labels to one group of the TABULA table.

    The distinct values found in the ``age_min`` and ``age_max`` columns are
    sorted into a list of bin edges and handed to ``utils.generate_labels``;
    its result becomes the ``age_bin`` column.

    Args:
        df: DataFrame with ``age_min`` and ``age_max`` columns (one group when
            used with ``groupby(...).apply``).

    Returns:
        A new DataFrame with ``age_bin`` set. The input frame is not modified,
        because mutating the object passed in by ``groupby.apply`` is not
        supported by pandas.
    """
    bins = sorted(np.unique(df[['age_min', 'age_max']].values))
    # the generated labels must contain exactly one entry per row of df,
    # otherwise the column assignment raises
    return df.assign(age_bin=utils.generate_labels(bins))

# load data
tabula_params_path = os.path.join('..', 'metadata', 'TABULA_parameters_harmonized.csv')
tabula_params = pd.read_csv(tabula_params_path)

# switch to the column names used in the rest of the notebook; rename() returns
# a new frame, so no in-place mutation is needed
tabula_params = tabula_params.rename(columns={
    'Country': 'country',
    'BuildingType': 'residential_type',
    'Age': 'age_bin',
    'q_h_nd': 'heating_demand',
})

# filter for relevant countries; the explicit copy makes the column assignments
# below write to an independent frame instead of a slice of the raw table
tabula_params = tabula_params[tabula_params['country'].isin(['Netherlands', 'Spain', 'France'])].copy()

# add columns for upper and lower bound of age bin:
# the first and the last four characters of the label are parsed as years
tabula_params['age_min'] = tabula_params['age_bin'].str[:4].astype(int)
tabula_params['age_max'] = tabula_params['age_bin'].str[-4:].astype(int)

# a label whose two parsed years coincide is treated as an open-ended bin:
# after 1990 it is open towards the future, before 1990 open towards the past
# NOTE(review): a label parsing to exactly 1990 on both ends matches neither mask — confirm this cannot occur
mask_upper_bound = (tabula_params['age_min'] == tabula_params['age_max']) & (tabula_params['age_max'] > 1990)
mask_lower_bound = (tabula_params['age_min'] == tabula_params['age_max']) & (tabula_params['age_max'] < 1990)
tabula_params.loc[mask_upper_bound, 'age_max'] = 2050
tabula_params.loc[mask_lower_bound, 'age_min'] = 0
tabula_params.loc[mask_lower_bound, 'age_max'] = tabula_params['age_max'] - 1
# shift every upper bound by one year (presumably to turn it into an exclusive
# bin edge); the lower-bound rows were decremented above, so they end up unchanged
tabula_params['age_max'] = tabula_params['age_max'] + 1

# harmonize bin labels with labels generated by classifier;
# group_keys=False keeps the original row index instead of prepending the group keys
tabula_params = tabula_params.groupby(['country', 'residential_type'], group_keys=False).apply(set_label)

# save relevant TABULA data
tabula_heating_path = os.path.join('..', 'metadata', 'TABULA_heating_demand.csv')
tabula_heating = tabula_params[['country', 'residential_type', 'age_bin', 'age_min', 'age_max', 'heating_demand']]
tabula_heating.to_csv(tabula_heating_path, index=False)

#### Defining age bins

In [None]:
# Explore the original TABULA age-bin labels.
# The raw file is read into its own variable: the preprocessing cell above
# renames 'Country', 'BuildingType', 'Age' and 'q_h_nd' on tabula_params, so
# the raw column names used here no longer exist on that frame and looking
# them up there raises a KeyError when the notebook is run top to bottom.
tabula_raw = pd.read_csv(os.path.join('..', 'metadata', 'TABULA_parameters_harmonized.csv'))
tabula_raw = tabula_raw[tabula_raw['Country'].isin(['Netherlands', 'Spain', 'France'])]

# distinct four-character prefixes of the harmonized labels
tabula_combined_bins = sorted(tabula_raw['Age_Harmonized'].str[:4].unique())
print(tabula_combined_bins)
# distinct four-character prefixes of the original labels, all three countries
tabula_combined_bins = sorted(tabula_raw['Age'].str[:4].unique())
print(tabula_combined_bins)
# same, restricted to the Netherlands
tabula_combined_bins = sorted(tabula_raw[tabula_raw['Country'] == 'Netherlands']['Age'].str[:4].unique())
print(tabula_combined_bins)

# first 30 Spanish rows with the age labels and the heating demand column
tabula_raw[tabula_raw['Country'] == 'Spain'][['Country', 'BuildingType', 'Age', 'Age_Harmonized', 'q_h_nd']][:30]

## Experiments

In [147]:
# Build one XGBoost-based AgePredictor per country sample (France first, then
# the Netherlands). Both use the split_80_20 split and the
# remove_buildings_pre_1900 preprocessing stage.
regressor_fr, regressor_nl = (
    AgePredictor(
        model=XGBRegressor(),
        df=country_sample,
        test_training_split=split_80_20,
        preprocessing_stages=[remove_buildings_pre_1900],
    )
    for country_sample in (df_fr_sample, df_nl_sample)
)

# Report the error metrics in the same order: France, then the Netherlands.
regressor_fr.print_model_error()
regressor_nl.print_model_error()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
2022-06-24 16:44:12,779 | INFO : Dataset length: 4441
2022-06-24 16:44:12,785 | INFO : Dataset allocated memory: 4 MB
2022-06-24 16:44:12,789 | INFO : Dataset standard deviation: 62.68525292516289
2022-06-24 16:44:12,790 | INFO : Dataset mean age: 1936.0225174510244
2022-06-24 16:44:12,791 | INFO : Training dataset length: 3552
2022-06-24 16:44:12,792 | INFO : Test dataset length: 889
2022-06-24 16:44:12,806 | INFO : Test dataset standard deviation after preprocessing: 33.72756817834984
2022-06-24 16:44:12,807 | INFO : Test dataset mean age after preprocessing: 1963.8566978193146

MAE: 18.55 y
RMSE: 25.70 y
R2: 0.4186
MAE: 11.92 y
RMSE: 18.21 y
R2: 0.4999


In [148]:
# Bin edges per country: the sorted distinct age_min / age_max values of that
# country's TABULA rows.
bins_fr, bins_nl = (
    sorted(np.unique(
        tabula_params.loc[tabula_params['country'] == country_name, ['age_min', 'age_max']].values
    ))
    for country_name in ('France', 'Netherlands')
)

# One XGBoost-based AgeClassifier per country sample (France first, then the
# Netherlands), each classifying into its own country's bins.
classifier_fr, classifier_nl = (
    AgeClassifier(
        model=XGBClassifier(),
        df=country_sample,
        test_training_split=split_80_20,
        mitigate_class_imbalance=True,
        bins=country_bins,
        preprocessing_stages=[remove_buildings_pre_1900],
    )
    for country_sample, country_bins in ((df_fr_sample, bins_fr), (df_nl_sample, bins_nl))
)

# Report the error metrics in the same order: France, then the Netherlands.
classifier_fr.print_model_error()
classifier_nl.print_model_error()


[autoreload of prediction failed: Traceback (most recent call last):
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 257, in check
    superreload(m, reload, self.old_objects)
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 480, in superreload
    update_generic(old_obj, new_obj)
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 377, in update_generic
    update(a, b)
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 329, in update_class
    if update_generic(old_obj, new_obj):
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 377, in update_generic
    update(a, b)
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/pytho

Classification report:
               precision    recall  f1-score  support
<1915          0.527778  0.606383  0.564356       94
1915-1948      0.519481  0.444444  0.479042       90
1949-1967      0.474138  0.533981  0.502283      103
1968-1974      0.382979  0.400000  0.391304       45
1975-1981      0.589744  0.500000  0.541176       92
1982-1989      0.470588  0.588235  0.522876       68
1990-1999      0.440000  0.372881  0.403670       59
2000-2005      0.382353  0.351351  0.366197       37
2006-2012      0.352941  0.333333  0.342857       36
2013-2050      0.461538  0.333333  0.387097       18
accuracy       0.481308  0.481308  0.481308        0
macro avg      0.460154  0.446394  0.450086      642
weighted avg   0.482573  0.481308  0.479015      642
Cohen’s kappa: 0.4101
Matthews correlation coefficient (MCC): 0.4107
Classification report:
               precision    recall  f1-score  support
<1965          0.740095  0.846590  0.789769     4413
1965-1974      0.858111  0.789957  

In [167]:
# Show the French bin edges derived from the TABULA age_min / age_max values.
bins_fr

[0, 1915, 1949, 1968, 1975, 1982, 1990, 2000, 2006, 2013, 2051]

In [160]:
# Bin edges per country: the sorted distinct age_min / age_max values of that
# country's TABULA rows.
bins_fr = sorted(np.unique(tabula_params[tabula_params['country'] == 'France'][['age_min', 'age_max']].values))
bins_nl = sorted(np.unique(tabula_params[tabula_params['country'] == 'Netherlands'][['age_min', 'age_max']].values))

# One entry per country: the sampled data and the bin edges to classify into.
comparison_config = {
    'France': {'df': df_fr_sample, 'bins': bins_fr},
    'Netherlands': {'df': df_nl_sample, 'bins': bins_nl},
}

# has to be aligned with TABULA bins for heating demand
# NOTE(review): presumably each entry overrides constructor arguments for one
# experiment variant and '' means "no override"; 'bin_config' looks like
# (start, stop, step) — confirm both against AgeClassifierComparison.
# NOTE(review): the explicit grids below do not use the TABULA edges — confirm
# they are only meant to be evaluated without the energy error.
grid_comparison_config = {
    '': {},
    'peter': {'bins': [1900, 1945, 1970, 1980, 1990, 2000, 2010]},
    '25': {'bins': [], 'bin_config': (1900, 2025, 25)},
    '10': {'bins': [], 'bin_config': (1900, 2025, 10)},
    '5': {'bins': [], 'bin_config': (1900, 2025, 5)},
}

# df and bin_config are passed as None here; presumably they are filled in per
# experiment from the two config dicts above — TODO confirm.
# Judging by the log output of this cell, constructing the object already
# starts the experiments.
comparison = AgeClassifierComparison(
    model=XGBClassifier(tree_method='hist'),
    df=None,
    cross_validation_split=cross_validation,
    preprocessing_stages=[remove_buildings_pre_1900],
    bin_config=None,
    mitigate_class_imbalance=True,
    include_baseline=False,
    comparison_config=comparison_config,
    grid_comparison_config=grid_comparison_config,
)

2022-06-24 18:03:22,760 | INFO : Starting experiment France__seed_0...
2022-06-24 18:03:22,778 | INFO : Generated bins: [0, 1915, 1949, 1968, 1975, 1982, 1990, 2000, 2006, 2013, 2051]
2022-06-24 18:03:22,779 | INFO : Generated bins with the following labels: ['<1915', '1915-1948', '1949-1967', '1968-1974', '1975-1981', '1982-1989', '1990-1999', '2000-2005', '2006-2012', '2013-2050']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
2022-06-24 18:03:22,812 | INFO : Dataset length: 4441
2022-06-24 18:03:22,819 | INFO : Dataset allocated memory: 4 MB
2022-06-24 18:03:22,825 | INFO : Training dataset length: 2960
2022-06-24 18:03:22

In [165]:
# Evaluate the comparison's experiments with the energy-error evaluation
# switched off (include_energy_error=False); the returned table is displayed.
comparison.evaluate(include_energy_error=False)

Note that pos_label (set to 0) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 2) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 3) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 4) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 5) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 6) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 7) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_

Unnamed: 0,name,MCC,F1,Recall_<1915,Recall_1915-1948,Recall_1949-1967,Recall_1968-1974,Recall_1975-1981,Recall_1982-1989,Recall_1990-1999,Recall_2000-2005,Recall_2006-2012,Recall_2013-2050,Recall_<1965,Recall_1965-1974,Recall_1975-1991,Recall_1992-2005,Recall_2006-2014,Recall_2015-2050
0,France__seed_0,0.411497,0.455026,0.588372,0.52439,0.547573,0.246914,0.51417,0.49863,0.425868,0.427885,0.347594,0.361446,,,,,,
1,Netherlands__seed_0,0.707859,0.743114,,,,,,,,,,,0.840732,0.779944,0.765113,0.72185,0.65476,0.679245


In [150]:
# Tag the auxiliary test variables of every predictor with its country name.
for country_name, country_predictors in (
    ('Netherlands', (classifier_nl, regressor_nl)),
    ('France', (classifier_fr, regressor_fr)),
):
    for predictor in country_predictors:
        predictor.aux_vars_test['country'] = country_name

In [152]:
# Compute the energy error for each fitted model, passing the auxiliary test
# variables joined with the footprint area as a separate argument.
for predictor in (regressor_fr, classifier_fr, regressor_nl, classifier_nl):
    # predictors without a `labels` attribute yield None here
    labels = getattr(predictor, 'labels', None)
    footprint_area = predictor.X_test[['FootprintArea']]
    aux_vars = pd.concat([predictor.aux_vars_test, footprint_area], axis=1, join="inner")
    energy_modeling.calculate_energy_error(
        predictor.y_test,
        predictor.y_predict,
        aux_vars,
        labels=labels,
    )

2022-06-24 16:53:12,682 | INFO : R2: 0.1065
2022-06-24 16:53:12,685 | INFO : MAPE: 0.5981
2022-06-24 16:53:12,745 | INFO : R2: 0.2321
2022-06-24 16:53:12,745 | INFO : MAPE: 0.4172
2022-06-24 16:53:13,224 | INFO : R2: 0.3351
2022-06-24 16:53:13,225 | INFO : MAPE: 0.4475
2022-06-24 16:53:13,671 | INFO : R2: 0.3200
2022-06-24 16:53:13,672 | INFO : MAPE: 0.3725


In [157]:
# Compute the energy error for each fitted model, with the auxiliary test
# variables and the footprint area joined onto both the true and the
# predicted targets.
for predictor in (regressor_fr, classifier_fr, regressor_nl, classifier_nl):
    # predictors without a `labels` attribute yield None here
    labels = getattr(predictor, 'labels', None)

    # frames appended to both target frames (inner join on the index)
    shared_frames = [predictor.aux_vars_test, predictor.X_test[['FootprintArea']]]
    y_true = pd.concat([predictor.y_test, *shared_frames], axis=1, join="inner")
    y_pred = pd.concat([predictor.y_predict, *shared_frames], axis=1, join="inner")

    energy_modeling.calculate_energy_error(y_true, y_pred, labels=labels)

2022-06-24 17:37:48,276 | INFO : R2: 0.1828
2022-06-24 17:37:48,277 | INFO : MAPE: 0.4995

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
2022-06-24 17:37:48,374 | INFO : R2: 0.3481
2022-06-24 17:37:48,374 | INFO : MAPE: 0.3235
2022-06-24 17:37:48,885 | INFO : R2: 0.3351
2022-06-24 17:37:48,886 | INFO : MAPE: 0.4475
2022-06-24 17:37:49,327 | INFO : R2: 0.3200
2022-06-24 17:37:49,328 | INFO : MAPE: 0.3725
