In [7]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt
import seaborn as sns

# Render figures inline in the notebook, using SVG as the inline figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
# Notebook-wide plotting and display defaults.
plt.style.use('ggplot')
plt.rc('font', size=8)
pd.set_option('display.max_columns', None)  # do not truncate columns when a frame is displayed


# Marker styles; the plotting cells pair them with building categories via zip().
symbols = ('x','.','o','v')

In [9]:
def load_data_orig(fname='pwh_data_orig.csv'):
    """Read the original building data from CSV.

    The ``egain_installation`` column is parsed as dates; every other column
    keeps the dtype pandas infers for it.
    """
    date_columns = ["egain_installation"]
    return pd.read_csv(fname, parse_dates=date_columns)

def load_data_with_predictions(fname='building_data_prediction_ag_13mins.csv', covid_years=True):
    """Load the building data and attach AutoGluon and BBSR-based predictions.

    Parameters
    ----------
    fname : str
        CSV file that provides the columns ``OID``, ``year`` and
        ``autogluon_prediction``; it is left-joined onto the original data.
    covid_years : bool
        If False, rows of the years 2020 and 2021 are removed.

    Returns
    -------
    pandas.DataFrame
        The original data (minus a few dropped columns and minus rows with
        ``energy_per_sqm <= 10``) extended by ``energy_per_sqm``,
        ``prediction_bbsr``, ``prediction_bbsr_a``, ``prediction_bbsr_o`` and
        the ``autogluon_prediction`` scaled by ``KF``.
    """
    df = load_data_orig()
    predictions = pd.read_csv(fname)

    results_df = pd.merge(df, predictions[['OID', 'year', 'autogluon_prediction']],
                          how='left', left_on=['OID', 'year'], right_on=['OID', 'year'])

    # consumption scaled by KF, per unit of area
    results_df['energy_per_sqm'] = (results_df['kWh'] * results_df['KF']) / results_df['area']
    # exclude outliers with heating turned off; .copy() makes the filtered frame
    # independent so the column assignments below never write to a slice
    results_df = results_df[results_df['energy_per_sqm'] > 10].copy()

    # translate category names
    results_df['kategorie'] = results_df['kategorie'].replace(
        {'OSZ': 'School', 'FW': 'Fire Dept.', 'PZ': 'Police', 'Kultur': 'Culture'})
    # drop unnecessary columns
    results_df = results_df.drop(
        ['flaeche', 'hid', 'bbsr_unbereinigt', 'hzg_kWh', 'ww_kWh', 'verbrauch_kWh', 'f_flaeche'], axis=1)

    if not covid_years:
        results_df = results_df[~results_df['year'].isin([2020, 2021])].copy()

    # NOTE(review): TEK_HZG is added to itself here; if a separate warm-water
    # coefficient was intended as the second term, this needs correcting - confirm.
    results_df['prediction_bbsr'] = (results_df['f_ANGF'] * results_df['area']
                                     * (results_df['TEK_HZG'] + results_df['TEK_HZG']))

    # mean energy_per_sqm per (has_egain, kategorie) group
    groups = results_df.groupby(['has_egain', 'kategorie']).agg({'energy_per_sqm': 'mean'})
    # per category: mean with e-gain divided by mean without e-gain
    by_egain = groups.unstack()
    eco_scores = (by_egain.loc[True, :] / by_egain.loc[False, :]).unstack().T

    for egain, category in groups.index:
        idx = (results_df['has_egain'] == egain) & (results_df['kategorie'] == category)
        # NOTE(review): the category ratio is applied to rows with and without
        # e-gain alike - confirm that this is intended.
        results_df.loc[idx, 'prediction_bbsr_a'] = (results_df.loc[idx, 'prediction_bbsr']
                                                    * eco_scores.loc[category, 'energy_per_sqm'])
        # group mean per square metre scaled by each building's own area
        results_df.loc[idx, 'prediction_bbsr_o'] = (groups.loc[(egain, category), 'energy_per_sqm']
                                                    * results_df.loc[idx, 'area'])

    # Only the AutoGluon prediction is scaled by KF; the BBSR-based predictions
    # get no KF correction, as they are corrected already.
    results_df['autogluon_prediction'] = results_df['autogluon_prediction'] * results_df['KF']

    return results_df


In [None]:
# Load the merged data (default prediction file, all years) and display it.
df = load_data_with_predictions()
df

# Data and Features

For a number of public buildings (unique IDs are in column ``OID``), the data frame contains the yearly energy consumption (`kWh`) for heating and warm water. 

There are a number of building features, such as area, building type (also pseudonymized), as well as a number of features that characterise the year, such as the number of sun hours. 

The column ``verbrauch_kWh`` contains the predicted energy consumption computed with a simple but [effective model provided by the BBSR](https://www.bbsr.bund.de/BBSR/DE/veroeffentlichungen/bbsr-online/2019/bbsr-online-20-2019.html). 


# Predict Building Consumption with e-gain

The goal is to predict the energy savings for each building after installing a system for weather-guided heating control. The installed system is called ``e-gain``. 

We can use the AutoML package ``autogluon`` for a simple prototype. 

In [None]:

import warnings
warnings.filterwarnings("ignore")

# Leave-one-building-out retraining fits one AutoGluon model per building with a
# 300 s time limit each, so it only runs when this flag is switched on.
RETRAIN_PER_BUILDING = False

if RETRAIN_PER_BUILDING:
    label = 'kWh'
    # the label column is listed as well because it has to be part of train_data
    features = ['OID',
                'energietraeger',
                'area',
                'kWh',
                'kategorie',
                'year',
                'TM',
                'SO',
                'NM',
                'FM',
                'RFM'
                ]

    results = []

    for building in df['OID'].unique():
        is_building = df['OID'] == building
        # train on all other buildings plus this building's years without e-gain ...
        train_df = df[~is_building | (df['has_egain'] == False)]
        # ... and predict this building's years with e-gain
        test_df = df[is_building & (df['has_egain'] == True)]
        if len(test_df) > 0:
            pred = TabularPredictor(label=label,
                                    problem_type='regression',
                                    verbosity=0).fit(train_data=train_df[features],
                                                     verbosity=0,
                                                     time_limit=300,
                                                     presets='best_quality')
            new_df = test_df.copy()
            new_df['autogluon_prediction'] = pred.predict(test_df).copy()
            results.append(new_df)
            print(new_df)

    results_df = pd.concat(results)

    # These are mean absolute errors (not percentage errors), named and labelled as such.
    mae_ag = mean_absolute_error(results_df['kWh'], results_df['autogluon_prediction'])
    mae_bbsr_a = mean_absolute_error(results_df['kWh'], results_df['prediction_bbsr_a'])
    mae_bbsr_o = mean_absolute_error(results_df['kWh'], results_df['prediction_bbsr_o'])

    print(f'MAE autogluon {mae_ag:0.2f}, MAE BBSR-A {mae_bbsr_a:0.2f}, MAE BBSR-O {mae_bbsr_o:0.2f}')

    # NOTE(review): the BBSR predictions are multiplied by KF here - verify that
    # they have not been KF-corrected already where df was built.
    mae_ag_kf = mean_absolute_error(results_df['kWh_bereinigt'],
                                    results_df['autogluon_prediction'] * results_df['KF'])
    mae_bbsr_a_kf = mean_absolute_error(results_df['kWh_bereinigt'],
                                        results_df['prediction_bbsr_a'] * results_df['KF'])
    mae_bbsr_o_kf = mean_absolute_error(results_df['kWh_bereinigt'],
                                        results_df['prediction_bbsr_o'] * results_df['KF'])

    print(f'KF: MAE autogluon {mae_ag_kf:0.2f}, MAE BBSR-A {mae_bbsr_a_kf:0.2f}, MAE BBSR-O {mae_bbsr_o_kf:0.2f}')

    # store the fresh predictions next to the original rows
    pd.merge(df.drop('autogluon_prediction', axis=1),
             results_df[['OID', 'year', 'autogluon_prediction']],
             how='left', left_on=['OID', 'year'], right_on=['OID', 'year']
             ).to_csv('building_data_prediction_new_.csv', index=False)




In [None]:
df = load_data_with_predictions()


def _group_median(mask, column):
    """Median of ``column`` over the rows of ``df`` selected by ``mask``."""
    return df.loc[mask, column].median()


# One record per building that has at least one year without e-gain (the baseline).
energy_savings = []
for building_id in df['OID'].unique():
    is_building = df['OID'] == building_id
    without_egain = is_building & (df['has_egain'] == False)
    with_egain = is_building & (df['has_egain'] == True)
    if len(df.loc[without_egain, 'kWh_bereinigt']) == 0:
        continue
    energy_savings.append({
        'OID': building_id,
        'category': df.loc[is_building, 'kategorie'].values[0],
        'consumption_with_egain': _group_median(with_egain, 'kWh_bereinigt'),
        'consumption_without_egain': _group_median(without_egain, 'kWh_bereinigt'),
        'predicted_consumption_with_egain_ag': _group_median(with_egain, 'autogluon_prediction'),
        'predicted_consumption_with_egain_bbsr_a': _group_median(with_egain, 'prediction_bbsr_a'),
        'predicted_consumption_with_egain_bbsr_o': _group_median(with_egain, 'prediction_bbsr_o'),
    })

# Buildings without any e-gain year have no "with" median and are dropped.
df_savings = pd.DataFrame(energy_savings).dropna(subset=['consumption_with_egain'])

# Saving in percent relative to the median consumption without e-gain.
for saving_column, consumption_column in [
        ('true_saving', 'consumption_with_egain'),
        ('predicted_saving_ag', 'predicted_consumption_with_egain_ag'),
        ('predicted_saving_bbsr_a', 'predicted_consumption_with_egain_bbsr_a'),
        ('predicted_saving_bbsr_o', 'predicted_consumption_with_egain_bbsr_o')]:
    df_savings[saving_column] = (
        100 - 100 * df_savings[consumption_column] / df_savings['consumption_without_egain'])


def _savings_panel(position, predicted_column, diagonal_end, ylabel,
                   square_before_diagonal=False, with_legend=False):
    """Scatter true vs. predicted saving per category in subplot ``position`` of a 1x3 grid."""
    plt.subplot(1, 3, position)
    categories = df_savings['category'].unique()
    for marker, category in zip(symbols, categories):
        in_category = df_savings['category'] == category
        plt.plot(df_savings.loc[in_category, 'true_saving'],
                 df_savings.loc[in_category, predicted_column], marker)
    if square_before_diagonal:
        plt.axis('square')
    # identity line: points on it are perfect predictions
    plt.plot([-50, diagonal_end], [-50, diagonal_end], 'k-')
    plt.ylim([-150, 70])
    plt.xlim([-30, 70])
    plt.axis('square')
    if with_legend:
        plt.legend(categories)
    plt.xlabel('True Energy Saving (%)')
    plt.ylabel(ylabel)


plt.figure(figsize=(10, 8))
_savings_panel(1, 'predicted_saving_bbsr_a', 100, 'Predicted Saving BBSR (%)',
               square_before_diagonal=True)
_savings_panel(2, 'predicted_saving_bbsr_o', 70, 'Predicted Saving BBSR New(%)')
_savings_panel(3, 'predicted_saving_ag', 100, 'Predicted Saving AutoGluon (%)',
               with_legend=True)

plt.tight_layout()

In [None]:
def _absolute_saving_errors(savings):
    """Absolute difference between true and predicted saving for each method."""
    return {
        'BBSR-O': abs(savings['true_saving'] - savings['predicted_saving_bbsr_o']),
        'BBSR-A': abs(savings['true_saving'] - savings['predicted_saving_bbsr_a']),
        'AutoML': abs(savings['true_saving'] - savings['predicted_saving_ag']),
    }


def _mean_error_record(errors_by_method, category):
    """Collapse the per-building errors into one table row for ``category``."""
    record = {method: errors.mean() for method, errors in errors_by_method.items()}
    record['category'] = category
    return record


mae_percent_per_category = []
categories = df_savings['category'].unique()

# one scatter series and one table row per category
for marker, category in zip(symbols, categories):
    category_errors = _absolute_saving_errors(df_savings.loc[df_savings['category'] == category])
    plt.plot(category_errors['BBSR-O'], category_errors['AutoML'], marker)
    mae_percent_per_category.append(_mean_error_record(category_errors, category))

# the same statistics over all buildings
mae_percent_per_category.append(_mean_error_record(_absolute_saving_errors(df_savings), 'total'))

lim = 165
plt.plot([-10, lim], [-10, lim], 'k-')
plt.axis('square')
plt.ylim([-10, lim])
plt.xlim([-10, lim])

plt.legend(categories)
plt.title('Absolute Error Energy Saving Prediction (in %)')
plt.xlabel('BBSR-O')
plt.ylabel('AutoML')

plt.savefig('energy_savings_error.pdf')

print('Saving prediction errors per category\n')
error_table = pd.DataFrame(mae_percent_per_category)[['category', 'BBSR-O', 'BBSR-A', 'AutoML']]
print(error_table.set_index('category').astype(int).to_latex())

saving_columns = ['category',
                  'predicted_saving_bbsr_a', 'predicted_saving_bbsr_o',
                  'predicted_saving_ag',
                  'true_saving']
savings_per_category = df_savings.loc[:, saving_columns].groupby('category').agg('median')
print('Savings per category\n')
print(savings_per_category.to_latex())

In [None]:
# Display the per-category median savings as a rendered table.
savings_per_category

In [None]:
df = load_data_with_predictions(covid_years=True)


def _boxplot_by_category(frame, value_column, group_column, position, title, ylabel, ylim=None):
    """Draw ``value_column`` grouped by ``group_column`` into panel ``position`` of the 2x2 grid."""
    panel = frame[[value_column, group_column]].boxplot(by=group_column,
                                                        ax=plt.subplot(2, 2, position))
    if ylim is not None:
        panel.set_ylim(ylim)
    panel.set_title(title)
    panel.set_ylabel(ylabel)
    panel.set_xlabel('Building Category')
    return panel


overview_fig = plt.figure(figsize=(6, 6))

# (A) number of distinct buildings in each category
count_ax = plt.subplot(2, 2, 1)
buildings_per_category = df[['kategorie', 'OID']].drop_duplicates()['kategorie'].value_counts()
buildings_per_category[['Culture', 'Fire Dept.', 'Police', 'School']].plot.bar(ax=count_ax, rot=0)
plt.xlabel('Building Category')
plt.ylabel('Count')
plt.title('(A) Buildings per Category')

_boxplot_by_category(df, 'MWh/a', 'kategorie', 2, '(B) Consumption', 'MWh/a', ylim=[0, 1.4])
_boxplot_by_category(df, 'area', 'kategorie', 3, '(C) Area', '$m^2$', ylim=[0, 13e3])
# panel (D) uses the per-building savings computed in an earlier cell
_boxplot_by_category(df_savings, 'true_saving', 'category', 4, '(D) Energy Saving', '(%)')

# blank out the first figure-level text (the grouped boxplots add a figure title)
overview_fig.texts[0].set_text('')
plt.tight_layout()

plt.savefig('dataset.pdf')

In [None]:
# Display the per-building savings table.
df_savings

In [None]:
results_df = load_data_with_predictions().dropna(subset=['autogluon_prediction'])

# Mean absolute error of each method against kWh_bereinigt, divided by 1e6.
mae_ag, mae_din, mae_din_new = (
    mean_absolute_error(results_df['kWh_bereinigt'], results_df[column]) / 1e6
    for column in ('autogluon_prediction', 'prediction_bbsr_a', 'prediction_bbsr_o'))

symbols = ('x', '.', '^', 'v')

plt.figure(figsize=(10, 4))
lim = 1.3


def _consumption_panel(position, prediction_column, ylabel, title, with_legend=False):
    """Scatter measured vs. predicted consumption per category in panel ``position`` (1x3 grid)."""
    plt.subplot(1, 3, position)
    categories = results_df['kategorie'].unique()
    for marker, category in zip(symbols, categories):
        in_category = results_df['kategorie'] == category
        plt.plot(results_df.loc[in_category, 'MWh/a'],
                 results_df.loc[in_category, prediction_column] / 1e6, marker)
    plt.axis('square')
    plt.ylim([0, lim])
    plt.xlim([0, lim])
    # identity line: points on it are perfect predictions
    plt.plot([0, lim], [0, lim], 'k-')
    plt.xlabel('Energy Consumption (MWh/a)')
    plt.ylabel(ylabel)
    if with_legend:
        plt.legend(categories)
    plt.title(title)


_consumption_panel(1, 'prediction_bbsr_a', 'Prediction (MWh/a)',
                   f'BBSR-A\nMean Absolute Error {mae_din:0.3}')
_consumption_panel(2, 'prediction_bbsr_o', 'Prediction(MWh/a)',
                   f'BBSR-O\nMean Absolute Error {mae_din_new:0.3}')
_consumption_panel(3, 'autogluon_prediction', 'Prediction Consumption (MWh/a)',
                   f'AutoML\n Mean Absolute Error {mae_ag:0.3}', with_legend=True)

plt.tight_layout()
plt.savefig('prediction_quality_comparison_scatter.pdf')

In [None]:
results_df = load_data_with_predictions(covid_years=True).dropna(subset=['autogluon_prediction'])

# prediction columns in the order BBSR-A, BBSR-O, AutoML (matches the table columns below)
prediction_columns = ['prediction_bbsr_a', 'prediction_bbsr_o', 'autogluon_prediction']


def _error_row(frame):
    """Return [MAE x3 (truncated to int), MAPE x3 (rounded to 2 decimals)] for ``frame``."""
    truth = frame['kWh_bereinigt']
    maes = [int(mean_absolute_error(truth, frame[column])) for column in prediction_columns]
    mapes = [np.round(mean_absolute_percentage_error(truth, frame[column]), 2)
             for column in prediction_columns]
    return maes + mapes


categories = results_df['kategorie'].unique()
errors = [_error_row(results_df.loc[results_df['kategorie'] == category])
          for category in categories]

errors_df = pd.DataFrame(
    errors,
    index=categories,
    columns=['MAE BBSR-A', 'MAE BBSR-O', 'MAE AutoML', 'MAPE BBSR-A', 'MAPE BBSR-O', 'MAPE AutoML'])
# the 'Total' row is the median over the category rows, rounded to whole numbers
errors_df.loc['Total', :] = errors_df.median(axis=0).round(decimals=0).values
print(errors_df[['MAE BBSR-A', 'MAE BBSR-O', 'MAE AutoML']].astype(int).to_latex())

# overall MAE across all rows, independent of category
mae_bbsr_a, mae_bbsr_o, mae_ag = (
    mean_absolute_error(results_df['kWh_bereinigt'], results_df[column])
    for column in prediction_columns)
print(f'MAE BBSR-A {mae_bbsr_a:0.2f}, MAE BBSR-O {mae_bbsr_o:0.2f} MAE autogluon {mae_ag:0.2f}')

In [None]:
from sklearn.decomposition import PCA

from shutil import rmtree
import itertools

# start from a clean slate: remove previously stored AutoGluon models, if any
rmtree('AutogluonModels', ignore_errors=True)

n_components = 1
n_samples = 100

df = pd.read_csv('building_data_prediction.csv')

features_building = ['OID', 'energietraeger', 'area', 'kategorie', 'has_egain']
features_weather = ['KF', 'TM', 'SO', 'NM', 'FM', 'RFM']
features = features_building + features_weather
target = ["kWh"]

# fit the PCA on the distinct weather rows only
distinct_weather = df[features_weather].drop_duplicates().values
pca = PCA(n_components=n_components, whiten=True).fit(distinct_weather)
print(f'Explained Variance Ratio with {n_components}: {pca.explained_variance_ratio_}')

# walk along the single latent axis and map the points back to weather space
latents = np.linspace(-5, 5, n_samples)
synthesized_weather = pca.inverse_transform(latents.reshape(len(latents), 1))

# compare the value range of the synthetic weather with the observed one
observed_weather = df[features_weather].values
comparison_rows = {
    'syn_min': synthesized_weather.min(axis=0),
    'real_min': observed_weather.min(axis=0),
    'syn_max': synthesized_weather.max(axis=0),
    'real_max': observed_weather.max(axis=0),
}
comparison_syn_real = pd.DataFrame(list(comparison_rows.values()),
                                   columns=features_weather,
                                   index=list(comparison_rows.keys()))
comparison_syn_real

In [None]:
# Display the frame currently bound to df.
df

In [None]:
# Distribution of predicted (AutoML) vs. true savings over all buildings.
plt.figure(figsize=(3, 2))
saving_columns = ['predicted_saving_ag', 'true_saving']
df_savings.loc[:, saving_columns].boxplot()

plt.ylim([-25, 50])
tick_positions, tick_labels = [1, 2], ['ML', 'True']
plt.xticks(tick_positions, tick_labels)
plt.yticks(range(-20, 50, 10))
plt.ylabel('Energy Savings (%)')

plt.tight_layout()
plt.savefig('energy_savings_predicted.pdf')

In [None]:
from sklearn.decomposition import PCA

from shutil import rmtree
import itertools

# remove previously stored AutoGluon models, if any
rmtree('AutogluonModels', ignore_errors=True)

n_components = 1
n_samples = 100

# use the merged frame and keep only rows that carry an AutoGluon prediction
df = load_data_with_predictions(covid_years=True).dropna(subset=['autogluon_prediction'])

features_building = ['OID', 'energietraeger', 'area', 'kategorie', 'has_egain']
features_weather = ['KF', 'TM', 'SO', 'NM', 'FM', 'RFM']
features = features_building + features_weather
target = ["kWh"]

weather_values = df[features_weather].values
pca = PCA(n_components=n_components, whiten=True).fit(weather_values)
print(f'Explained Variance Ratio with {n_components}: {pca.explained_variance_ratio_}')

# sample the single latent axis evenly and project the samples back to weather space
latents = np.linspace(-5, 5, n_samples)
latent_column = latents.reshape(len(latents), 1)
synthesized_weather = pca.inverse_transform(latent_column)

# side-by-side value ranges of synthetic and observed weather
range_rows = [
    ('syn_min', synthesized_weather.min(axis=0)),
    ('real_min', weather_values.min(axis=0)),
    ('syn_max', synthesized_weather.max(axis=0)),
    ('real_max', weather_values.max(axis=0)),
]
comparison_syn_real = pd.DataFrame([values for _, values in range_rows],
                                   columns=features_weather,
                                   index=[name for name, _ in range_rows])
comparison_syn_real

In [None]:
# Scenario for a single building: pretend its data arrives as user input, retrain
# without its stored rows, and predict its consumption over synthetic weather with
# and without e-gain. Relies on features, features_weather, synthesized_weather,
# rmtree and TabularPredictor being defined by earlier cells.
df = load_data_with_predictions()

# building_id = 217601 230717
target = ['kWh']

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import matplotlib

matplotlib.rcParams['lines.linewidth'] = 2
matplotlib.rcParams['figure.figsize'] = [12,6]

building_id = 230717

# rows of the selected building; these play the role of the Web UI input
idx = df['OID'] == building_id
if not idx.any():
    raise ValueError(f'building {building_id} is not contained in the data')

# static building properties, taken from the building's first row
area = df.loc[idx, 'area'].to_list()[0]
category = df.loc[idx, 'kategorie'].to_list()[0]
energietraeger = df.loc[idx, 'energietraeger'].to_list()[0]

# year and corresponding consumption as they would be entered by the user
years_and_kWh = df.loc[idx, ['year', 'kWh']]

# training rows from the "database": everything except the selected building
train_df_db = df.loc[~idx, features + target]

# rebuild the building's training rows from the user input plus that year's weather
train_data_user_input = []
for _, row in years_and_kWh.iterrows():
    year_weather = (df.loc[df['year'] == row['year'], features_weather]
                    .head(n=1)
                    .reset_index(drop=True))
    # NOTE(review): every user-input row is marked has_egain=False, whatever the
    # building's recorded state in that year - confirm this is intended.
    train_data_user_input.append(year_weather.assign(
        OID=building_id,
        area=area,
        kategorie=category,
        has_egain=False,
        energietraeger=energietraeger,
        kWh=row['kWh']))

train_df = pd.concat([pd.concat(train_data_user_input), train_df_db])

# prediction grid: every synthetic weather sample once without and once with e-gain
weather = pd.DataFrame(synthesized_weather, columns=features_weather)
scenario = weather.assign(
    kWh=None,
    OID=building_id,
    area=area,
    kategorie=category,
    energietraeger=energietraeger)
sampled_df = pd.concat([scenario.assign(has_egain=False), scenario.assign(has_egain=True)],
                       axis=0, ignore_index=True)
test_df = sampled_df.copy()

pred = TabularPredictor(label=target[0],
                        problem_type='regression',
                        verbosity=0).fit(train_data=train_df[features + target],
                                         time_limit=10,
                                         presets='best_quality')
test_df['autogluon_prediction'] = pred.predict(test_df)

# ignore_errors matches the clean-up calls in the earlier cells
rmtree('AutogluonModels', ignore_errors=True)

plt.figure()

# Labels are attached to each series explicitly: has_egain == True is the retrofit case.
retrofit = test_df['has_egain'] == True
plt.plot(
    test_df.loc[retrofit, 'TM'],
    test_df.loc[retrofit, 'autogluon_prediction'] / 1e6, 'o',
    label='Prediction AutoML Retrofit')
plt.plot(
    test_df.loc[~retrofit, 'TM'],
    test_df.loc[~retrofit, 'autogluon_prediction'] / 1e6, 'o',
    label='Prediction AutoML')

# weather samples with the lowest / highest average temperature
coldest = weather.loc[weather['TM'].idxmin()]
warmest = weather.loc[weather['TM'].idxmax()]

for has_egain, bbsr_label in [(False, 'Prediction $BBSR_O$'),
                              (True, 'Prediction $BBSR_O$ Retrofit')]:
    bbsr_prediction = df.loc[idx & (df['has_egain'] == has_egain), 'prediction_bbsr_o']
    if bbsr_prediction.empty:
        # the building has no rows for this state, so there is no BBSR line to draw
        continue
    plt.plot(
        [coldest['TM'], warmest['TM']],
        [bbsr_prediction.values[0] / (1e6 * coldest['KF']),
         bbsr_prediction.values[0] / (1e6 * warmest['KF'])],
        '--', label=bbsr_label)

from_user_input = train_df['OID'] == building_id
plt.plot(
    train_df.loc[from_user_input, 'TM'],
    train_df.loc[from_user_input, 'kWh'] / (1e6), 'o',
    label='train input')

plt.legend()
plt.xlabel('Average Temperature / a')
plt.ylabel('MWh/a')
plt.title(f'category: {category}, area: {area} m^2')
plt.savefig(f'synth_weather_{category}_{building_id}_{area}.pdf')