In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from hickathon_2023.data_preprocessing import get_data_preprocessor
from hickathon_2023.feature_extraction import FeatureExtractor
from hickathon_2023.model import get_model
from hickathon_2023.utils import Pipeline, load_data

# Toggle between the reduced dataset under "datasets/small/" and the full one
# under "datasets/"; it also selects which .npy file names are written later.
SMALL_DATASET = True
# Passed to load_data as `y_feature` when reading the training labels.
Y_FEATURE = "energy_consumption_per_annum"

In [2]:
# Root folder of the CSV inputs: the reduced dataset when SMALL_DATASET is set,
# otherwise the full one.
path = "datasets/small/" if SMALL_DATASET else "datasets/"


def get_data():
    """Read the training features/labels and the test features located under `path`.

    Returns:
        A tuple ``(X_train, y_train, X_test)``. The second value returned by
        `load_data` for the test set is discarded.
    """
    train_features, train_labels = load_data(
        X_path=f"{path}train/train_features_sent.csv",
        y_path=f"{path}train/train_labels_sent.csv",
        y_feature=Y_FEATURE,
    )
    # No label file is passed for the test split, so only the features are kept.
    test_features, _unused = load_data(X_path=f"{path}test/test_features_sent.csv")

    return train_features, train_labels, test_features


X_train, y_train, X_test = get_data()

# 1. Feature extraction

In [3]:
# Preview the first rows of the raw training features, before feature extraction.
X_train.head()

Unnamed: 0,level_0,additional_heat_generators,additional_water_heaters,altitude,area_code,balcony_depth,bearing_wall_material,building_category,building_class,building_height_ft,...,wall_insulation_type,water_heaters,water_heating_energy_source,water_heating_type,window_filling_type,window_frame_material,window_glazing_type,window_heat_retention_factor,window_orientation,window_thermal_conductivity
0,19594,[],,177.165359,654,,CONCRETE,[condo],[2 to 11],42.322838,...,internal,[electric storage tank],electricity,individual,argon or krypton,pvc,double glazing,1.116894,"[east or west,north]",16.057268
1,19099,[wood stove or insert],electric water heating,91.863518,354,,AGGLOMERATE,[individual house],[individual],12.795276,...,internal,"[solar water heater,electric storage tank]",electricity,individual,argon or krypton,metal without thermal break,double glazing,1.289049,"[est,north,sud]",22.149689
2,-29928,[],,823.490845,347,,CONCRETE,[individual house],[individual],15.748032,...,internal,[low temperature gas boiler],gas,individual,,wood,single glazing,1.212343,"[north,sud]",23.781612
3,3127,[],,341.207367,815,,INDETERMINED,"[condo,individual house]","[12+,2 to 11]",36.089241,...,internal,[electric storage tank],electricity,individual,dry air,pvc,double glazing,1.116894,"[est,sud]",15.347292
4,-22101,[],thermodynamic electric hot water (heat pump or...,869.422607,659,,AGGLOMERATE,[individual house],[2 to 11],13.451444,...,internal,"[standard gas boiler,thermodynamic electric ho...",thermodynamic electric hot water (pac or tank)...,individual,argon or krypton,wood,double glazing,1.599851,"[est,west,sud]",13.290529


In [4]:
# Instantiate the project's feature extractor; the same instance is applied to
# the test features later in the notebook.
feature_extractor = FeatureExtractor()
# NOTE: X_train is rebound to the transformed frame, so re-running this cell
# without reloading the data feeds already-transformed features back in.
X_train = feature_extractor.transform(X_train)
X_train.head()

Unnamed: 0,level_0,altitude,area_code,balcony_depth,building_height_ft,building_total_area_sqft,building_type,building_use_type_code,building_use_type_description,building_year,...,water_heating_electricity,water_heating_wood,water_heating_network,water_heating_coal,window_orientation_north,window_orientation_east,window_orientation_south,window_orientation_west,lower_year_building,upper_year_building
0,19594,177.165359,654,0,42.322838,3562.85083,Flat,2,Residential multi-family,2002.0,...,True,False,False,False,True,True,False,True,1989.0,1999.0
1,19099,91.863518,354,0,12.795276,1754.515747,House,1,Residential single-family,1985.0,...,True,False,False,False,True,True,True,False,1970.0,1988.0
2,-29928,823.490845,347,0,15.748032,968.750977,House,1,Residential single-family,1970.0,...,False,False,False,False,True,False,True,False,1949.0,1970.0
3,3127,341.207367,815,0,36.089241,,Flat,2,Residential multi-family,1993.0,...,True,False,False,False,False,True,True,False,1989.0,1999.0
4,-22101,869.422607,659,0,13.451444,1162.501221,House,1,Residential single-family,1965.0,...,True,False,False,False,False,True,True,True,1949.0,1970.0


# 2. Data preprocessing

In [5]:
# from sklearn.impute import SimpleImputer
# from sklearn.compose import ColumnTransformer

# inputer = ColumnTransformer(
#     transformers=[
#         ("inputer_median", SimpleImputer(strategy="median"),
#                 [
#                     "altitude",
#                     "building_height_ft",
#                     "building_total_area_sqft",
#                     "living_area_sqft",
#                     "lowe_floor_thermal_conductivity",
#                     "outer_wall_thermal_conductivity",
#                     "percentage_glazed_surfaced",
#                     "upper_floor_thermal_conductivity",
#                     "window_heat_retention_factor",
#                     "window_thermal_conductivity",
#                 ])])
# inputer.fit_transform(X_train).isna().sum()

In [6]:
# Fit the project's preprocessor on the training features and transform them in
# one step; the fitted object is reused (transform only) on the test features.
data_preprocessor = get_data_preprocessor()
X_train = data_preprocessor.fit_transform(X_train)
# Show (rows, columns) after preprocessing.
X_train.shape

(10107, 89)

In [7]:
# Display the preprocessed training frame (the rendering below is truncated).
X_train

Unnamed: 0,add_heat_generators_boiler,add_heat_generators_heat_pump,add_heat_generators_stove,altitude,area_code,balcony_depth,bearing_wall_agglomerate,bearing_wall_bricks,bearing_wall_chipboard,bearing_wall_concrete,...,water_heating_wood,window_filling_type,window_frame_material,window_glazing_type,window_heat_retention_factor,window_orientation_east,window_orientation_north,window_orientation_south,window_orientation_west,window_thermal_conductivity
0,False,False,False,177.165359,654,0,False,False,False,True,...,False,2,-1,1,1.116894,True,True,False,True,16.057268
1,False,False,True,91.863518,354,0,True,False,False,False,...,False,2,2,1,1.289049,True,True,True,False,22.149689
2,False,False,False,823.490845,347,0,False,False,False,True,...,False,0,1,0,1.212343,False,True,True,False,23.781612
3,False,False,False,341.207367,815,0,False,False,False,False,...,False,1,-1,1,1.116894,True,False,True,False,15.347292
4,False,False,False,869.422607,659,0,True,False,False,False,...,False,2,1,1,1.599851,True,False,True,True,13.290529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10102,False,False,False,580.708679,637,0,False,False,False,False,...,False,0,1,0,1.212343,True,False,True,False,28.040001
10103,False,False,False,78.740158,356,0,False,True,False,False,...,False,1,1,0,1.212343,True,True,False,True,25.454838
10104,False,False,False,78.740158,156,0,False,False,False,False,...,False,0,-1,1,1.084204,True,False,False,True,16.779108
10105,False,False,False,291.994751,660,0,False,True,False,False,...,False,1,1,1,1.133072,True,True,True,False,14.649374


Imputer

In [8]:
# Continuous measurements: missing entries are replaced by the column median.
median_filled_columns = (
    "altitude",
    "building_height_ft",
    "building_total_area_sqft",
    "living_area_sqft",
    "lowe_floor_thermal_conductivity",
    "outer_wall_thermal_conductivity",
    "percentage_glazed_surfaced",
    "upper_floor_thermal_conductivity",
    "window_heat_retention_factor",
    "window_thermal_conductivity",
)
# Unit counts: a missing entry is replaced by 0.
zero_filled_columns = (
    "nb_commercial_units",
    "nb_dwellings",
    "nb_housing_units",
)

for name in median_filled_columns:
    values = X_train[name]
    X_train[name] = values.fillna(values.median())

for name in zero_filled_columns:
    X_train[name] = X_train[name].fillna(0)

# Missing use-type codes are replaced by the constant 1.
X_train["building_use_type_code"] = X_train["building_use_type_code"].fillna(1)

Convert dtypes

In [9]:
# convert_dtypes is a project helper; its exact casting rules are not visible
# from this notebook.
from hickathon_2023.data_preprocessing import convert_dtypes

# Rebind X_train to the helper's output, after imputation and before encoding.
X_train = convert_dtypes(X_train)

OneHotEncoder

In [10]:
from sklearn.preprocessing import OneHotEncoder
from hickathon_2023.features import FEATURES_ONEHOT
import numpy as np

# One-hot encode the columns listed in FEATURES_ONEHOT and append the resulting
# boolean indicator columns to the remaining features as a single NumPy array.
#
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, where it raises a
# TypeError. Try the current spelling first and fall back for older releases.
try:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=bool)
except TypeError:  # scikit-learn < 1.2 only knows `sparse`
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False, dtype=bool)

print(X_train.shape)

# The encoder is fitted here on the training data only; the fitted instance is
# reused to transform the test features later in the notebook.
X_train_onehot = encoder.fit_transform(X_train[FEATURES_ONEHOT])
# From here on X_train is a NumPy array, no longer a DataFrame.
X_train = np.concatenate(
    [X_train.drop(columns=FEATURES_ONEHOT).to_numpy(), X_train_onehot], axis=1
)

print(X_train.shape)

(10107, 89)
(10107, 221)


In [11]:
# Persist the encoded training matrix; the file name reflects the dataset size.
train_output_file = (
    "datasets/train/X_small.npy" if SMALL_DATASET else "datasets/train/X_big.npy"
)
np.save(train_output_file, X_train)

# X_test

In [12]:
# Apply to the test features the same steps used for the training features:
# feature extraction -> preprocessing -> imputation -> dtype conversion ->
# one-hot encoding. The shape is printed after each stage.
median_filled_columns = (
    "altitude",
    "building_height_ft",
    "building_total_area_sqft",
    "living_area_sqft",
    "lowe_floor_thermal_conductivity",
    "outer_wall_thermal_conductivity",
    "percentage_glazed_surfaced",
    "upper_floor_thermal_conductivity",
    "window_heat_retention_factor",
    "window_thermal_conductivity",
)
zero_filled_columns = (
    "nb_commercial_units",
    "nb_dwellings",
    "nb_housing_units",
)

print(X_test.shape)

X_test = feature_extractor.transform(X_test)

print(X_test.shape)

# Transform only: the preprocessor was already fitted on the training features.
X_test = data_preprocessor.transform(X_test)

print(X_test.shape)

# NOTE(review): the medians below come from the test set itself, whereas the
# training cell used training medians. Confirm whether the training medians
# should be reused here instead.
for name in median_filled_columns:
    values = X_test[name]
    X_test[name] = values.fillna(values.median())

for name in zero_filled_columns:
    X_test[name] = X_test[name].fillna(0)

X_test["building_use_type_code"] = X_test["building_use_type_code"].fillna(1)

X_test = convert_dtypes(X_test)

print(X_test.shape)

# Reuse the encoder fitted on the training data (transform only).
X_test_onehot = encoder.transform(X_test[FEATURES_ONEHOT])
X_test_remaining = X_test.drop(columns=FEATURES_ONEHOT).to_numpy()
X_test = np.concatenate([X_test_remaining, X_test_onehot], axis=1)

print(X_test.shape)

(4949, 71)
(4949, 112)
(4949, 89)
(4949, 89)
(4949, 221)


In [13]:
# Persist the encoded *test* matrix. The original cell passed X_train here, so
# the X_test_*.npy files silently contained the training data.
if SMALL_DATASET:
    np.save("datasets/test/X_test_small.npy", X_test)
else:
    np.save("datasets/test/X_test_big.npy", X_test)

In [14]:
# df = pd.DataFrame(X_train)

In [15]:
# df.isna().sum()[df.isna().sum() > 0]

In [16]:
# df_test = pd.DataFrame(X_test)

In [17]:
# df_test.isna().sum()[df_test.isna().sum() > 0]

In [18]:
# submission_df = pd.read_csv(path + "sample_submission_sent.csv")
# submission_df[Y_FEATURE] = y_test_pred
# submission_df.to_csv(path + "submission_sent.csv")

In [19]:
# for i in range(df.shape[0]):
#     for j in range(df.shape[1]):
#         if type(df.iloc[i, j]) == str:
#             print(df.iloc[i, j])