In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd

from hickathon_2023.data_preprocessing import get_data_preprocessor
from hickathon_2023.feature_extraction import FeatureExtractor
from hickathon_2023.model import get_model
from hickathon_2023.utils import Pipeline, load_data

# When True the notebook reads from "datasets/small/", otherwise from "datasets/".
SMALL_DATASET = True
# Name of the target: passed to load_data as y_feature and used as the
# prediction column of the submission file.
Y_FEATURE = "energy_consumption_per_annum"

In [2]:
# Root directory of the selected dataset. The trailing slash matters: file
# names are appended to this value by plain string concatenation.
path = "datasets/small/" if SMALL_DATASET else "datasets/"


def get_data():
    """Load the training features, training target and test features.

    File locations are built from the module-level ``path`` and the target
    column is selected with the module-level ``Y_FEATURE``.

    Returns:
        A ``(X_train, y_train, X_test)`` tuple, in that order.
    """
    train_features_csv = path + "train/train_features_sent.csv"
    train_labels_csv = path + "train/train_labels_sent.csv"
    test_features_csv = path + "test/test_features_sent.csv"

    features_train, target_train = load_data(
        X_path=train_features_csv,
        y_path=train_labels_csv,
        y_feature=Y_FEATURE,
    )
    # No label file is passed for the test split, so the second value
    # returned by load_data is discarded.
    features_test, _ = load_data(X_path=test_features_csv)

    return features_train, target_train, features_test


# Load the training features/target and the test features for the
# step-by-step cells that follow.
X_train, y_train, X_test = get_data()

# 1. Feature extraction

In [3]:
# Preview the first rows of the training features as loaded.
X_train.head()

Unnamed: 0,level_0,additional_heat_generators,additional_water_heaters,altitude,area_code,balcony_depth,bearing_wall_material,building_category,building_class,building_height_ft,...,wall_insulation_type,water_heaters,water_heating_energy_source,water_heating_type,window_filling_type,window_frame_material,window_glazing_type,window_heat_retention_factor,window_orientation,window_thermal_conductivity
0,1002634,[],,177.16536,654,,CONCRETE,[condo],[2 to 11],42.322836,...,internal,[electric storage tank],electricity,individual,argon or krypton,pvc,double glazing,1.116894,"[east or west,north]",16.057267
1,477851,[wood stove or insert],electric water heating,91.86352,354,,AGGLOMERATE,[individual house],[individual],12.795276,...,internal,"[solar water heater,electric storage tank]",electricity,individual,argon or krypton,metal without thermal break,double glazing,1.289049,"[est,north,sud]",22.149689
2,822040,[],,823.49084,347,,CONCRETE,[individual house],[individual],15.748032,...,internal,[low temperature gas boiler],gas,individual,,wood,single glazing,1.212343,"[north,sud]",23.781613
3,1051703,[],,341.20736,815,,INDETERMINED,"[condo,individual house]","[12+,2 to 11]",36.08924,...,internal,[electric storage tank],electricity,individual,dry air,pvc,double glazing,1.116894,"[est,sud]",15.347292
4,436651,[],thermodynamic electric hot water (heat pump or...,869.4226,659,,AGGLOMERATE,[individual house],[2 to 11],13.451444,...,internal,"[standard gas boiler,thermodynamic electric ho...",thermodynamic electric hot water (pac or tank)...,individual,argon or krypton,wood,double glazing,1.599851,"[est,west,sud]",13.290529


In [4]:
# Run the project's FeatureExtractor on the training features and preview the result.
# NOTE: X_train is rebound to the transformed frame, so re-running this cell
# feeds already-transformed data back into the extractor.
feature_extractor = FeatureExtractor()
X_train = feature_extractor.transform(X_train)
X_train.head()

Unnamed: 0,level_0,additional_heat_generators,additional_water_heaters,altitude,area_code,balcony_depth,bearing_wall_material,building_category,building_class,building_height_ft,...,water_heaters,water_heating_energy_source,water_heating_type,window_filling_type,window_frame_material,window_glazing_type,window_heat_retention_factor,window_orientation,window_thermal_conductivity,year
0,1002634,[],,177.16536,654,,CONCRETE,[condo],[2 to 11],42.322836,...,[electric storage tank],electricity,individual,argon or krypton,pvc,double glazing,1.116894,"[east or west,north]",16.057267,2018
1,477851,[wood stove or insert],electric water heating,91.86352,354,,AGGLOMERATE,[individual house],[individual],12.795276,...,"[solar water heater,electric storage tank]",electricity,individual,argon or krypton,metal without thermal break,double glazing,1.289049,"[est,north,sud]",22.149689,2016
2,822040,[],,823.49084,347,,CONCRETE,[individual house],[individual],15.748032,...,[low temperature gas boiler],gas,individual,,wood,single glazing,1.212343,"[north,sud]",23.781613,2018
3,1051703,[],,341.20736,815,,INDETERMINED,"[condo,individual house]","[12+,2 to 11]",36.08924,...,[electric storage tank],electricity,individual,dry air,pvc,double glazing,1.116894,"[est,sud]",15.347292,2013
4,436651,[],thermodynamic electric hot water (heat pump or...,869.4226,659,,AGGLOMERATE,[individual house],[2 to 11],13.451444,...,"[standard gas boiler,thermodynamic electric ho...",thermodynamic electric hot water (pac or tank)...,individual,argon or krypton,wood,double glazing,1.599851,"[est,west,sud]",13.290529,2017


# 2. Data preprocessing

In [5]:
# Build the project's preprocessor, then fit it on the extracted training
# features and transform them in a single step.
# NOTE: X_train is rebound again; the preview below shows the result is an
# array rather than a DataFrame, so this cell cannot be re-run on its own output.
data_preprocessor = get_data_preprocessor()
X_train = data_preprocessor.fit_transform(X_train)
X_train[:5]

array([[-0.40744041286936356, 2018, False, 1],
       [-0.011971285209031254, 2016, False, 3],
       [-0.0910651107410977, 2018, False, 4],
       [-0.15038547989014758, 2013, False, 3],
       [-0.011971285209031254, 2017, False, 2]], dtype=object)

# 3. Complete model

In [6]:
# Reload the data: X_train was overwritten by the exploratory cells above.
# The freshly loaded features are passed straight to the model (presumably
# get_model() bundles the extraction and preprocessing steps — confirm in
# hickathon_2023.model).
X_train, y_train, X_test = get_data()
model = get_model()
model.fit(X_train, y_train)

In [7]:
# Predict the target for the test features with the fitted model.
y_test_pred = model.predict(X_test)

In [8]:
# Fill the sample submission with the predictions and write it into the
# dataset directory.
# NOTE(review): to_csv writes the DataFrame index as an extra leading column
# by default — confirm the expected submission format, or pass index=False.
submission_df = pd.read_csv(path + "sample_submission_sent.csv")
submission_df[Y_FEATURE] = y_test_pred
submission_df.to_csv(path + "submission_sent.csv")