In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from hickathon_2023.data_preprocessing import get_data_preprocessor
from hickathon_2023.feature_extraction import FeatureExtractor
from hickathon_2023.model import get_model
from hickathon_2023.utils import Pipeline, load_data

# When True the notebook reads from "datasets/small/", otherwise from "datasets/".
SMALL_DATASET = True
# Forwarded to load_data as y_feature and used as the prediction column name
# when the submission file is written.
Y_FEATURE = "energy_consumption_per_annum"

In [2]:
# Directory prefix (trailing slash included) that every dataset file name is appended to.
path = "datasets/small/" if SMALL_DATASET else "datasets/"


def get_data():
    """Load the training features, training labels and test features.

    File locations are built by appending fixed file names to the module-level
    ``path`` prefix; ``Y_FEATURE`` is forwarded to ``load_data`` as ``y_feature``.

    Returns:
        A ``(X_train, y_train, X_test)`` tuple taken from the two ``load_data`` calls.
    """
    train_features_csv = path + "train/train_features_sent.csv"
    train_labels_csv = path + "train/train_labels_sent.csv"
    train_features, train_labels = load_data(
        X_path=train_features_csv,
        y_path=train_labels_csv,
        y_feature=Y_FEATURE,
    )

    # Only a feature file is passed for the test split; the second return value is discarded.
    test_features_csv = path + "test/test_features_sent.csv"
    test_features, _ = load_data(X_path=test_features_csv)

    return train_features, train_labels, test_features


X_train, y_train, X_test = get_data()

# 1. Feature extraction

In [3]:
# Preview the first rows of the training features as returned by get_data().
X_train.head()

Unnamed: 0,level_0,additional_heat_generators,additional_water_heaters,altitude,area_code,balcony_depth,bearing_wall_material,building_category,building_class,building_height_ft,...,wall_insulation_type,water_heaters,water_heating_energy_source,water_heating_type,window_filling_type,window_frame_material,window_glazing_type,window_heat_retention_factor,window_orientation,window_thermal_conductivity
0,1002634,[],,177.16536,654,,CONCRETE,[condo],[2 to 11],42.322836,...,internal,[electric storage tank],electricity,individual,argon or krypton,pvc,double glazing,1.116894,"[east or west,north]",16.057267
1,477851,[wood stove or insert],electric water heating,91.86352,354,,AGGLOMERATE,[individual house],[individual],12.795276,...,internal,"[solar water heater,electric storage tank]",electricity,individual,argon or krypton,metal without thermal break,double glazing,1.289049,"[est,north,sud]",22.149689
2,822040,[],,823.49084,347,,CONCRETE,[individual house],[individual],15.748032,...,internal,[low temperature gas boiler],gas,individual,,wood,single glazing,1.212343,"[north,sud]",23.781613
3,1051703,[],,341.20736,815,,INDETERMINED,"[condo,individual house]","[12+,2 to 11]",36.08924,...,internal,[electric storage tank],electricity,individual,dry air,pvc,double glazing,1.116894,"[est,sud]",15.347292
4,436651,[],thermodynamic electric hot water (heat pump or...,869.4226,659,,AGGLOMERATE,[individual house],[2 to 11],13.451444,...,internal,"[standard gas boiler,thermodynamic electric ho...",thermodynamic electric hot water (pac or tank)...,individual,argon or krypton,wood,double glazing,1.599851,"[est,west,sud]",13.290529


In [4]:
# Run the project's FeatureExtractor over the training features and preview the result.
# NOTE(review): X_train is rebound to the transformed frame, so re-running this
# cell feeds already-transformed data back into transform(). The original
# frame is only recoverable by calling get_data() again.
feature_extractor = FeatureExtractor()
X_train = feature_extractor.transform(X_train)
X_train.head()

Unnamed: 0,level_0,altitude,area_code,balcony_depth,building_height_ft,building_total_area_sqft,building_type,building_use_type_code,building_use_type_description,building_year,...,water_heating_electricity,water_heating_wood,water_heating_network,water_heating_coal,window_orientation_north,window_orientation_east,window_orientation_south,window_orientation_west,lower_year_building,upper_year_building
0,1002634,177.16536,654,0,42.322836,3562.8509,Flat,2,Residential multi-family,2002.0,...,True,False,False,False,True,True,False,True,1989.0,1999.0
1,477851,91.86352,354,0,12.795276,1754.5157,House,1,Residential single-family,1985.0,...,True,False,False,False,True,True,True,False,1970.0,1988.0
2,822040,823.49084,347,0,15.748032,968.751,House,1,Residential single-family,1970.0,...,False,False,False,False,True,False,True,False,1949.0,1970.0
3,1051703,341.20736,815,0,36.08924,,Flat,2,Residential multi-family,1993.0,...,True,False,False,False,False,True,True,False,1989.0,1999.0
4,436651,869.4226,659,0,13.451444,1162.5012,House,1,Residential single-family,1965.0,...,True,False,False,False,False,True,True,True,1949.0,1970.0


# 2. Data preprocessing

In [5]:
# Fit the preprocessor returned by get_data_preprocessor() on the extracted
# features, apply it, and show the resulting shape.
# NOTE(review): X_train is rebound once more, so this cell is not idempotent
# either; it must run exactly once, after the feature-extraction cell.
data_preprocessor = get_data_preprocessor()
X_train = data_preprocessor.fit_transform(X_train)
X_train.shape

(10107, 278)

In [6]:
# Peek at the first five preprocessed rows.
X_train[:5]

array([[0.0, 0.0, 0.0, ..., False, True, 16.05726725794424],
       [0.0, 0.0, 0.0, ..., True, False, 22.14968866724145],
       [0.0, 0.0, 0.0, ..., True, False, 23.781612843288865],
       [0.0, 0.0, 0.0, ..., True, False, 15.347291750768129],
       [0.0, 0.0, 0.0, ..., True, True, 13.290529050852152]], dtype=object)

# 3. Complete model

In [7]:
# Reload the data (X_train was overwritten by the exploratory cells above) and
# fit the model returned by get_model() on it.
# NOTE(review): the recorded output of this cell is
# "ValueError: Input X contains NaN" reported for LinearRegression, so the
# model presumably lacks an imputation step for some inputs; confirm and fix
# in hickathon_2023.model before relying on the cells below.
X_train, y_train, X_test = get_data()
model = get_model()
model.fit(X_train, y_train)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Predict the target for the test features with the model fitted in the previous cell.
y_test_pred = model.predict(X_test)

In [None]:
# Load the provided sample submission and set its Y_FEATURE column to our predictions.
submission_df = pd.read_csv(path + "sample_submission_sent.csv")
submission_df[Y_FEATURE] = y_test_pred
# index=False: the sample was read without index_col, so the frame's index is
# only pandas' default row counter. Writing it would prepend an unnamed column
# that the sample submission file does not contain.
submission_df.to_csv(path + "submission_sent.csv", index=False)