In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from hickathon_2023.data_preprocessing import get_data_preprocessor
from hickathon_2023.feature_extraction import FeatureExtractor
from hickathon_2023.model import get_model
from hickathon_2023.utils import Pipeline, load_data

# Switch between the reduced dataset under "datasets/small/" (True) and the
# full dataset under "datasets/" (False); it also selects the *_small / *_big
# names of the .npy files written further down.
SMALL_DATASET = True
# Passed to load_data as `y_feature` when reading the training labels
# (presumably the name of the target column — confirm against load_data).
Y_FEATURE = "energy_consumption_per_annum"

In [2]:
# Root folder of the CSV inputs: the reduced copy when SMALL_DATASET is set,
# otherwise the full dataset.
path = "datasets/small/" if SMALL_DATASET else "datasets/"


def get_data():
    """Read the training features/labels and the test features found under ``path``.

    Uses the module-level ``path`` prefix and ``Y_FEATURE`` name.

    Returns:
        tuple: ``(X_train, y_train, X_test)`` exactly as handed back by ``load_data``.
    """
    train_features_csv = path + "train/train_features_sent.csv"
    train_labels_csv = path + "train/train_labels_sent.csv"
    test_features_csv = path + "test/test_features_sent.csv"

    train_features, train_labels = load_data(
        X_path=train_features_csv,
        y_path=train_labels_csv,
        y_feature=Y_FEATURE,
    )
    # No label file is given for the test split, so the second value is ignored.
    test_features, _ = load_data(X_path=test_features_csv)

    return train_features, train_labels, test_features


X_train, y_train, X_test = get_data()

# 1. Feature extraction

In [3]:
# Preview the first rows of the training features as loaded, before feature extraction.
X_train.head()

Unnamed: 0,level_0,additional_heat_generators,additional_water_heaters,altitude,area_code,balcony_depth,bearing_wall_material,building_category,building_class,building_height_ft,...,wall_insulation_type,water_heaters,water_heating_energy_source,water_heating_type,window_filling_type,window_frame_material,window_glazing_type,window_heat_retention_factor,window_orientation,window_thermal_conductivity
0,19594,[],,177.165359,654,,CONCRETE,[condo],[2 to 11],42.322838,...,internal,[electric storage tank],electricity,individual,argon or krypton,pvc,double glazing,1.116894,"[east or west,north]",16.057268
1,19099,[wood stove or insert],electric water heating,91.863518,354,,AGGLOMERATE,[individual house],[individual],12.795276,...,internal,"[solar water heater,electric storage tank]",electricity,individual,argon or krypton,metal without thermal break,double glazing,1.289049,"[est,north,sud]",22.149689
2,-29928,[],,823.490845,347,,CONCRETE,[individual house],[individual],15.748032,...,internal,[low temperature gas boiler],gas,individual,,wood,single glazing,1.212343,"[north,sud]",23.781612
3,3127,[],,341.207367,815,,INDETERMINED,"[condo,individual house]","[12+,2 to 11]",36.089241,...,internal,[electric storage tank],electricity,individual,dry air,pvc,double glazing,1.116894,"[est,sud]",15.347292
4,-22101,[],thermodynamic electric hot water (heat pump or...,869.422607,659,,AGGLOMERATE,[individual house],[2 to 11],13.451444,...,internal,"[standard gas boiler,thermodynamic electric ho...",thermodynamic electric hot water (pac or tank)...,individual,argon or krypton,wood,double glazing,1.599851,"[est,west,sud]",13.290529


In [4]:
# Instantiate the project's FeatureExtractor and apply it to the training features.
# NOTE: X_train is rebound to the transformed frame, so re-running this cell on
# its own would pass already-transformed data back into transform().
feature_extractor = FeatureExtractor()
X_train = feature_extractor.transform(X_train)
X_train.head()

Unnamed: 0,level_0,altitude,area_code,balcony_depth,building_height_ft,building_total_area_sqft,building_type,building_use_type_code,building_use_type_description,building_year,...,water_heating_network,water_heating_coal,window_orientation_north,window_orientation_east,window_orientation_south,window_orientation_west,lower_year_building,upper_year_building,living_to_building_area_ratio,wall_area_by_conductivity
0,19594,177.165359,654,0,42.322838,3562.85083,Flat,2,Residential multi-family,2002.0,...,False,False,True,True,False,True,1989.0,1999.0,0.090634,578250.014173
1,19099,91.863518,354,0,12.795276,1754.515747,House,1,Residential single-family,1985.0,...,False,False,True,True,True,False,1970.0,1988.0,0.674847,175745.645845
2,-29928,823.490845,347,0,15.748032,968.750977,House,1,Residential single-family,1970.0,...,False,False,True,False,True,False,1949.0,1970.0,1.044444,160726.925496
3,3127,341.207367,815,0,36.089241,,Flat,2,Residential multi-family,1993.0,...,False,False,False,True,True,False,1989.0,1999.0,,
4,-22101,869.422607,659,0,13.451444,1162.501221,House,1,Residential single-family,1965.0,...,False,False,False,True,True,True,1949.0,1970.0,1.018519,228356.302939


# 2. Data preprocessing

In [5]:
# Obtain the project's preprocessor and call fit_transform on the training
# features; the same object is used later with transform() on X_test.
# NOTE: X_train is rebound here, so this cell is not idempotent when re-run.
data_preprocessor = get_data_preprocessor()
X_train = data_preprocessor.fit_transform(X_train)
X_train.shape



(10107, 263)

In [6]:
# Display the preprocessed training frame (the rendering is abbreviated for long/wide frames).
X_train

Unnamed: 0,area_code_109,area_code_127,area_code_156,area_code_158,area_code_159,area_code_162,area_code_172,area_code_173,area_code_178,area_code_191,...,water_heating_coal,water_heating_electricity,water_heating_gas,water_heating_network,water_heating_oil,water_heating_wood,window_orientation_east,window_orientation_north,window_orientation_south,window_orientation_west
0,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,True,False,True
1,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,True,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,True,False
3,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,True,True,False,False,False,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10102,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,False,True,False
10103,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,True,True,False,True
10104,False,False,True,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,False,False,True
10105,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,True,True,True,False


In [7]:
# Enumerate every column name of the preprocessed training frame.
X_train.columns.tolist()

['area_code_109',
 'area_code_127',
 'area_code_156',
 'area_code_158',
 'area_code_159',
 'area_code_162',
 'area_code_172',
 'area_code_173',
 'area_code_178',
 'area_code_191',
 'area_code_204',
 'area_code_205',
 'area_code_213',
 'area_code_226',
 'area_code_228',
 'area_code_260',
 'area_code_264',
 'area_code_281',
 'area_code_299',
 'area_code_304',
 'area_code_345',
 'area_code_347',
 'area_code_351',
 'area_code_354',
 'area_code_356',
 'area_code_369',
 'area_code_374',
 'area_code_375',
 'area_code_377',
 'area_code_380',
 'area_code_382',
 'area_code_386',
 'area_code_387',
 'area_code_398',
 'area_code_400',
 'area_code_419',
 'area_code_451',
 'area_code_467',
 'area_code_471',
 'area_code_492',
 'area_code_498',
 'area_code_502',
 'area_code_518',
 'area_code_520',
 'area_code_542',
 'area_code_547',
 'area_code_559',
 'area_code_573',
 'area_code_585',
 'area_code_592',
 'area_code_597',
 'area_code_603',
 'area_code_615',
 'area_code_616',
 'area_code_637',
 'area_cod

In [8]:
# Persist the preprocessed training features; the file name reflects which
# dataset size was processed.
# NOTE(review): np.save stores only the array values — the DataFrame's column
# labels are not written to the .npy file.
train_output_file = (
    "datasets/train/X_small.npy" if SMALL_DATASET else "datasets/train/X_big.npy"
)
np.save(train_output_file, X_train)

# 3. Transforming X_test

In [9]:
# Run the test features through the same two stages used for training
# (feature extraction, then preprocessing), printing the shape before the
# first stage and after each one.
print(X_test.shape)

for stage in (feature_extractor, data_preprocessor):
    X_test = stage.transform(X_test)
    print(X_test.shape)

(4949, 71)
(4949, 118)
(4949, 263)


In [10]:
# Persist the preprocessed test features; the file name reflects which
# dataset size was processed.
# NOTE(review): np.save stores only the array values — the DataFrame's column
# labels are not written to the .npy file.
test_output_file = (
    "datasets/test/X_test_small.npy" if SMALL_DATASET else "datasets/test/X_test_big.npy"
)
np.save(test_output_file, X_test)

In [11]:
# Count missing values per column once, then keep only the columns that have any.
train_nan_counts = X_train.isna().sum()
train_nan_counts[train_nan_counts > 0]

Series([], dtype: int64)

In [12]:
# Count missing values per column once, then keep only the columns that have any.
test_nan_counts = X_test.isna().sum()
test_nan_counts[test_nan_counts > 0]

Series([], dtype: int64)

In [13]:
# for j in range(X_train.shape[1]):
#     for i in range(X_train.shape[0]):
#         if type(X_train.iloc[i, j]) == str:
#             print(X_train.iloc[i, j])