In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from lightgbm.sklearn import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor, export_graphviz

from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor

### Define scoring metrics and CV score function

In [2]:
scoring_metrics = {
    "neg RMSE": "neg_root_mean_squared_error",
}

In [3]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

### Load CSV files

In [4]:
df = pd.read_csv('../data/train.csv')
X_test_submit = pd.read_csv('../data/test.csv')

### Any manual feature engineering before column transformation

In [5]:
facility_class = pd.read_csv("f_type.csv")
facility_class["facility_class"].unique()

array(['Retail', 'Warehouse', 'Educational', 'Warehouse_cold', 'Office',
       'Flex_space', 'Commercial', 'Industrial', 'Public_Assembly',
       'Hotel', 'Health_care', 'Services', 'Food_services', 'Residential',
       'Public_safety'], dtype=object)

In [6]:
df = pd.merge(df, facility_class, on="facility_type")

df.head(3)

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id,facility_class
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.5,...,0,0,0,1.0,1.0,1.0,,248.682615,0,Retail
1,1,State_1,Commercial,Grocery_store_or_food_market,67346.0,1967.0,26.0,1.8,36,50.5,...,0,0,0,1.0,,1.0,12.0,287.863448,24,Retail
2,1,State_1,Commercial,Grocery_store_or_food_market,124196.0,1954.0,44.0,1.8,36,50.5,...,0,0,0,1.0,,1.0,12.0,241.932986,25,Retail


In [7]:
df.shape

(75757, 65)

In [8]:
value = df["direction_max_wind_speed"]
df['dir_max_wind_speed'] = np.where(value > 337.5, "N",
                                np.where(value > 292.5, "NE",
                                        np.where(value > 247.5, "E",
                                                 np.where(value > 202.5, "SE",
                                                          np.where(value > 157.5, "S",
                                                                   np.where(value > 112.5, "SW",
                                                                            np.where(value > 67.5, "W",
                                                                                     np.where(value > 22.5, "NW", "N"))))))))

value = df["direction_peak_wind_speed"]
df['dir_peak_wind_speed'] = np.where(value > 337.5, "N",
                                np.where(value > 292.5, "NE",
                                        np.where(value > 247.5, "E",
                                                 np.where(value > 202.5, "SE",
                                                          np.where(value > 157.5, "S",
                                                                   np.where(value > 112.5, "SW",
                                                                            np.where(value > 67.5, "W",
                                                                                     np.where(value > 22.5, "NW", "N"))))))))

In [9]:
df.shape

(75757, 67)

In [10]:
df['dir_max_wind_speed'].unique()

array(['N', 'E', 'NE'], dtype=object)

In [11]:
df['dir_peak_wind_speed'].unique()

array(['N', 'NE', 'E'], dtype=object)

##### Checking the data I realized that the mean wind direction is 62 degrees which aligns with NE that we are getting above

In [12]:
# # The following merges the imputed energy_star_rating
# # This was done in R
# df.to_csv("../data/train_facility.csv") ## export for R MICE imputation
# energy_star_imp = pd.read_csv("energy_star_imp.csv")
# df = pd.merge(df, energy_star_imp, on="facility_class")
# df['energy_star_rating'] = df['energy_star_rating'].fillna(df.pop('energy_star_rating_imp'))

In [18]:
# Impute train set using KNN
to_train = df[~df["energy_star_rating"].isna()]
to_impute = df[df["energy_star_rating"].isna()]

X_train_KNN = to_train[["floor_area", "facility_class", "year_built"]]
y_train_KNN = to_train["energy_star_rating"]

X_test_KNN = to_impute[["floor_area", "facility_class", "year_built"]]

y_train_KNN

0        11.0
1        26.0
2        44.0
3        55.0
4        23.0
         ... 
75752    38.0
75753    69.0
75754    96.0
75755    73.0
75756    69.0
Name: energy_star_rating, Length: 49048, dtype: float64

In [28]:
numeric = ["floor_area", "year_built"]
categorical = ["facility_class"]

numeric_transformer = make_pipeline(StandardScaler())

categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=True),
)

KNN_preprocessor = make_column_transformer(
    (numeric, StandardScaler()),
    (categorical, categorical_transformer)
)

In [29]:
KNN_pipe = make_pipeline(
    KNN_preprocessor, KNeighborsRegressor(n_neighbors=5, n_jobs=-1)
)

In [30]:
results = {}
results["KNN_imputer"] = mean_std_cross_val_scores(
    KNN_pipe, X_train_KNN, y_train_KNN, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\artan\miniconda3\envs\site_energy_consumption_prediction\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\artan\miniconda3\envs\site_energy_consumption_prediction\lib\site-packages\sklearn\pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\artan\miniconda3\envs\site_energy_consumption_prediction\lib\site-packages\sklearn\pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users

Unnamed: 0,fit_time,score_time,test_neg RMSE,train_neg RMSE
KNN_imputer,0.003 (+/- 0.001),0.000 (+/- 0.000),nan (+/- nan),nan (+/- nan)


In [31]:
# Fit-predict
KNN_pipe.fit(X_train_KNN, y_train_KNN)
to_impute["energy_star_rating"] = KNN_pipe.predict(X_test_KNN)

TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '['floor_area', 'year_built']' (type <class 'list'>) doesn't.

In [None]:
# Putting the two df's together
X_train = pd.concat([to_train, to_impute], axis=0)
X_train.head()

### Group columns for transformations

In [None]:
target = "site_eui"

numeric_features = [
    "floor_area", # Grouped and moved to ordinary feature
    "year_built",
    "energy_star_rating", # Imputed by facility_class + site_eui, take the average per facility_class
    # "ELEVATION", 
    "january_min_temp",
    "january_avg_temp",
    "january_max_temp",
#    "february_min_temp", # removed similar temperature columns
#    "february_avg_temp",
#    "february_max_temp",
#    "march_min_temp",
#    "march_avg_temp",
#    "march_max_temp",
#    "april_min_temp",
#    "april_avg_temp",
#    "april_max_temp",
#    "may_min_temp",
#    "may_avg_temp",
#    "may_max_temp",
#    "june_min_temp",
#    "june_avg_temp",
#    "june_max_temp",
    "july_min_temp",
    "july_avg_temp",
    "july_max_temp",
    "august_min_temp",
    "august_avg_temp",
    "august_max_temp",
#    "september_min_temp", # removed similar temperature columns
#    "september_avg_temp",
#    "september_max_temp",
#    "october_min_temp",
#    "october_avg_temp",
#    "october_max_temp",
#    "november_min_temp",
#    "november_avg_temp",
#    "november_max_temp",
#    "december_min_temp",
#    "december_avg_temp",
#    "december_max_temp",
    "cooling_degree_days",
    "heating_degree_days",
    "precipitation_inches",
    "snowfall_inches",
    "snowdepth_inches",
    "avg_temp",
#    "days_below_30F",
    "days_below_20F",
#    "days_below_10F", 
#    "days_below_0F",
#    "days_above_80F",
    "days_above_90F",
#    "days_above_100F",
#    "days_above_110F",
#    "direction_max_wind_speed",
#    "direction_peak_wind_speed",
    "max_wind_speed",
    "days_with_fog" ##???
]

ordinal_features = [] #['ord_floor_area']
categorical_features = [
                        "Year_Factor",  # Moved this down from numeric 
                        "State_Factor",
                        "facility_class",
                        "facility_type",
                        "dir_max_wind_speed",  # Added new feature
                        "dir_peak_wind_speed"]  # Added

drop_features = [
    "id",
    "building_class", # Moved this one here 
    #"floor_area", # Grouped and moved this one here 
    "direction_max_wind_speed",
    "direction_peak_wind_speed",
    "february_min_temp",
    "february_avg_temp",
    "february_max_temp",
    "march_min_temp",
    "march_avg_temp",
    "march_max_temp",
    "april_min_temp",
    "april_avg_temp",
    "april_max_temp",
    "may_min_temp",
    "may_avg_temp",
    "may_max_temp",
    "june_min_temp",
    "june_avg_temp",
    "june_max_temp",    
    "september_min_temp",
    "september_avg_temp",
    "september_max_temp",    
    "october_min_temp",
    "october_avg_temp",
    "october_max_temp",
    "november_min_temp",
    "november_avg_temp",
    "november_max_temp",
    "december_min_temp",
    "december_avg_temp",
    "december_max_temp",    
    "days_below_30F",    
    "days_below_10F",
    "days_below_0F",
    "days_above_80F",    
    "days_above_100F",
    "days_above_110F",    
    "ELEVATION", #Try dropping
]

assert df.columns.shape[0] == len(
    numeric_features
    + ordinal_features
    + categorical_features
    + [target]
    + drop_features
)

### Split data for CV

In [None]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_test, y_test = test_df.drop(columns=[target]), test_df[target]

### Column transformation & preprocessors

In [None]:
numeric_transformer = make_pipeline(SimpleImputer(strategy="constant", fill_value=0), StandardScaler())

categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=True),
)

In [None]:
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    ("drop", drop_features),
)

### Check transformed df

In [None]:
X_train_transformed = preprocessor.fit_transform(X_train)

In [None]:
column_names = (
    numeric_features
    + preprocessor.named_transformers_["pipeline-2"]
    .named_steps["onehotencoder"]
    .get_feature_names_out(categorical_features)
    .tolist()
)

X_train_transformed_df = pd.DataFrame(
    X_train_transformed.toarray(), columns=column_names, index=X_train.index
)

X_train_transformed_df.head()

### Dummy regressor as baseline

In [None]:
results = {}
pipe_dummy = DummyRegressor()
results["Dummy"] = mean_std_cross_val_scores(
    pipe_dummy, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T

### Train several models (CV) and retrieve the score

In [None]:
pipe_ridge = make_pipeline(preprocessor, Ridge(random_state=123))

pipe_rf = make_pipeline(
    preprocessor, RandomForestRegressor(random_state=123, n_jobs=-1, max_depth=5)
)

pipe_xgb = make_pipeline(
    preprocessor, XGBRegressor(random_state=123, n_jobs=-1, verbosity=0)
)

pipe_lgbm = make_pipeline(preprocessor, LGBMRegressor(random_state=123))

pipe_catboost = make_pipeline(
    preprocessor, CatBoostRegressor(random_state=123, verbose=0)
)

models = {
    #"Ridge": pipe_ridge, ## high mse
    #"Random Forest": pipe_rf,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "CatBoost": pipe_catboost,
    #"kNN": pipe_kNN,  ## high mse
}

In [None]:
for model_name, model in models.items():
    results[model_name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score=True, scoring=scoring_metrics
    )

In [None]:
pd.DataFrame(results).T

### Feature selection

### Hyperparameter tuning

## AutoML

## Averaging

In [None]:
averaging_model = VotingRegressor(
    list(models.items())
)  # need the list() here for cross-validation to work!

In [None]:
averaging_model.fit(X_train, y_train);

In [None]:
results["Voting"] = mean_std_cross_val_scores(
    averaging_model, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)

In [None]:
pd.DataFrame(results)

## Stacking

In [None]:
stacking_model = StackingRegressor(
    list(models.items())
)  # need the list() here for cross-validation to work!

In [None]:
stacking_model.fit(X_train, y_train);

In [None]:
results["Stacking"] = mean_std_cross_val_scores(
    stacking_model, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)

### Test the selected model

In [None]:
pipe = averaging_model

In [None]:
pipe_fitted = pipe.fit(X_train, y_train)

In [None]:
final_score = pipe.score(X_test, y_test)
final_score

### Generate csv for submission

In [None]:
# Transformation of test set
X_test_submit = pd.merge(X_test_submit, facility_class, on="facility_type")

# Impute energy_star_rating
# X_test_submit = pd.merge(X_test_submit, energy_star_imp, on="facility_class")
# X_test_submit['energy_star_rating'] = X_test_submit['energy_star_rating'].fillna(X_test_submit.pop('energy_star_rating_imp'))








value = X_test_submit["direction_max_wind_speed"]
X_test_submit['dir_max_wind_speed'] = np.where(value > 337.5, "N",
                                            np.where(value > 292.5, "NE",
                                                    np.where(value > 247.5, "E",
                                                             np.where(value > 202.5, "SE",
                                                                      np.where(value > 157.5, "S",
                                                                               np.where(value > 112.5, "SW",
                                                                                        np.where(value > 67.5, "W",
                                                                                                 np.where(value > 22.5, "NW", "N"))))))))

value = X_test_submit["direction_peak_wind_speed"]
X_test_submit['dir_peak_wind_speed'] = np.where(value > 337.5, "N",
                                            np.where(value > 292.5, "NE",
                                                    np.where(value > 247.5, "E",
                                                             np.where(value > 202.5, "SE",
                                                                      np.where(value > 157.5, "S",
                                                                               np.where(value > 112.5, "SW",
                                                                                        np.where(value > 67.5, "W",
                                                                                                 np.where(value > 22.5, "NW", "N"))))))))

#value_floor = X_test_submit["floor_area"]
#X_test_submit['ord_floor_area'] =  np.where(value_floor > 261980, 7,
#                              np.where(value_floor > 148466, 6,
#                                     np.where(value_floor > 105070, 5,
#                                            np.where(value_floor > 80088, 4,
#                                                  np.where(value_floor > 65333, 3,
#                                                         np.where(value_floor > 53250, 2, 1))))))

## Select your submission model

In [None]:
select_model = pipe_xgb

In [None]:
X_final = pd.concat([X_train, X_test],ignore_index=True)
y_final = pd.concat([y_train, y_test],ignore_index=True)
pipe_fitted = select_model.fit(X_final, y_final)

In [None]:
submission = pd.DataFrame({'id': X_test_submit["id"], 'site_eui': select_model.predict(X_test_submit)})
submission.head()

In [None]:
submission.to_csv("submission/test.csv", index=False)

# AutoML prediction