In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from lightgbm.sklearn import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor, export_graphviz


### Define scoring metrics and CV score function

In [2]:
# Scorers passed to cross_validate, keyed by the label shown in the results table.
# The scorer is the negated RMSE, so values closer to zero are better.
scoring_metrics = {"neg RMSE": "neg_root_mean_squared_error"}

In [3]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross-validation and summarise each result column as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn estimator or pipeline
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to ``cross_validate`` (for example ``scoring``
        or ``return_train_score``)

    Returns
    ----------
        pandas Series of strings, one entry per key of the ``cross_validate``
        result, each formatted as ``"mean (+/- std)"`` with three decimals.
        The std is the pandas default (sample standard deviation) across folds.
    """

    fold_scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = fold_scores.mean()
    std_scores = fold_scores.std()

    # Both Series share the same label index (the result keys), so pair the
    # values by iteration. Indexing them with an integer position
    # (``mean_scores[i]``) is deprecated on label-indexed Series in pandas 2.x.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

### Load CSV files

In [6]:
# Paths are relative to the notebook's working directory.
train_path, submit_path = "../data/train.csv", "../data/test.csv"

# Labelled rows used for the train/validation split further down.
df = pd.read_csv(train_path)
# Rows to score for the submission file at the end of the notebook.
X_test_submit = pd.read_csv(submit_path)

### Any manual feature engineering before column transformation

In [None]:
# Lookup table that is merged onto the data by facility_type in the next cell.
# NOTE(review): read from the working directory, unlike train/test which live
# in ../data - confirm the file location.
facility_class = pd.read_csv("f_type.csv")

# Distinct class labels available in the lookup.
pd.unique(facility_class["facility_class"])

In [None]:
# Attach the lookup columns to every training row by facility_type.
# - how="left" keeps rows whose facility_type is absent from the lookup (they
#   get NaN) instead of silently dropping them as the default inner join does.
# - validate="many_to_one" raises if the lookup lists a facility_type twice,
#   which would otherwise silently duplicate training rows.
# - Dropping previously merged lookup columns first makes the cell safe to
#   re-run (no facility_class_x / facility_class_y suffix columns).
lookup_only_cols = facility_class.columns.difference(["facility_type"])
df = df.drop(columns=lookup_only_cols, errors="ignore").merge(
    facility_class, on="facility_type", how="left", validate="many_to_one"
)
df.head(3)

In [None]:
# Row/column count after merging in the facility lookup.
df.shape

In [None]:
# Bucket the two wind-bearing columns (degrees) into 45-degree sector labels.
# Rules are checked top to bottom and the first match wins, so a bearing gets
# the label of the highest threshold it strictly exceeds; anything at or below
# 22.5 falls through to the default "N".
# NOTE(review): missing bearings (NaN) fail every comparison and therefore also
# end up as "N".
# NOTE(review): if the bearings are standard compass degrees (0 = N, 90 = E),
# these labels run counter-clockwise (e.g. 90 -> "W"). The submission cell uses
# the same rules, so change both together if this is corrected.
sector_rules = [
    (337.5, "N"),
    (292.5, "NE"),
    (247.5, "E"),
    (202.5, "SE"),
    (157.5, "S"),
    (112.5, "SW"),
    (67.5, "W"),
    (22.5, "NW"),
]
sector_labels = [label for _, label in sector_rules]

for source_col, sector_col in [
    ("direction_max_wind_speed", "dir_max_wind_speed"),
    ("direction_peak_wind_speed", "dir_peak_wind_speed"),
]:
    value = df[source_col]
    df[sector_col] = np.select(
        [value > lower_bound for lower_bound, _ in sector_rules],
        sector_labels,
        default="N",
    )

In [None]:
# Shape after adding the two dir_*_wind_speed columns.
df.shape

In [None]:
# Sector labels that actually occur for the max-wind-speed direction.
df['dir_max_wind_speed'].unique()

In [None]:
# Sector labels that actually occur for the peak-wind-speed direction.
df['dir_peak_wind_speed'].unique()

In [None]:
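# Sanity check: the mean wind direction in the data is 62 degrees, which lines up with the "NE" label we are getting above.
# NOTE(review): the binning rules above give the 22.5-67.5 degree band the label "NW", so 62 degrees itself maps to "NW" - confirm which is intended.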

### Group columns for transformations

In [None]:
# Column the models predict.
target = "site_eui"

# The 36 monthly temperature columns are named "<month>_<min|avg|max>_temp";
# generate them in calendar order (min, avg, max within each month).
months = [
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
]
monthly_temp_features = [
    f"{month}_{stat}_temp" for month in months for stat in ("min", "avg", "max")
]

# Order matters: the transformed-frame check later in the notebook labels the
# numeric output columns with this list as-is.
numeric_features = [
    "floor_area",
    "year_built",
    "energy_star_rating",  # NaN to 0?
    "ELEVATION",
    *monthly_temp_features,
    "cooling_degree_days",
    "heating_degree_days",
    "precipitation_inches",
    "snowfall_inches",
    "snowdepth_inches",
    "avg_temp",
    "days_below_30F",
    "days_below_20F",
    "days_below_10F",
    "days_below_0F",
    "days_above_80F",
    "days_above_90F",
    "days_above_100F",
    "days_above_110F",
    "max_wind_speed",
    "days_with_fog",
]

ordinal_features = []

categorical_features = [
    "Year_Factor",  # treated as a category rather than a number
    "State_Factor",
    "facility_class",
    "facility_type",
    "dir_max_wind_speed",  # engineered sector label
    "dir_peak_wind_speed",  # engineered sector label
]

drop_features = [
    "id",
    "building_class",
    # Raw bearings are replaced by the dir_* sector labels above.
    "direction_max_wind_speed",
    "direction_peak_wind_speed",
]

# Every column of df must be assigned to exactly one group. Comparing names
# (not just counts) also catches misspelled or duplicated entries.
declared_columns = (
    numeric_features
    + ordinal_features
    + categorical_features
    + [target]
    + drop_features
)
assert len(declared_columns) == len(set(declared_columns)), (
    "a column is listed in more than one feature group"
)
assert set(declared_columns) == set(df.columns), (
    f"listed but not in df: {sorted(set(declared_columns) - set(df.columns))}; "
    f"in df but not listed: {sorted(set(df.columns) - set(declared_columns))}"
)

### Split data for CV

In [None]:
# Hold out 20% of the rows for the final check; the fixed seed keeps the split
# reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)

# Separate the predictors from the target for each partition.
X_train = train_df.drop(columns=[target])
y_train = train_df[target]
X_test = test_df.drop(columns=[target])
y_test = test_df[target]

### Column transformation & preprocessors

In [None]:
# Numeric columns: replace missing values with 0, then standardise.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0),
    StandardScaler(),
)

# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
# The explicit `sparse=True` was removed: sparse output is already the default,
# and the `sparse` keyword was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output`) and later removed, so passing it breaks on newer versions.
# Kept inside a pipeline because the feature-name lookup later in the notebook
# reaches the encoder through the pipeline's "onehotencoder" step.
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
)

In [None]:
# Route each column group to its transformer; drop_features are discarded.
# The transformers are named automatically, and the feature-name lookup later
# in this notebook fetches the categorical pipeline as "pipeline-2", so keep
# the numeric pipeline first and the categorical pipeline second.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    ("drop", drop_features),
)

### Check transformed df

In [None]:
# Fit the preprocessor on the training split only and keep the transformed
# matrix for the inspection cell below.
X_train_transformed = preprocessor.fit_transform(X_train)

In [None]:
# Labels for the transformed columns, in output order: the numeric features
# first, then the one-hot columns reported by the fitted encoder.
fitted_onehot = preprocessor.named_transformers_["pipeline-2"].named_steps[
    "onehotencoder"
]
column_names = (
    numeric_features
    + fitted_onehot.get_feature_names_out(categorical_features).tolist()
)

# The column transformer can hand back a scipy sparse matrix when the combined
# output is sparse enough; pd.DataFrame cannot label that directly, so convert
# it to a dense array first. Dense output passes through untouched.
if hasattr(X_train_transformed, "toarray"):
    transformed_values = X_train_transformed.toarray()
else:
    transformed_values = X_train_transformed

X_train_transformed_df = pd.DataFrame(
    transformed_values, columns=column_names, index=X_train.index
)

X_train_transformed_df.head()

### Dummy regressor as baseline

In [None]:
# Start a fresh results table; each entry is one model's formatted CV summary.
results = {}

# Baseline to beat: a regressor that does not learn from the features.
pipe_dummy = DummyRegressor()
results["Dummy"] = mean_std_cross_val_scores(
    pipe_dummy,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
)

# One row per model, one column per cross-validation output.
pd.DataFrame(results).transpose()

### Train several models (CV) and retrieve the score

In [None]:
# Candidate models. Each one is chained behind the same preprocessor, and all
# use seed 123 so repeated runs give the same scores.
pipe_ridge = make_pipeline(preprocessor, Ridge(random_state=123))

pipe_rf = make_pipeline(
    preprocessor,
    RandomForestRegressor(random_state=123, max_depth=20, n_jobs=-1),
)

pipe_xgb = make_pipeline(
    preprocessor,
    XGBRegressor(random_state=123, n_jobs=-1, verbosity=0),
)

pipe_lgbm = make_pipeline(preprocessor, LGBMRegressor(random_state=123))

pipe_catboost = make_pipeline(
    preprocessor,
    CatBoostRegressor(random_state=123, verbose=0),
)

# Models included in the comparison, in display order.
# pipe_rf is defined above but currently left out of the run.
models = {
    "Ridge": pipe_ridge,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
    "CatBoost": pipe_catboost,
}

# Same cross-validation settings as the dummy baseline so rows are comparable.
cv_options = {"return_train_score": True, "scoring": scoring_metrics}

for model_name, model in models.items():
    results[model_name] = mean_std_cross_val_scores(
        model, X_train, y_train, **cv_options
    )

In [None]:
# Comparison table: one row per model (including the dummy baseline).
pd.DataFrame(results).T

### Feature selection

### Hyperparameter tuning

### Test the selected model

In [None]:
# Model carried forward to the held-out evaluation: the XGBoost pipeline.
pipe = pipe_xgb

In [None]:
# Fit the selected pipeline (preprocessor + regressor) on the training split.
pipe_fitted = pipe.fit(X_train, y_train)

In [None]:
# Evaluate the fitted pipeline on the held-out split.
# `score` on a regression pipeline reports the regressor's default metric
# (R^2), which cannot be compared with the neg-RMSE values in the CV table,
# so the RMSE on the same rows is reported next to it.
final_score = pipe.score(X_test, y_test)
final_rmse = float(np.sqrt(np.mean((y_test - pipe.predict(X_test)) ** 2)))
pd.Series({"R^2": final_score, "RMSE": final_rmse}, name="held-out split")

### Generate csv for submission

In [None]:
# Transformation of test set: repeat the manual feature engineering that was
# applied to the training data.

# Attach the lookup columns by facility_type.
# - how="left" keeps every submission row even when its facility_type is
#   missing from the lookup (the default inner join would silently drop it and
#   leave ids without a prediction).
# - validate="many_to_one" raises if the lookup lists a facility_type twice,
#   which would otherwise duplicate submission rows.
# - Dropping previously merged lookup columns first makes the cell re-runnable.
lookup_only_cols = facility_class.columns.difference(["facility_type"])
X_test_submit = X_test_submit.drop(columns=lookup_only_cols, errors="ignore").merge(
    facility_class, on="facility_type", how="left", validate="many_to_one"
)

# Bucket the wind bearings (degrees) into sector labels with exactly the same
# rules as the training cell: first matching threshold wins, and anything at or
# below 22.5 gets the default "N".
# NOTE(review): missing bearings (NaN) fail every comparison and also become
# "N". Keep these rules in sync with the training cell.
sector_rules = [
    (337.5, "N"),
    (292.5, "NE"),
    (247.5, "E"),
    (202.5, "SE"),
    (157.5, "S"),
    (112.5, "SW"),
    (67.5, "W"),
    (22.5, "NW"),
]
sector_labels = [label for _, label in sector_rules]

for source_col, sector_col in [
    ("direction_max_wind_speed", "dir_max_wind_speed"),
    ("direction_peak_wind_speed", "dir_peak_wind_speed"),
]:
    value = X_test_submit[source_col]
    X_test_submit[sector_col] = np.select(
        [value > lower_bound for lower_bound, _ in sector_rules],
        sector_labels,
        default="N",
    )

In [None]:
# Predict with `pipe`, the model selected and fitted in the test section,
# instead of hard-coding pipe_xgb: if the selection changes, the submission
# follows it rather than using a different (possibly unfitted) pipeline.
submission = pd.DataFrame(
    {"id": X_test_submit["id"], "site_eui": pipe.predict(X_test_submit)}
)
submission.head()

In [None]:
# Write the predictions to the working directory, without the index column.
# NOTE(review): the output shares its file name with the input ../data/test.csv;
# a distinct name would avoid confusion - confirm.
submission.to_csv("test.csv", index=False)