In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from lightgbm.sklearn import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge, HuberRegressor
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import pickle
from sklearn.ensemble import StackingRegressor

### Define scoring metrics and CV score function

In [2]:
# Scorers handed to cross_validate: display label -> scikit-learn scorer name.
# The scorer is the negated RMSE, so values closer to zero are better
# (e.g. -38.5 means an RMSE of 38.5).
scoring_metrics = {
    "neg RMSE": "neg_root_mean_squared_error",
}

In [3]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross-validation and summarise every returned metric as "mean (+/- std)".

    Parameters
    ----------
    model :
        scikit-learn estimator or pipeline
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to ``cross_validate``
        (e.g. ``scoring``, ``cv``, ``return_train_score``)

    Returns
    ----------
        pandas Series of strings formatted as "mean (+/- std)" with three
        decimals, indexed by the keys that ``cross_validate`` returned
    """

    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the two Series by position with zip instead of `series[i]`:
    # integer lookup on a string-labelled Series is deprecated in pandas 2.x
    # and stops falling back to positional access in later versions.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

### Load CSV files

In [4]:
# Training data (contains the target) and the test set that is scored for the
# submission at the end of the notebook. Paths are relative to the notebook.
df = pd.read_csv('../data/train.csv')
X_test_submit = pd.read_csv('../data/test.csv')

### Any manual feature engineering before column transformation

In [5]:
# Lookup table that assigns each facility_type a facility_class; it is joined
# onto the data on "facility_type" below. Note that this file is read from the
# notebook's own directory, not from ../data.
facility_class = pd.read_csv("f_type.csv")
facility_class["facility_class"].unique()

array(['Retail', 'Warehouse', 'Educational', 'Warehouse_cold', 'Office',
       'Flex_space', 'Commercial', 'Industrial', 'Public_Assembly',
       'Hotel', 'Health_care', 'Services', 'Food_services', 'Residential',
       'Public_safety'], dtype=object)

In [6]:
# Attach facility_class to every building via its facility_type.
# Guarded so that re-running the cell does not merge a second time (which would
# produce facility_class_x / facility_class_y columns).
if "facility_class" not in df.columns:
    n_rows_before = len(df)
    # validate="many_to_one" makes pandas raise if f_type.csv lists a
    # facility_type more than once, which would silently duplicate buildings.
    df = pd.merge(df, facility_class, on="facility_type", validate="many_to_one")
    # The default inner join drops buildings whose facility_type has no mapping.
    assert len(df) == n_rows_before, "some facility_type values are missing from f_type.csv"

df.head(3)

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id,facility_class
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.5,...,0,0,0,1.0,1.0,1.0,,248.682615,0,Retail
1,1,State_1,Commercial,Grocery_store_or_food_market,67346.0,1967.0,26.0,1.8,36,50.5,...,0,0,0,1.0,,1.0,12.0,287.863448,24,Retail
2,1,State_1,Commercial,Grocery_store_or_food_market,124196.0,1954.0,44.0,1.8,36,50.5,...,0,0,0,1.0,,1.0,12.0,241.932986,25,Retail


In [7]:
df.shape

(75757, 65)

In [8]:
# Bucket both wind-direction columns into 45-degree sector labels.
# Lower bounds are tested from the highest down and the first match wins;
# anything <= 22.5 and any missing value (NaN compares False) falls to "N".
# NOTE(review): the labels advance N -> NW -> W ... as the value grows, i.e.
# 330 is labelled "NE"; that is mirrored relative to a clockwise compass
# bearing, and missing readings are indistinguishable from north. Confirm the
# intended convention. The labels must stay identical to the ones produced for
# the submission set, so change both places together.
sector_lower_bounds = [337.5, 292.5, 247.5, 202.5, 157.5, 112.5, 67.5, 22.5]
sector_labels = ["N", "NE", "E", "SE", "S", "SW", "W", "NW"]

for source_column, label_column in [
    ("direction_max_wind_speed", "dir_max_wind_speed"),
    ("direction_peak_wind_speed", "dir_peak_wind_speed"),
]:
    value = df[source_column]
    df[label_column] = np.select(
        [(value > bound).to_numpy() for bound in sector_lower_bounds],
        sector_labels,
        default="N",
    )

In [9]:
df.shape

(75757, 67)

In [10]:
df['dir_max_wind_speed'].unique()

array(['N', 'E', 'NE'], dtype=object)

In [11]:
df['dir_peak_wind_speed'].unique()

array(['N', 'NE', 'E'], dtype=object)

In [12]:
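# Checking the data, I found that the mean wind direction is about 62 degrees.
# NOTE(review): with the thresholds in the cell above, 62 degrees is labelled "NW"
# and "NE" covers 292.5-337.5 degrees, so this mean does not by itself explain the
# "NE" labels shown above. Missing directions are also labelled "N". Re-check the
# sector mapping before relying on these features.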

In [13]:
# The following merges the imputed energy_star_rating
# This was done in R
# df.to_csv("../data/train_facility.csv") ## export for R MICE imputation
# energy_star_imp = pd.read_csv("energy_star_imp.csv")
# df = pd.merge(df, energy_star_imp, on="facility_class")
# df['energy_star_rating'] = df['energy_star_rating'].fillna(df.pop('energy_star_rating_imp'))

### Group columns for transformations

In [14]:
# Column the models predict.
target = "site_eui"

# Numeric predictors; they go through numeric_transformer in the preprocessor.
# Columns that were tried here and removed (year_built, ELEVATION, the other
# monthly temperatures, avg_temp, further days_below_* / days_above_* counts and
# the raw wind / fog columns) are listed in year_features or drop_features below.
numeric_features = [
    "floor_area",
    "Year_Factor",  # treated as a numeric feature
    "energy_star_rating",  # missing values are filled by the KNN regressor further down
    "january_min_temp",
    "january_avg_temp",
    "january_max_temp",
    # only January and July are kept; the other months were removed as similar
    "july_min_temp",
    "july_avg_temp",
    "july_max_temp",
    "cooling_degree_days",
    "heating_degree_days",
    "precipitation_inches",
    "snowfall_inches",
    "snowdepth_inches",
    "days_below_20F",
    "days_above_90F",
]

# year_built gets its own transformer because its missing values are filled
# with a different constant than the other numeric columns.
year_features = ["year_built"]
# No ordinal features at the moment; kept so the coverage check stays generic.
ordinal_features = []
# One-hot encoded predictors.
categorical_features = [

    "State_Factor",
    "facility_class",
    "facility_type",
    "dir_max_wind_speed",  # engineered sector label of direction_max_wind_speed
    "dir_peak_wind_speed",  # engineered sector label of direction_peak_wind_speed
]

# Columns passed to the preprocessor's "drop" step.
drop_features = [
    "id",
    "building_class",  # dropped rather than encoded
    "ELEVATION",    
    "direction_max_wind_speed",  # replaced by dir_max_wind_speed
    "direction_peak_wind_speed",  # replaced by dir_peak_wind_speed
    "february_min_temp",
    "february_avg_temp",
    "february_max_temp",
    "march_min_temp",
    "march_avg_temp",
    "march_max_temp",
    "april_min_temp",
    "april_avg_temp",
    "april_max_temp",
    "may_min_temp",
    "may_avg_temp",
    "may_max_temp",
    "june_min_temp",
    "june_avg_temp",
    "june_max_temp",
    "august_min_temp",
    "august_avg_temp",
    "august_max_temp",
    "september_min_temp",
    "september_avg_temp",
    "september_max_temp",
    "october_min_temp",
    "october_avg_temp",
    "october_max_temp",
    "november_min_temp",
    "november_avg_temp",
    "november_max_temp",
    "december_min_temp",
    "december_avg_temp",
    "december_max_temp",
    "avg_temp",    
    "days_below_30F",
    "days_below_10F",
    "days_below_0F",
    "days_above_80F",
    "days_above_100F",
    "days_above_110F",
    "max_wind_speed",
    "days_with_fog",
]

# Every column of df must belong to exactly one feature group (or be the target).
# Names are compared, not just counts, so a typo combined with a duplicate entry
# cannot slip through with an unchanged total.
assigned_columns = (
    numeric_features
    + year_features
    + ordinal_features
    + categorical_features
    + [target]
    + drop_features
)
assert len(assigned_columns) == len(set(assigned_columns)), (
    "a column is listed in more than one feature group"
)
assert set(assigned_columns) == set(df.columns), (
    "feature groups and df columns differ: "
    f"{sorted(set(assigned_columns) ^ set(df.columns))}"
)

### Split data for CV

In [15]:
# Hold out 20% of the rows as a validation set (fixed seed), then separate the
# target column from the predictors in each part.
train_df, test_df = train_test_split(df, random_state=123, test_size=0.2)

X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_test = test_df.drop(columns=[target])
y_test = test_df[target]

In [16]:
...

Ellipsis

### KNN regression for energy star rating

In [17]:
# Training rows with a known energy_star_rating are the labelled data for the
# KNN regressor that imputes the missing ratings.
has_energy_rating = X_train["energy_star_rating"].notna()
train_df_energy_notna = X_train[has_energy_rating]

# for training the model
train_df_energy, test_df_energy = train_test_split(
    train_df_energy_notna, random_state=123, test_size=0.2
)
energy_predictors = ["floor_area", "facility_class", "year_built"]
X_train_energy = train_df_energy[energy_predictors]
y_train_energy = train_df_energy["energy_star_rating"]
X_test_energy = test_df_energy[energy_predictors]
y_test_energy = test_df_energy["energy_star_rating"]


# for imputation from the KNN predictions later
train_df_energy_isna = X_train[X_train["energy_star_rating"].isna()]
test_df_energy_isna = X_test[X_test["energy_star_rating"].isna()]

In [18]:
# Preprocessing for the KNN imputer: fill missing values with a constant, then
# standardise so that no single column dominates the distance computation.
# NOTE(review): 0 (floor_area) and 1840 (year_built) are sentinel fill values —
# confirm they are intended rather than e.g. the median.
numeric_transformer_energy = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0), StandardScaler()
)
year_transformer_energy = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=1840), StandardScaler()
)

# Sparse output is OneHotEncoder's default. The explicit `sparse=True` keyword
# was deprecated in scikit-learn 1.2 (renamed `sparse_output`) and removed in
# 1.4, so it is omitted to keep the cell working across versions.
categorical_transformer_energy = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
)

preprocessor_energy = make_column_transformer(
    (numeric_transformer_energy, ["floor_area"]),
    (year_transformer_energy, ["year_built"]),
    (categorical_transformer_energy, ["facility_class"]),
)

### KNN Hyperparameter tuning

In [19]:
# for i in np.arange(1,10):

#     pipe_knn_energy = make_pipeline(
#         preprocessor_energy, KNeighborsRegressor(n_neighbors=i,n_jobs=-1)
#     )

#     results_energy["knn_"+str(i)] = mean_std_cross_val_scores(
#         pipe_knn_energy, X_train_energy, y_train_energy, return_train_score=True, scoring=scoring_metrics
#     )

In [20]:
# pd.DataFrame(results_energy).T

### Run if retraining is required

In [21]:
# KNN imputer: energy-specific preprocessing followed by a 3-neighbour regressor
# that uses all available cores.
knn_energy_regressor = KNeighborsRegressor(n_neighbors=3, n_jobs=-1)
pipe_knn_energy = make_pipeline(preprocessor_energy, knn_energy_regressor)

In [22]:
# Fit the KNN imputer on the training rows that have a known energy_star_rating.
pipe_knn_fitted = pipe_knn_energy.fit(X_train_energy, y_train_energy)

### Save the trained KNN regression model

In [23]:
# Save the fitted KNN pipeline so that later sessions can reload it.
# Failures are reported but do not stop the notebook; be aware that the load
# cell below would then read whatever older file is on disk.
filename = "pipe_knn_fitted.p"
try:
    # The context manager closes the file even if pickling fails part-way.
    with open(filename, "wb") as outfile:
        pickle.dump(pipe_knn_fitted, outfile)

    print("")
    print("Pipe is dumped successfully")

except Exception as error:
    print("Error message: %s" % error)
    print("Error while saving pipe")


Pipe is dumped successfully


### Load the trained KNN regression model

In [24]:
# Load the fitted KNN pipeline back from disk. The context manager closes the
# file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code — only load files that this
# notebook wrote itself.
with open('pipe_knn_fitted.p', 'rb') as file:
    pipe_knn_fitted = pickle.load(file)

### Test KNN model RMSE

In [25]:
# RMSE of the KNN imputer on the held-out rows whose rating is known.
y_pred = pipe_knn_fitted.predict(X_test_energy)
knn_mse = mean_squared_error(y_test_energy, y_pred)
knn_score = np.sqrt(knn_mse)
knn_score

25.4962305693059

### Impute X_train

In [26]:
# Predictor columns of the training rows whose energy_star_rating is missing.
knn_input_columns = ["floor_area", "facility_class", "year_built"]
X_train_df_na = train_df_energy_isna.loc[:, knn_input_columns]
X_train_df_na.head(3)

Unnamed: 0,floor_area,facility_class,year_built
27033,137704.0,Educational,
19870,160000.0,Commercial,1937.0
67251,219649.0,Residential,1993.0


In [27]:
# Predicted ratings for the training rows with a missing energy_star_rating.
# Note: `y_pred` is reused for several different predictions in this notebook.
y_pred = pipe_knn_fitted.predict(X_train_df_na)

In [28]:
# Wrap the predictions in a one-column frame that keeps the original row labels,
# so they can be aligned with X_train when filling.
y_pred_df = pd.Series(
    y_pred, index=X_train_df_na.index, name="energy_star_rating"
).to_frame()
y_pred_df.head()

Unnamed: 0,energy_star_rating
27033,28.666667
19870,68.666667
67251,71.666667
39610,41.0
19922,92.666667


In [29]:
# fillna with a DataFrame aligns on row and column labels, so only the missing
# energy_star_rating cells receive a prediction; NaNs in other columns
# (e.g. year_built) stay as they are. The frame is modified in place, so this
# cell is not a no-op if X_train is rebuilt and the cell is skipped.
X_train.fillna(value = y_pred_df, inplace = True)
X_train

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,id,facility_class,dir_max_wind_speed,dir_peak_wind_speed
69130,6,State_6,Residential,Multifamily_Uncategorized,108000.0,1923.0,47.000000,42.7,11,34.451613,...,0,0,,,,131.0,62084,Residential,N,N
27033,3,State_1,Commercial,Education_Uncategorized,137704.0,,28.666667,169.2,39,55.822581,...,0,0,,,,,1040,Educational,N,N
2610,5,State_6,Commercial,Education_Other_classroom,158421.0,1962.0,85.000000,15.2,8,29.854839,...,0,0,,,,,43934,Educational,N,N
11958,1,State_6,Commercial,Office_Uncategorized,101004.0,1906.0,86.000000,42.7,6,29.677419,...,1,0,1.0,1.0,1.0,12.0,15474,Office,N,N
1,1,State_1,Commercial,Grocery_store_or_food_market,67346.0,1967.0,26.000000,1.8,36,50.500000,...,0,0,1.0,,1.0,12.0,24,Retail,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63206,5,State_6,Residential,Multifamily_Uncategorized,67320.0,1939.0,22.000000,25.6,8,29.854839,...,0,0,330.0,340.0,18.3,148.0,52109,Residential,NE,N
61404,5,State_6,Residential,Multifamily_Uncategorized,249756.0,1926.0,81.000000,25.6,8,29.854839,...,0,0,330.0,340.0,18.3,148.0,50115,Residential,NE,N
17730,6,State_6,Commercial,Office_Uncategorized,118134.0,1930.0,41.000000,42.7,11,34.451613,...,0,0,,,,131.0,60468,Office,N,N
28030,5,State_11,Commercial,Health_Care_Inpatient,401194.0,1966.0,88.000000,18.3,28,45.419355,...,0,0,,,,,72468,Health_care,N,N


### Impute X_test (our validation set)

In [30]:
# Predictor columns of the validation rows whose energy_star_rating is missing.
knn_input_columns = ["floor_area", "facility_class", "year_built"]
X_test_df_na = test_df_energy_isna.loc[:, knn_input_columns]
X_test_df_na.head(3)

Unnamed: 0,floor_area,facility_class,year_built
22778,67644.0,Educational,2000.0
41574,109938.0,Residential,1988.0
36069,57296.0,Residential,


In [31]:
# Predicted ratings for the validation rows with a missing energy_star_rating,
# using the imputer that was fitted on training rows only.
y_pred = pipe_knn_fitted.predict(X_test_df_na)

In [32]:
# One-column frame of predictions carrying the validation rows' labels.
y_pred_df = pd.Series(
    y_pred, index=X_test_df_na.index, name="energy_star_rating"
).to_frame()
y_pred_df.head(3)

Unnamed: 0,energy_star_rating
22778,78.0
41574,69.0
36069,59.333333


In [33]:
# Label-aligned fill: only the missing energy_star_rating cells of the
# validation set are replaced by the KNN predictions (in place).
X_test.fillna(value = y_pred_df, inplace = True)
X_test

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,id,facility_class,dir_max_wind_speed,dir_peak_wind_speed
22778,5,State_8,Commercial,Education_College_or_university,67644.0,2000.0,78.000000,18.3,3,24.016129,...,0,0,,,,,67685,Educational,N,N
57236,5,State_6,Residential,Multifamily_Uncategorized,158063.0,2010.0,32.000000,15.2,8,29.854839,...,0,0,,,,,43949,Residential,N,N
696,4,State_6,Commercial,Warehouse_Distribution_or_Shipping_center,104000.0,1926.0,53.000000,42.7,4,28.596774,...,0,0,1.0,1.0,1.0,,40586,Warehouse,N,N
63958,6,State_6,Residential,Multifamily_Uncategorized,64386.0,1999.0,36.000000,3.4,11,34.451613,...,0,0,1.0,1.0,1.0,,53091,Residential,N,N
28363,3,State_6,Commercial,Religious_worship,61488.0,2003.0,9.000000,11.9,11,35.080645,...,0,0,,,,,26278,Public_Assembly,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31749,5,State_11,Residential,5plus_Unit_Building,59352.0,2004.0,30.000000,18.3,28,45.419355,...,0,0,,,,,71023,Residential,N,N
69089,6,State_6,Residential,Multifamily_Uncategorized,69656.0,1910.0,54.000000,42.7,11,34.451613,...,0,0,,,,131.0,62003,Residential,N,N
74775,6,State_11,Residential,2to4_Unit_Building,34880.0,1987.0,90.000000,26.5,28,43.451613,...,0,0,,,,,74008,Residential,N,N
19381,5,State_1,Commercial,Data_Center,190190.0,2000.0,99.000000,9.1,29,51.387097,...,10,0,,,,,4195,Flex_space,N,N


### End of Knn

In [34]:
...

Ellipsis

### Column transformation & preprocessors

In [35]:
# Per-group transformers for the main models: constant imputation followed by
# standardisation for numeric columns, one-hot encoding for categorical ones.
# NOTE(review): 0 and 1840 are sentinel fill values for missing numeric values
# and missing year_built respectively — confirm they are intended.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=0), StandardScaler()
)
year_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=1840), StandardScaler()
)
# Kept as a one-step pipeline because the fitted encoder is later looked up
# through the pipeline's "onehotencoder" step. Sparse output is the encoder's
# default; the explicit `sparse=True` keyword was deprecated in scikit-learn 1.2
# (renamed `sparse_output`) and removed in 1.4, so it is omitted.
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
)

In [36]:
# Route each feature group to its transformer; drop_features are discarded.
# The transformers get automatic names, and the column-name lookup further down
# reads the categorical one as "pipeline-3" — keep this order (or update that
# lookup) when adding or removing entries.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (year_transformer, year_features),    
    (categorical_transformer, categorical_features),
    ("drop", drop_features),
)

### Check transformed df

In [37]:
# Fit the preprocessor on the training predictors to inspect its output.
# This fits the same `preprocessor` object that the model pipelines are built from.
X_train_transformed = preprocessor.fit_transform(X_train)

In [38]:
# Rebuild readable column names in the order the preprocessor emits them:
# numeric columns, year_built, then the one-hot columns generated by the
# fitted encoder inside the categorical pipeline.
fitted_categorical_pipeline = preprocessor.named_transformers_["pipeline-3"]
fitted_onehot = fitted_categorical_pipeline.named_steps["onehotencoder"]
onehot_column_names = fitted_onehot.get_feature_names_out(categorical_features).tolist()

column_names = numeric_features + year_features + onehot_column_names

# The transformed matrix is sparse here, hence the conversion to a dense array.
X_train_transformed_df = pd.DataFrame(
    data=X_train_transformed.toarray(),
    index=X_train.index,
    columns=column_names,
)

X_train_transformed_df.head()

Unnamed: 0,floor_area,Year_Factor,energy_star_rating,january_min_temp,january_avg_temp,january_max_temp,july_min_temp,july_avg_temp,july_max_temp,cooling_degree_days,...,facility_type_Warehouse_Nonrefrigerated,facility_type_Warehouse_Refrigerated,facility_type_Warehouse_Selfstorage,facility_type_Warehouse_Uncategorized,dir_max_wind_speed_E,dir_max_wind_speed_N,dir_max_wind_speed_NE,dir_peak_wind_speed_E,dir_peak_wind_speed_N,dir_peak_wind_speed_NE
69130,-0.232267,1.11016,-0.505423,-0.044973,0.021055,-0.009842,0.580968,0.508076,0.110769,0.730937,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
27033,-0.112455,-0.929984,-1.191532,2.940822,3.076356,3.901936,0.580968,-1.546656,-3.739477,-1.117188,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2610,-0.028893,0.430112,0.916695,-0.36488,-0.636123,-0.568667,0.820453,0.535682,0.110769,0.965781,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
11958,-0.260485,-2.29008,0.954119,-0.578151,-0.661488,-1.127493,0.580968,0.890627,2.035892,0.327617,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.396245,-2.29008,-1.29133,2.620915,2.315413,1.666635,-2.053367,-3.392365,-2.776916,-2.776415,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


### Dummy regressor as baseline

In [39]:
# Start a fresh results table and record the baseline that every real model has
# to beat: a DummyRegressor with default settings.
results = {}
pipe_dummy = DummyRegressor()
dummy_cv_summary = mean_std_cross_val_scores(
    pipe_dummy,
    X_train,
    y_train,
    scoring=scoring_metrics,
    return_train_score=True,
)
results["Dummy"] = dummy_cv_summary
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_neg RMSE,train_neg RMSE
Dummy,0.005 (+/- 0.001),0.000 (+/- 0.000),-58.681 (+/- 2.337),-58.716 (+/- 0.590)


### Train several models (CV) and retrieve the score

In [69]:
# Candidate models, each wrapped in a pipeline that starts with the shared
# preprocessing step. All of them are built from the very same `preprocessor`
# object, so fitting one of these pipelines directly also refits the
# transformer held by the others.
# NOTE(review): wrap the preprocessor in sklearn.base.clone if the pipelines
# are ever fitted on different data.
pipe_ridge = make_pipeline(preprocessor, Ridge(random_state=123))

pipe_rf = make_pipeline(
    preprocessor, RandomForestRegressor(n_estimators = 300, random_state=123, n_jobs=-1)
)

pipe_xgb = make_pipeline(
    preprocessor, XGBRegressor(random_state=123, n_jobs=-1, verbosity=0, n_estimators=1000)
)

# The commented keyword arguments are regularisation / subsampling settings that
# were tried and switched off; only the seed and the tree count are active.
pipe_lgbm = make_pipeline(preprocessor, LGBMRegressor(#num_leaves = 32, min_child_samples=100, 
                                                      random_state=123,
                                                      #feature_fraction = 0.9,
                                                      #lambda_l1 = 10, lambda_l2 = 10,
                                                      #bagging_freq=1,
                                                      #bagging_fraction=0.9,
                                                      #verbose = 0
                                                      n_estimators=1000
                                                     ))

pipe_catboost = make_pipeline(
    preprocessor, CatBoostRegressor(random_state=123, verbose=0)
)

# Models to cross-validate; comment entries in or out to choose.
# `pipe_kNN` is not defined anywhere in this notebook, so that entry cannot
# simply be re-enabled.
models = {
    #"Ridge": pipe_ridge,
    #"Random Forest": pipe_rf,
    "XGBoost": pipe_xgb,
#    "LightGBM": pipe_lgbm,
    #"CatBoost": pipe_catboost,
    #"kNN": pipe_kNN,
}

# The 10-fold CV loop is currently disabled, so a fresh run will not add the
# model rows to `results`; uncomment it to reproduce the scores table.
#for model_name, model in models.items():
#    results[model_name] = mean_std_cross_val_scores(
#        model, X_train, y_train, return_train_score=True, scoring=scoring_metrics, cv = 10
#    )

In [62]:
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_neg RMSE,train_neg RMSE
Dummy,0.005 (+/- 0.001),0.000 (+/- 0.000),-58.681 (+/- 2.337),-58.716 (+/- 0.590)
XGBoost,21.969 (+/- 0.565),0.058 (+/- 0.001),-38.467 (+/- 2.708),-13.970 (+/- 0.093)


### Feature selection

### Hyperparameter tuning

In [42]:
# param_grid_xgb = {
#      "xgbregressor__max_depth": np.linspace(start=1, stop=20, num=5, dtype=int),
#      "xgbregressor__min_child_weight": np.linspace(start=1, stop=20, num=5, dtype=int),
#      "xgbregressor__subsample": np.logspace(-3, 0, 10),
#      "xgbregressor__colsample_bytree": np.logspace(-3, 0, 10),
#      "xgbregressor__eta": np.logspace(-3, 0, 10)
# }


In [43]:
# xgb_search_rf = RandomizedSearchCV(
#     pipe_xgb,
#     param_grid_xgb,
#     n_jobs=-1,
#     n_iter=100,
#     scoring="neg_root_mean_squared_error",
#     cv=5,
#     random_state=123,
#     return_train_score=True,
# )

# xgb_search_rf.fit(X_train, y_train);

In [70]:
# Base learners of the stacked ensemble, keyed by the name each one carries
# inside the StackingRegressor. Ridge and CatBoost are currently left out.
ensemble_regressor = {
    "Random Forest": pipe_rf,
    "XGBoost": pipe_xgb,
    "LightGBM": pipe_lgbm,
}

named_base_learners = [
    (learner_name, learner) for learner_name, learner in ensemble_regressor.items()
]
stacking_model = StackingRegressor(estimators=named_base_learners)

### Test the selected model #1

In [71]:
# Candidate #1: the stacked ensemble (alias to the same object, not a copy).
pipe = stacking_model

In [None]:
# Fit the stacked ensemble on the full training split. This cell has no
# execution count in the saved notebook, so it must be run before any cell
# that predicts with `pipe` / `stacking_model`.
pipe_fitted = pipe.fit(X_train, y_train)

In [None]:
# Validation RMSE of the stacked ensemble.
y_pred = pipe.predict(X_test)
validation_mse = mean_squared_error(y_test, y_pred)
final_score = np.sqrt(validation_mse)
final_score

### Test the tuned model #2

In [63]:
# Candidate #2: the XGBoost pipeline as defined above (alias, not a copy; the
# hyperparameter search cells are commented out, so no tuned parameters apply).
pipe2 = pipe_xgb

In [64]:
# Fit the XGBoost pipeline on the full training split.
pipe2_fitted = pipe2.fit(X_train, y_train)

In [65]:
# Validation RMSE of the XGBoost pipeline (overwrites the previous final_score).
y_pred = pipe2.predict(X_test)
validation_mse = mean_squared_error(y_test, y_pred)
final_score = np.sqrt(validation_mse)
final_score

33.475003391840346

### Generate csv for submission

In [51]:
# Transformation of test set: repeat the manual feature engineering that was
# applied to the training data.
# Guarded so that re-running the cell does not merge facility_class twice.
if "facility_class" not in X_test_submit.columns:
    # how="left" keeps every test row (and its id) even when its facility_type
    # is absent from f_type.csv; the default inner join would silently drop such
    # rows from the submission. facility_class is NaN for those rows.
    # validate="many_to_one" raises if the lookup table would duplicate rows.
    X_test_submit = pd.merge(
        X_test_submit,
        facility_class,
        on="facility_type",
        how="left",
        validate="many_to_one",
    )

# Impute energy_star_rating with the fitted KNN pipeline
X_test_submit_energy_isna = X_test_submit[X_test_submit["energy_star_rating"].isna()]
X_test_submit_na = X_test_submit_energy_isna[["floor_area", "facility_class", "year_built"]]
# Nothing to impute (e.g. on a re-run): skip, since predicting on an empty
# frame raises.
if not X_test_submit_na.empty:
    y_pred = pipe_knn_fitted.predict(X_test_submit_na)
    y_pred_df = pd.DataFrame({"energy_star_rating": y_pred}, index = X_test_submit_na.index)
    # Label-aligned fill: only missing energy_star_rating cells are replaced.
    X_test_submit.fillna(value = y_pred_df, inplace = True)

# Wind-direction sector labels. Thresholds and labels are exactly the ones used
# for the training data (first matching lower bound wins; values <= 22.5 and
# missing values fall to "N"); they must stay identical to the training labels
# or the one-hot encoder will treat them as unknown categories.
sector_lower_bounds = [337.5, 292.5, 247.5, 202.5, 157.5, 112.5, 67.5, 22.5]
sector_labels = ["N", "NE", "E", "SE", "S", "SW", "W", "NW"]

for source_column, label_column in [
    ("direction_max_wind_speed", "dir_max_wind_speed"),
    ("direction_peak_wind_speed", "dir_peak_wind_speed"),
]:
    value = X_test_submit[source_column]
    X_test_submit[label_column] = np.select(
        [(value > bound).to_numpy() for bound in sector_lower_bounds],
        sector_labels,
        default="N",
    )

In [52]:
# Submission #1: predictions of the stacked ensemble, paired with the ids of the
# same rows. `stacking_model` must already be fitted when this cell runs.
select_model = stacking_model
submission_ids = X_test_submit["id"]
stacking_predictions = select_model.predict(X_test_submit)
submission = pd.DataFrame({'id': submission_ids, 'site_eui': stacking_predictions})
submission.head()

Unnamed: 0,id,site_eui
0,75757,216.196404
1,75758,187.311307
2,75759,155.403244
3,75760,249.944963
4,75761,242.882366


In [53]:
# Written to the notebook's directory; the name matches the input file
# ../data/test.csv but this is the stacked-ensemble submission.
submission.to_csv("test.csv", index=False)

In [66]:
# Submission #2: predictions of the fitted XGBoost pipeline for the same rows.
select_model2 = pipe2
submission2_ids = X_test_submit["id"]
xgb_predictions = select_model2.predict(X_test_submit)
submission2 = pd.DataFrame({'id': submission2_ids, 'site_eui': xgb_predictions})
submission2.head()

Unnamed: 0,id,site_eui
0,75757,192.998795
1,75758,209.167557
2,75759,155.271072
3,75760,250.166611
4,75761,231.551239


In [67]:
# XGBoost submission, written to the notebook's directory.
submission2.to_csv("test2.csv", index=False)