In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
# from lightgbm.sklearn import LGBMRegressor
# from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor, export_graphviz


### Define scoring metrics and CV score function

In [2]:
scoring_metrics = {
    "neg RMSE": "neg_root_mean_squared_error",
}

In [3]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

### Load CSV files

In [4]:
df = pd.read_csv('train_data.csv')
# X_test_submit = pd.read_csv('../data/test.csv')

### Any manual feature engineering before column transformation

In [5]:
temp_averages= ["january_avg_temp","february_avg_temp","march_avg_temp","april_avg_temp","may_avg_temp",
                "june_avg_temp","july_avg_temp","august_avg_temp","september_avg_temp","october_avg_temp","november_avg_temp",
                "december_avg_temp"]
temp_mins= ["january_min_temp","february_min_temp","march_min_temp","april_min_temp","may_min_temp",
                "june_min_temp","july_min_temp","august_min_temp","september_min_temp","october_min_temp","november_min_temp",
                "december_min_temp"]
temp_max= ["january_max_temp","february_max_temp","march_max_temp","april_max_temp","may_max_temp",
                "june_max_temp","july_max_temp","august_max_temp","september_max_temp","october_max_temp","november_max_temp",
                "december_max_temp"]

df["months_above_65"] =(df[temp_averages] >=65).sum(axis=1)
df["months_below_65"] =(df[temp_averages] <65).sum(axis=1)
df["months_min_below_65"] = (df[temp_mins] <65).sum(axis=1)
df["months_min_above_65"] = (df[temp_mins] >=65).sum(axis=1)
df["months_max_below_65"] = (df[temp_max] <65).sum(axis=1)
df["months_max_above_65"] = (df[temp_max] >=65).sum(axis=1)


In [6]:
#Building Grouping 

df.loc[df.facility_type.isin(["Commercial_Unknown",
                            "Commercial_Other",
                            "Mixed_Use_Predominantly_Commercial",
                            "Mixed_Use_Commercial_and_Residential"]), 'b_subtype'] = "commercial_other"

df.loc[df.facility_type.isin(["Data_Center", 
                                "Laboratory"]), 'b_subtype'] = "science"
                                    
df.loc[df.facility_type.isin(["Education_Other_classroom",
                "Education_College_or_university"
                "Education_Uncategorized"
                "Education_Preschool_or_daycare"]), 'b_subtype'] = "education"   


df.loc[df.facility_type.isin(["Public_Assembly_Entertainment_culture",                              
                "Public_Assembly_Drama_theater",
                "Public_Assembly_Social_meeting",
                "Public_Assembly_Recreation",
                "Public_Assembly_Movie_Theater",
                "Public_Assembly_Library",
                "Public_Safety_Uncategorized",
                "Public_Safety_Fire_or_police_station",
                "Public_Assembly_Other",
                "Public_Safety_Penitentiary",
                "Public_Safety_Courthouse",
                "Public_Assembly_Stadium",
                "Public_Assembly_Uncategorized",
                "Religious_worship",
                "Parking_Garage"]), 'b_subtype'] = "public"   

df.loc[df.facility_type.isin(["Warehouse_Distribution_or_Shipping_center",
                "Warehouse_Nonrefrigerated",
                "Warehouse_Selfstorage",
                "Warehouse_Uncategorized",
                "Warehouse_Refrigerated"]), 'b_subtype'] = "warehouse"  

df.loc[df.facility_type.isin(["Retail_Vehicle_dealership_showroom",
            "Retail_Uncategorized",
            "Retail_Strip_shopping_mall",
            "Retail_Enclosed_mall"]), 'b_subtype'] = "retail"  


df.loc[df.facility_type.isin(["Food_Service_Uncategorized",
            "Food_Service_Other",
            "Food_Service_Restaurant_or_cafeteria",
            "Food_Sales",
            "Grocery_store_or_food_market"]), 'b_subtype'] = "food"  

df.loc[df.facility_type.isin(["Nursing_Home",
            "Lodging_Dormitory_or_fraternity_sorority",
            "Lodging_Other",
            "Lodging_Uncategorized",
            "Lodging_Hotel",
            "Mixed_Use_Predominantly_Residential"]), 'b_subtype'] = "lodging" 

df.loc[df.facility_type.isin(["Office_Bank_or_other_financial"
            "Office_Mixed_use",
            "Office_Uncategorized",
            "Office_Medical_non_diagnostic"]), 'b_subtype'] = "office" 

df.loc[df.facility_type.isin(["Service_Vehicle_service_repair_shop",
            "Service_Drycleaning_or_Laundry",
            "Service_Uncategorized"]), 'b_subtype'] = "survice" 

df.loc[df.facility_type.isin(["Health_Care_Outpatient_Uncategorized",
            "Health_Care_Inpatient",
            "Health_Care_Uncategorized",
            "Health_Care_Outpatient_Clinic"]), 'b_subtype'] = "health_care" 


df.loc[df.facility_type == "Industrial", 'b_subtype'] = "industrial" 

df.loc[df.facility_type == "2to4_Unit_Building", 'b_subtype'] = "2to4_unit" 

df.loc[df.facility_type == "5plus_Unit_Building", 'b_subtype'] = "5_unit" 

df.loc[df.facility_type == "Multifamily_Uncategorized", 'b_subtype'] = "multifamily" 

### Group columns for transformations

In [7]:
target = "site_eui"

numeric_features = [
    "Year_Factor",
    "floor_area",
    "year_built",
    "energy_star_rating", # Nan to 0?
    "ELEVATION",
    "january_avg_temp",
    "february_avg_temp",
    "march_avg_temp",
    "april_avg_temp",
    "may_avg_temp",
    "june_avg_temp",
    "july_avg_temp",
    "august_avg_temp",
    "september_avg_temp",
    "october_avg_temp",
    "november_avg_temp",
    "december_avg_temp",
    "cooling_degree_days",
    "heating_degree_days",
    "precipitation_inches",
    "snowdepth_inches",
    "avg_temp",
    "days_below_30F",
    "days_below_20F",
    "days_below_10F",
    "days_below_0F",
    "days_above_80F",
    "days_above_90F",
    "days_above_100F",
    "days_above_110F",
    "max_wind_speed",
    "months_above_65",
    "months_below_65"
]

ordinal_features = []
categorical_features = ["State_Factor",
                        "building_class",
                        'b_subtype']

drop_features = [
    "id",
    "january_min_temp",
    "january_max_temp",
    "february_max_temp",
    "february_min_temp",
    "march_min_temp",
    "march_max_temp",
    "april_min_temp",
    "april_max_temp",
    "may_min_temp",
    "may_max_temp",
    "june_min_temp",
    "june_max_temp",
    "july_min_temp",
    "july_max_temp",
    "august_min_temp",
    "august_max_temp",
    "september_min_temp",
    "september_max_temp",
    "october_min_temp",
    "october_max_temp",
    "november_min_temp",
    "november_max_temp",
    "december_min_temp",
    "december_max_temp",
    "snowfall_inches",
    "direction_peak_wind_speed",
    "direction_max_wind_speed",
    "days_with_fog",
    "facility_type",
    "months_min_below_65",
    "months_min_above_65",
    "months_max_below_65",
    "months_max_above_65"]

assert df.columns.shape[0] == len(
    numeric_features
    + ordinal_features
    + categorical_features
    + [target]
    + drop_features
)

### Split data for CV

In [8]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_test, y_test = test_df.drop(columns=[target]), test_df[target]

### Column transformation & preprocessors

In [9]:
numeric_transformer = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=True),
)

In [10]:
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    ("drop", drop_features),
)

### Check transformed df

In [11]:
X_train_transformed = preprocessor.fit_transform(X_train)

In [12]:
column_names = (
    numeric_features
    + preprocessor.named_transformers_["pipeline-2"]
    .named_steps["onehotencoder"]
    .get_feature_names_out(categorical_features)
    .tolist()
)

X_train_transformed_df = pd.DataFrame(
    X_train_transformed, columns=column_names, index=X_train.index
)

X_train_transformed_df.head()

Unnamed: 0,Year_Factor,floor_area,year_built,energy_star_rating,ELEVATION,january_avg_temp,february_avg_temp,march_avg_temp,april_avg_temp,may_avg_temp,...,b_subtype_industrial,b_subtype_lodging,b_subtype_multifamily,b_subtype_office,b_subtype_public,b_subtype_retail,b_subtype_science,b_subtype_survice,b_subtype_warehouse,b_subtype_nan
69130,1.107399,2.675386,-0.689701,0.0,-0.351574,-0.642124,-0.185602,0.224124,-1.542441,-1.638004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27033,-0.930308,-0.410844,-0.689701,-1.263114,-0.229869,0.109424,-0.184002,-0.651749,-0.320761,-0.338626,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2610,-0.251072,-0.501988,0.073207,0.124062,1.350624,2.973102,2.183968,2.189378,2.527678,-0.090301,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
11958,0.428163,-0.433457,-1.588842,1.121094,2.707715,-1.771235,-2.241353,-1.551884,-2.235285,-1.251078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.288779,0.433997,0.073207,-0.699574,-0.62666,2.315448,1.699886,1.386293,0.686532,-2.46383,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Dummy regressor as baseline

In [13]:
results = {}
pipe_dummy = DummyRegressor()
results["Dummy"] = mean_std_cross_val_scores(
    pipe_dummy, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_neg RMSE,train_neg RMSE
Dummy,0.014 (+/- 0.005),0.001 (+/- 0.002),-58.543 (+/- 1.843),-58.563 (+/- 0.456)


### Train several models (CV) and retrieve the score

In [15]:
pipe_ridge = make_pipeline(preprocessor, Ridge(random_state=123))

pipe_rf = make_pipeline(
    preprocessor, RandomForestRegressor(random_state=123, n_jobs=-1)
)

# pipe_xgb = make_pipeline(
#     preprocessor, XGBRegressor(random_state=123, n_jobs=-1, verbosity=0)
# )

# pipe_lgbm = make_pipeline(preprocessor, LGBMRegressor(random_state=123))

pipe_catboost = make_pipeline(
    preprocessor, CatBoostRegressor(random_state=123, verbose=0)
)

models = {
    # "Ridge": pipe_ridge,
    # "Random Forest": pipe_rf,
    # "XGBoost": pipe_xgb,
    # "LightGBM": pipe_lgbm,
    "CatBoost": pipe_catboost,
}

for model_name, model in models.items():
    results[model_name] = mean_std_cross_val_scores(
        model, X_train, y_train, return_train_score=True, scoring=scoring_metrics
    )

In [17]:
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_neg RMSE,train_neg RMSE
Dummy,0.014 (+/- 0.005),0.001 (+/- 0.002),-58.543 (+/- 1.843),-58.563 (+/- 0.456)
CatBoost,7.788 (+/- 1.172),0.266 (+/- 0.019),-41.998 (+/- 1.598),-35.311 (+/- 0.359)


In [16]:
pipe_catboost

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Year_Factor', 'floor_area',
                                                   'year_built',
                                                   'energy_star_rating',
                                                   'ELEVATION',
                                                   'january_avg_temp',
                                                   'february_avg_temp',
                                                   'march_avg_temp',
                                                   'april_avg_

## Neural Network Framework

In [35]:
import torch
from torch import nn
from torchvision import transforms, datasets, utils
from torch.utils.data import DataLoader, TensorDataset

In [48]:
class energy_model(torch.nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.main = nn.Sequential(
            torch.nn.Linear(input_size, 30),
            nn.LeakyReLU(),
            torch.nn.Linear(30, 20),
            nn.LeakyReLU(),
            torch.nn.Linear(20, 5),
            nn.LeakyReLU(),
            torch.nn.Linear(5, 1)
        )

    def forward(self, x):
        out= self.main(x)
        return out

In [55]:
def trainer(model, optimizer, trainloader, validloader, epochs=5, patience=5, verbose=True):
    """Simple training wrapper for PyTorch network."""
    
    def RMSELoss(yhat,y):
        return torch.sqrt(torch.mean((yhat-y)**2))

    train_loss = []
    valid_loss = []
    train_accuracy = []
    valid_accuracy = []
    
    for epoch in range(epochs):  # for each epoch
        
        train_batch_loss = 0
        train_batch_acc = 0
        valid_batch_loss = 0
        valid_batch_acc = 0
        
        # Training
        for X, y in trainloader:
            optimizer.zero_grad()       # Zero all the gradients w.r.t. parameters
            y_hat = model(X.view(X.shape[0], -1)).flatten()
            loss = RMSE(yhat,y)   # Calculate loss based on output
            loss.backward()             # Calculate gradients w.r.t. parameters
            optimizer.step()            # Update parameters
            train_batch_loss += loss.item()  # Add loss for this batch to running total
            train_batch_acc += (y_hat_labels == y).type(torch.float32).mean().item()   # Average accuracy for this batch
            
        train_loss.append(train_batch_loss / len(trainloader))     # loss = total loss in epoch / number of batches = loss per batch
        train_accuracy.append(train_batch_acc / len(trainloader))  # accuracy
        
        # Validation
        model.eval()  # this turns off those random dropout layers, we don't want them for validation!
        
        with torch.no_grad():  # this stops pytorch doing computational graph stuff under-the-hood and saves memory and time
            for X, y in validloader:
                y_hat = model(X.view(X.shape[0], -1)).flatten()  # Forward pass to get output
                y_hat_labels = torch.sigmoid(y_hat) > 0.5        # convert probabilities to False (0) and True (1)
                loss = RMSE(yhat,y)   # Calculate loss based on output
                valid_batch_loss += loss.item()                  # Add loss for this batch to running total
                valid_batch_acc += (y_hat_labels == y).type(torch.float32).mean().item()   # Average accuracy for this batch
                
        valid_loss.append(valid_batch_loss / len(validloader))
        valid_accuracy.append(valid_batch_acc / len(validloader))  # accuracy
        
        model.train()  # turn back on the dropout layers for the next training loop
        
        # Print progress
        if verbose:
            print(f"Epoch {epoch + 1:3}:",
                  f"Train Loss: {train_loss[-1]:.3f}.",
                  f"Valid Loss: {valid_loss[-1]:.3f}.",
                  f"Train Accuracy: {train_accuracy[-1]:.2f}.",
                  f"Valid Accuracy: {valid_accuracy[-1]:.2f}.")
        
        # Early stopping
        if epoch > 0 and valid_loss[-1] > valid_loss[-2]:
            consec_increases += 1
        else:
            consec_increases = 0
        if consec_increases == patience:
            print(f"Stopped early at epoch {epoch + 1:3}: val loss increased for {consec_increases} consecutive epochs!")
            break
    
    results = {"train_loss": train_loss,
               "valid_loss": valid_loss,
               "train_accuracy": train_accuracy,
               "valid_accuracy": valid_accuracy}
    return results

In [56]:
input_size = len(X_train_transformed_df.columns)-1
model = energy_model(input_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

np.savez('train_data',input=X_train,targets=y_train)
training_data=np.load('train_data.npz')
train_loader = torch.utils.data.DataLoader(training_data, batch_size= 200, shuffle=True)

np.savez('test_data',input=X_test,targets=y_test)
testing_data=np.load('test_data.npz')
valid_loader = torch.utils.data.DataLoader(testing_data, batch_size= 200, shuffle=True)


trainer(model, optimizer, train_loader, valid_loader, epochs=3, patience=3)

KeyError: '1 is not a file in the archive'

### Feature selection

### Hyperparameter tuning

In [None]:
param_grid = {
     "catboost_depth":np.arange(2, 20, 1),
     "catboost_iterations":np.arange(2, 20, 1)
}

random_search = RandomizedSearchCV(
    pipe_catboost,
    param_distributions=param_grid,
    n_jobs=-1,
    n_iter=5,
    cv=5,
    random_state=123,
)
random_search.fit(X_train, y_train)

In [None]:
random_search.best_params_

### Test the selected model

In [14]:
pipe = pipe_xgb

In [15]:
pipe_fitted = pipe.fit(X_train, y_train)

In [16]:
final_score = pipe.score(X_test, y_test)
final_score

0.4981037009181184

### Generate csv for submission

In [23]:
submission = pd.DataFrame({'id': X_test_submit["id"], 'site_eui': pipe_xgb.predict(X_test_submit)})
submission.head()

Unnamed: 0,id,site_eui
0,75757,283.555634
1,75758,230.533096
2,75759,341.535339
3,75760,281.239624
4,75761,297.303864


In [24]:
submission.to_csv("test.csv", index=False)