In [102]:
import numpy as np
import pandas as pd

from torch.utils.data import DataLoader
import torch.utils.data as data_utils
import torch
from torch import nn
from torchvision import transforms, datasets, utils
from torch.utils.data import DataLoader, TensorDataset

import sklearn
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

### Define scoring metrics and CV score function

### Load CSV files

In [103]:
df = pd.read_csv('train_data.csv')

### Any manual feature engineering before column transformation

In [104]:
temp_averages= ["january_avg_temp","february_avg_temp","march_avg_temp","april_avg_temp","may_avg_temp",
                "june_avg_temp","july_avg_temp","august_avg_temp","september_avg_temp","october_avg_temp","november_avg_temp",
                "december_avg_temp"]
temp_mins= ["january_min_temp","february_min_temp","march_min_temp","april_min_temp","may_min_temp",
                "june_min_temp","july_min_temp","august_min_temp","september_min_temp","october_min_temp","november_min_temp",
                "december_min_temp"]
temp_max= ["january_max_temp","february_max_temp","march_max_temp","april_max_temp","may_max_temp",
                "june_max_temp","july_max_temp","august_max_temp","september_max_temp","october_max_temp","november_max_temp",
                "december_max_temp"]

df["months_above_65"] =(df[temp_averages] >=65).sum(axis=1)
df["months_below_65"] =(df[temp_averages] <65).sum(axis=1)
df["months_min_below_65"] = (df[temp_mins] <65).sum(axis=1)
df["months_min_above_65"] = (df[temp_mins] >=65).sum(axis=1)
df["months_max_below_65"] = (df[temp_max] <65).sum(axis=1)
df["months_max_above_65"] = (df[temp_max] >=65).sum(axis=1)


In [105]:
facility_class = pd.read_csv("f_type.csv")
df = pd.merge(df, facility_class, on="facility_type")

In [106]:
# value = df["direction_max_wind_speed"]
# df['dir_max_wind_speed'] = np.where(value > 337.5, "N",
#                                 np.where(value > 292.5, "NE",
#                                         np.where(value > 247.5, "E",
#                                                  np.where(value > 202.5, "SE",
#                                                           np.where(value > 157.5, "S",
#                                                                    np.where(value > 112.5, "SW",
#                                                                             np.where(value > 67.5, "W",
#                                                                                      np.where(value > 22.5, "NW", "N"))))))))

# value = df["direction_peak_wind_speed"]
# df['dir_peak_wind_speed'] = np.where(value > 337.5, "N",
#                                 np.where(value > 292.5, "NE",
#                                         np.where(value > 247.5, "E",
#                                                  np.where(value > 202.5, "SE",
#                                                           np.where(value > 157.5, "S",
#                                                                    np.where(value > 112.5, "SW",
#                                                                             np.where(value > 67.5, "W",
#                                                                                      np.where(value > 22.5, "NW", "N"))))))))

### Group columns for transformations

In [107]:
target = "site_eui"

numeric_features = [
    "floor_area",
    "year_built",
    "energy_star_rating",
    "cooling_degree_days",
    "heating_degree_days",
    "precipitation_inches",
    "snowdepth_inches",
    "avg_temp",
    "january_avg_temp",
    "february_avg_temp",
    "march_avg_temp",
    "april_avg_temp",
    "may_avg_temp",
    "june_avg_temp",
    "july_avg_temp",
    "august_avg_temp",
    "september_avg_temp",
    "october_avg_temp",
    "november_avg_temp",
    "december_avg_temp",
    "days_below_30F",
    "days_above_80F",
    "max_wind_speed",
    "months_above_65",
    "months_below_65",
    "months_min_below_65",
    "months_min_above_65",
    "months_max_below_65",
    "months_max_above_65",
    "snowfall_inches",
    "january_min_temp",
    "january_max_temp",
    "february_max_temp",
    "february_min_temp",
    "march_min_temp",
    "march_max_temp",
    "april_min_temp",
    "april_max_temp",
    "may_min_temp",
    "may_max_temp",
    "june_min_temp",
    "june_max_temp",
    "july_min_temp",
    "july_max_temp",
    "august_min_temp",
    "august_max_temp",
    "september_min_temp",
    "september_max_temp",
    "october_min_temp",
    "october_max_temp",
    "november_min_temp",
    "november_max_temp",
    "december_min_temp",
    "december_max_temp",
    "ELEVATION",
    "days_below_20F",
    "days_below_10F",
    "days_below_0F",
    "days_above_90F",
    "days_above_100F",
    "days_above_110F"
]

ordinal_features = []
categorical_features = ["State_Factor",
                        "building_class",
                        'facility_class']

drop_features = [
    "id",
    "days_with_fog",
    "facility_type",
    "direction_max_wind_speed",
    "direction_peak_wind_speed",
    "Year_Factor",
]

assert df.columns.shape[0] == len(
    numeric_features
    + ordinal_features
    + categorical_features
    + [target]
    + drop_features
)

### Split data for CV

In [108]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
X_train, y_train = train_df.drop(columns=[target]), train_df[target]
X_test, y_test = test_df.drop(columns=[target]), test_df[target]

### Column transformation & preprocessors

In [109]:
numeric_transformer = make_pipeline(SimpleImputer(strategy="constant", fill_value=0), StandardScaler())

categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=True),
)

In [110]:
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    ("drop", drop_features),
)

## Neural Network Framework

In [111]:
# Prepare data
X_train_transformed = preprocessor.fit_transform(X_train)

column_names = (
    numeric_features
    + preprocessor.named_transformers_["pipeline-2"]
    .named_steps["onehotencoder"]
    .get_feature_names_out(categorical_features)
    .tolist()
)

X_train_transformed_df = pd.DataFrame(
    X_train_transformed, columns=column_names, index=X_train.index
)

X_valid_transformed = preprocessor.fit_transform(X_test)

X_valid_transformed_df = pd.DataFrame(
    X_valid_transformed, columns=column_names, index=X_test.index
)

In [112]:
class energy_model(torch.nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.main = nn.Sequential(
            torch.nn.Linear(input_size, 150),
            nn.LeakyReLU(),
            nn.Dropout(0.3),
            
            torch.nn.Linear(150, 50),
            nn.LeakyReLU(),
            nn.Dropout(0.3),
            
            torch.nn.Linear(50, 30),
            nn.LeakyReLU(),
            nn.Dropout(0.3),
            
            torch.nn.Linear(30, 1)
        )

    def forward(self, x):
        out= self.main(x)
        return out

In [113]:
def trainer(model, criterion, optimizer, trainloader, validloader, epochs=5, patience=5, verbose=True):
    """Simple training wrapper for PyTorch network."""
    

    train_loss = []
    valid_loss = []
    train_accuracy = []
    valid_accuracy = []
    
    for epoch in range(epochs):  # for each epoch
        
        train_batch_loss = 0
        train_batch_acc = 0
        valid_batch_loss = 0
        valid_batch_acc = 0
        
        # Training
        for X, y in trainloader:
            optimizer.zero_grad()       # Zero all the gradients w.r.t. parameters
            y_hat = model(X).flatten()
            loss = criterion(y_hat, y)   # Calculate loss based on output
            loss.backward()             # Calculate gradients w.r.t. parameters
            optimizer.step()            # Update parameters
            train_batch_loss += loss.item()  # Add loss for this batch to running total
            train_batch_acc += (torch.sqrt(torch.mean((y_hat-y)**2))*-1).type(torch.float32).item()   
            
        train_loss.append(train_batch_loss / len(trainloader))     # loss = total loss in epoch / number of batches = loss per batch
        train_accuracy.append(train_batch_acc / len(trainloader))  # accuracy
        
        # Validation
        model.eval()  # this turns off those random dropout layers, we don't want them for validation!
        
        with torch.no_grad():  # this stops pytorch doing computational graph stuff under-the-hood and saves memory and time
            for X, y in validloader:
                y_hat = model(X).flatten()  # Forward pass to get output
                loss = criterion(y_hat, y)   # Calculate loss based on output
                valid_batch_loss += loss.item()                  # Add loss for this batch to running total
                valid_batch_acc += (torch.sqrt(torch.mean((y_hat-y)**2))*-1).type(torch.float32).item()   # Average accuracy for this batch
                
        valid_loss.append(valid_batch_loss / len(validloader))
        valid_accuracy.append(valid_batch_acc / len(validloader))  # accuracy
        
        model.train()  # turn back on the dropout layers for the next training loop
        
        # Print progress
        if verbose:
            print(f"Epoch {epoch + 1:3}:",
                  f"Train Loss: {train_loss[-1]:.3f}.",
                  f"Valid Loss: {valid_loss[-1]:.3f}.",
                  f"Train Accuracy: {train_accuracy[-1]:.2f}.",
                  f"Valid Accuracy: {valid_accuracy[-1]:.2f}.")
        
        # Early stopping
        if epoch > 0 and valid_loss[-1] > valid_loss[-2]:
            consec_increases += 1
        else:
            consec_increases = 0
        if consec_increases == patience:
            print(f"Stopped early at epoch {epoch + 1:3}: val loss increased for {consec_increases} consecutive epochs!")
            break
    
    results = {"train_loss": train_loss,
               "valid_loss": valid_loss,
               "train_accuracy": train_accuracy,
               "valid_accuracy": valid_accuracy}
    return results

In [114]:
input_size = np.shape(X_train_transformed_df)[1]
model = energy_model(input_size)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr =0.001)


train_target = torch.tensor(y_train.values.astype(np.float32))
train = torch.tensor(X_train_transformed_df.values.astype(np.float32)) 
train_tensor = data_utils.TensorDataset(train, train_target) 
train_loader = data_utils.DataLoader(dataset = train_tensor,  batch_size=15, shuffle=True)


valid_target = torch.tensor(y_test.values.astype(np.float32))
valid = torch.tensor(X_valid_transformed_df.values.astype(np.float32)) 
valid_tensor = data_utils.TensorDataset(valid, valid_target) 
valid_loader = data_utils.DataLoader(dataset = valid_tensor,  batch_size=15, shuffle=True)

In [115]:
trainer(model, criterion, optimizer, train_loader, valid_loader, epochs=30, patience=3);

Epoch   1: Train Loss: 31.879. Valid Loss: 25.715. Train Accuracy: -48.45. Valid Accuracy: -40.95.
Epoch   2: Train Loss: 29.250. Valid Loss: 25.493. Train Accuracy: -45.60. Valid Accuracy: -40.95.
Epoch   3: Train Loss: 28.963. Valid Loss: 25.449. Train Accuracy: -45.28. Valid Accuracy: -41.10.
Epoch   4: Train Loss: 28.655. Valid Loss: 25.328. Train Accuracy: -44.94. Valid Accuracy: -40.84.
Epoch   5: Train Loss: 28.463. Valid Loss: 25.830. Train Accuracy: -44.80. Valid Accuracy: -42.13.
Epoch   6: Train Loss: 28.244. Valid Loss: 25.219. Train Accuracy: -44.63. Valid Accuracy: -40.60.
Epoch   7: Train Loss: 28.169. Valid Loss: 25.349. Train Accuracy: -44.47. Valid Accuracy: -40.76.
Epoch   8: Train Loss: 28.064. Valid Loss: 25.164. Train Accuracy: -44.49. Valid Accuracy: -40.32.
Epoch   9: Train Loss: 27.895. Valid Loss: 25.001. Train Accuracy: -44.10. Valid Accuracy: -40.36.
Epoch  10: Train Loss: 27.753. Valid Loss: 24.795. Train Accuracy: -44.21. Valid Accuracy: -40.31.
Epoch  11:

## Predicting Using the model

In [125]:
df = pd.read_csv('test.csv')

In [126]:
## Preprocess testing data
temp_averages= ["january_avg_temp","february_avg_temp","march_avg_temp","april_avg_temp","may_avg_temp",
                "june_avg_temp","july_avg_temp","august_avg_temp","september_avg_temp","october_avg_temp","november_avg_temp",
                "december_avg_temp"]
temp_mins= ["january_min_temp","february_min_temp","march_min_temp","april_min_temp","may_min_temp",
                "june_min_temp","july_min_temp","august_min_temp","september_min_temp","october_min_temp","november_min_temp",
                "december_min_temp"]
temp_max= ["january_max_temp","february_max_temp","march_max_temp","april_max_temp","may_max_temp",
                "june_max_temp","july_max_temp","august_max_temp","september_max_temp","october_max_temp","november_max_temp",
                "december_max_temp"]

df["months_above_65"] =(df[temp_averages] >=65).sum(axis=1)
df["months_below_65"] =(df[temp_averages] <65).sum(axis=1)
df["months_min_below_65"] = (df[temp_mins] <65).sum(axis=1)
df["months_min_above_65"] = (df[temp_mins] >=65).sum(axis=1)
df["months_max_below_65"] = (df[temp_max] <65).sum(axis=1)
df["months_max_above_65"] = (df[temp_max] >=65).sum(axis=1)

df = pd.merge(df, facility_class, on="facility_type")

id_submission = df["id"]

In [127]:
X_test_transformed = preprocessor.fit_transform(df)
column_names_testing = (
    numeric_features
    + preprocessor.named_transformers_["pipeline-2"]
    .named_steps["onehotencoder"]
    .get_feature_names_out(categorical_features)
    .tolist()
)
X_test_transformed = pd.DataFrame(
   X_test_transformed, columns=column_names_testing )

In [128]:
# State factor 6 needs to be manually added in
X_test_transformed['State_Factor_State_6'] = 0

# reorder columns to be in same order as training set
X_test_transformed = X_test_transformed[X_train_transformed_df.columns]

# Change to tensor
X_test_transformed= torch.tensor(X_test_transformed.values.astype(np.float32))

In [129]:
prediction = []
for i in range(0,X_test_transformed.shape[0]):
    prediction.append( model(X_test_transformed[i]).item())
    
submission = pd.DataFrame({'id': id_submission, 'site_eui': prediction})
submission.to_csv("submission.csv", index=False)