In [7]:
import os

import lightgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from skimpy import skim
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

sns.set_style("whitegrid")


In [2]:
# Read the train and test splits from the local data directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv"))
# skim(train)
# Labels of the numeric columns in the training frame.
numerical_features = train.select_dtypes(include='number').columns


In [8]:
def fill_missing_values(df, test_df):
    """Impute missing numeric values using column means learned from ``df``.

    A ``year_built`` of 0 is treated as missing and converted to NaN first.
    A single imputer is then fitted on the training frame only and applied to
    both frames, so the test set is filled with training-set means.

    Only numeric columns that exist in both frames, contain a gap in at least
    one of them, and have at least one observed value in ``df`` are imputed.
    Anything else that still holds NaN is reported by the final two prints.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data.
    test_df : pandas.DataFrame
        Test data.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The training and test frames after imputation.
    """
    # Flag placeholder build years as missing *before* collecting the columns
    # with gaps, otherwise a column whose only gaps are zeros is never imputed.
    for frame in (df, test_df):
        if "year_built" in frame.columns:
            frame["year_built"] = frame["year_built"].mask(frame["year_built"] == 0)

    missing_columns_train = [col for col in df.columns if df[col].isnull().any()]
    print("Columns with Missing values in train set:", missing_columns_train)
    missing_columns_test = [col for col in test_df.columns if test_df[col].isnull().any()]
    print("Columns with Missing values in test set:", missing_columns_test)

    # Use one shared column list for fit and transform: the imputer must see
    # the same features, in the same order, in both calls.
    missing_anywhere = set(missing_columns_train) | set(missing_columns_test)
    impute_cols = [
        col
        for col in df.columns
        if col in missing_anywhere
        and col in test_df.columns
        and pd.api.types.is_numeric_dtype(df[col])
        and pd.api.types.is_numeric_dtype(test_df[col])
        # The mean of an all-NaN training column is undefined.
        and df[col].notna().any()
    ]

    if impute_cols:
        imputer = SimpleImputer()
        # Assign the raw arrays back: this is positional, so it cannot be
        # misaligned by a non-default index the way a fresh DataFrame would be.
        df[impute_cols] = imputer.fit_transform(df[impute_cols])
        test_df[impute_cols] = imputer.transform(test_df[impute_cols])

    print("Columns with Missing values in train set after imputation:", df.columns[df.isnull().any()])
    print("Columns with Missing values in test set after imputation:", test_df.columns[test_df.isnull().any()])

    return df, test_df


def categorical_label_encoding(df, test_df):
    """Label-encode the object-dtype columns of both frames consistently.

    For every object column of ``df`` (except the target ``site_eui``) an
    encoder is fitted on the training values and the same mapping is applied
    to ``test_df``, so a category gets the same integer code in both frames.
    Test values that never occur in training are encoded as ``-1``.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data.
    test_df : pandas.DataFrame
        Test data.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The training and test frames with encoded categorical columns.
    """
    object_cols = [
        col for col in df.columns
        if col != "site_eui" and df[col].dtype == "object"
    ]
    print(f"Starting Label Encoding for {object_cols}")

    for col in object_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])

        if col not in test_df.columns:
            continue

        # Reuse the mapping learned on train. Refitting on the test set would
        # assign codes from the test set's own sorted labels, which need not
        # match the training codes.
        known = test_df[col].isin(encoder.classes_)
        codes = pd.Series(-1, index=test_df.index, dtype="int64")
        if known.any():
            codes.loc[known] = encoder.transform(test_df.loc[known, col])
        test_df[col] = codes

    return df, test_df


def scale_data(df, test_df):
    """Standardise the feature columns and split off the target.

    ``site_eui`` (target) and ``id`` are removed from the training frame and
    ``id`` from the test frame. A ``StandardScaler`` is fitted on the training
    features and applied to both frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the ``site_eui`` and ``id`` columns.
    test_df : pandas.DataFrame
        Test data containing the ``id`` column and every training feature.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame, pandas.Series)
        Scaled training features, scaled test features and the unscaled
        target. The scaled frames keep the row index of their inputs, so the
        training features stay aligned with the target.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    print("Scaling Data with StandardScaler")

    # code copied from https://www.kaggle.com/usharengaraju/wids2022-lgbm-starter-w-b
    y_df = df["site_eui"]
    df = df.drop(["site_eui", "id"], axis=1)
    test_df = test_df.drop(["id"], axis=1)
    # Present the test features in exactly the order the scaler is fitted on.
    test_df = test_df[df.columns]

    scaler = StandardScaler()
    # Return scaled data as DataFrames with the original column names and
    # row index; dropping the index would detach the features from y_df.
    df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    test_df = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns, index=test_df.index)
    return df, test_df, y_df


def feature_selection(df, test_df):
    """List the features that carry no information in either frame.

    A numeric column is selected for removal when its standard deviation is
    zero or undefined (NaN) in ``df`` or in ``test_df``. The target column
    ``site_eui`` is never selected, and non-numeric columns are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data.
    test_df : pandas.DataFrame
        Test data.

    Returns
    -------
    list of str
        Column names to drop, each listed once, in the order they were found
        (training frame first, then the test frame).
    """
    print("Feature Selection")
    # drop features where standard deviation is 0
    drop_cols = []
    for frame in (df, test_df):
        for col in frame.columns:
            if col == "site_eui" or col in drop_cols:
                continue
            if not pd.api.types.is_numeric_dtype(frame[col]):
                continue
            spread = frame[col].std()
            # NaN never compares equal to anything, including np.nan, so an
            # undefined spread has to be detected with pd.isna.
            if pd.isna(spread) or spread == 0:
                drop_cols.append(col)
                print(col)
    return drop_cols


def preprocess_data(df, test_df):
    """Run the preprocessing steps in order: impute, label-encode, scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data.
    test_df : pandas.DataFrame
        Test data.

    Returns
    -------
    tuple
        The three values produced by ``scale_data``: training features, test
        features and the target.
    """
    imputed_train, imputed_test = fill_missing_values(df, test_df)
    encoded_train, encoded_test = categorical_label_encoding(imputed_train, imputed_test)
    train_features, test_features, target = scale_data(encoded_train, encoded_test)

    print("Preprocessing Done")
    print("Train Data Shape:", train_features.shape)
    print("Test Data Shape:", test_features.shape)
    return train_features, test_features, target


In [5]:
# preprocess_data returns three values: train features, test features, target.
# Copies are passed so the raw `train` / `test` frames are left untouched and
# this cell gives the same result every time it is re-run.
preprocessed_df, preprocessed_test_df, y_df = preprocess_data(train.copy(), test.copy())

Columns with Missing values in train set: ['year_built', 'energy_star_rating', 'direction_max_wind_speed', 'direction_peak_wind_speed', 'max_wind_speed', 'days_with_fog']
Columns with Missing values in test set: ['year_built', 'energy_star_rating', 'direction_max_wind_speed', 'direction_peak_wind_speed', 'max_wind_speed', 'days_with_fog']


In [None]:
# Hold out 25% of the labelled training data for model validation.
# Note: X_test / y_test are this validation split, not the unlabelled test set
# loaded from data/test.csv.
assert len(preprocessed_df) == len(y_df), "features and target must have the same number of rows"

X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_df, y_df, test_size=0.25, random_state=42)
print("Train Data Shape:", X_train.shape)
print("Test Data Shape:", X_test.shape)

## Modelling 
We model the data as a regression problem. We use the following models:
- Linear Regression
- Decision Tree Regressor
- Random Forest Regressor
- Gradient Boosting (GBM)
- Light Gradient Boosting (LightGBM)
- Extreme Gradient Boosting (XGBoost)

Before choosing the best-performing model, we first compare the R2 scores of all models with their default settings. For the models that perform well, we then tune their hyperparameters to improve their performance. Cross-validation is used so that each score is a more reliable estimate of how well the model generalizes.

In [None]:
# A class which returns the cross-validated mean R2 score of a model

class CV_regression_model:
    """Pair an estimator with its data and score it by cross-validated R2.

    Attributes
    ----------
    model_name : label used when printing results.
    model : estimator handed to ``cross_val_score``.
    X, y : features and target handed to ``cross_val_score``.
    folds : value passed as ``cv`` (default 5).
    results, mean_score, std_score : per-fold scores, their mean and their
        standard deviation; all ``None`` until ``fit`` has been called.
    """

    def __init__(self, model_name, model, X, y, folds=5):
        self.model_name, self.model = model_name, model
        self.X, self.y = X, y
        self.folds = folds
        # Filled in by fit().
        self.results = self.mean_score = self.std_score = None

    def fit(self):
        """Cross-validate the model with R2 scoring and store the scores."""
        fold_scores = cross_val_score(
            self.model, self.X, self.y, cv=self.folds, scoring='r2'
        )
        self.results = fold_scores
        self.mean_score, self.std_score = np.mean(fold_scores), np.std(fold_scores)

    def print_results(self):
        """Print the model name, mean R2, its standard deviation and all fold scores."""
        report_lines = (
            f"Model: {self.model_name}",
            f"Mean R2 score: {self.mean_score}",
            f"Standard Deviation: {self.std_score}",
            f"R2 scores: {self.results}",
        )
        for report_line in report_lines:
            print(report_line)

    def r2_scores(self):
        """Return ``(mean, std)`` rounded to 4 and 3 decimal places respectively."""
        rounded_mean = round(self.mean_score, 4)
        rounded_std = round(self.std_score, 3)
        return rounded_mean, rounded_std

In [None]:
# Baseline estimators with default hyperparameters. The seed is fixed for
# every estimator that accepts one so the comparison is reproducible.
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(random_state=42)
rfr_model = RandomForestRegressor(random_state=42)
gbr_model = GradientBoostingRegressor(random_state=42)
xgbr_model = XGBRegressor(random_state=42)
lgbm_model = lightgbm.LGBMRegressor(random_state=42)

lr = CV_regression_model("Linear Regression", lr_model, X_train, y_train)
dt = CV_regression_model("Decision Tree", dt_model, X_train, y_train)
rfr = CV_regression_model("Random Forest", rfr_model, X_train, y_train)
gbr = CV_regression_model("Gradient Boosting", gbr_model, X_train, y_train)
xgbr = CV_regression_model("XGBoost", xgbr_model, X_train, y_train)
lgbm = CV_regression_model("LightGBM", lgbm_model, X_train, y_train)

cv_models = [lr, dt, rfr, gbr, xgbr, lgbm]

# Score every model the same way and collect one (mean, std) pair per model,
# in the same order as `cv_models`.
df_r2_score = []
for cv_model in cv_models:
    cv_model.fit()
    cv_model.print_results()
    df_r2_score.append(cv_model.r2_scores())

# Define a data frame that holds each model's name and mean R2 score.
df_model = [cv_model.model_name for cv_model in cv_models]
# r2_scores() yields (mean, std); only the mean is plotted.
df_r2 = [mean_r2 for mean_r2, _ in df_r2_score]
result = pd.DataFrame({"model": df_model, "r2": df_r2})

# make barplot of result
plt.figure(figsize=(10, 6))
sns.barplot(x='model', y='r2', data=result)
plt.title('R2 Scores of Models')
plt.show()