## Sample Model

Based on the exploration, we try the following *as a starting point*.

> #### Delete the following columns
> - days_with_fog
> - direction_peak_wind_speed
> - direction_max_wind_speed
> - max_wind_speed
> - year_built
> - days_above_90F
> - days_above_110F
> - facility_type
> - site_eui (obviously, because target column)
> - For now also delete Year_Factor

> #### Impute missing values for energy_star_rating 
> Imputing is done by replacing NaN with the mean. <br>
> On second thought, we do NOT impute, as XGBoost handles missing values natively.

> #### One-hot encode categorical values 
> - State_Factor
> - Building_Class

For the sample model, we use **XGBoost** and **LightGBM** (a CatBoost setup is included further below but is not part of the ensemble).

In [1]:
import sklearn
import xgboost as xgb
from evaluation import RMSE
from evaluation import helper_func
import pandas as pd
import gc
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import lightgbm as lgbm
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
import optuna
import math
from sklearn.model_selection import cross_val_score, KFold

#### Preparing and reading data

In [2]:
# Directory that holds the competition CSV files; defined once so the
# location only has to be changed in a single place.
DATA_DIR = "/Users/charlottefelius/documents/wids2022/WIDS/data"

# train.csv -> `data`, test.csv -> `submission` (the rows to predict)
data = pd.read_csv(f"{DATA_DIR}/train.csv")
submission = pd.read_csv(f"{DATA_DIR}/test.csv")

In [3]:
len(submission)

9705

### Feature engineering

#### delete columns

In [4]:
# Start from a copy of the training frame that is already loaded in `data`
# instead of parsing train.csv from disk a second time.
reduced = data.copy()

# to_delete = ["days_with_fog", "direction_peak_wind_speed", "direction_max_wind_speed", "max_wind_speed",
#              "days_above_90F", "days_above_110F", "Year_Factor"]

to_delete_less = ["days_with_fog", "direction_peak_wind_speed", "direction_max_wind_speed", "max_wind_speed"]
# to_delete_less = ["building_class"]

def delete_cols(dataframe, columns):
    """Remove the given columns from ``dataframe`` in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify. It is changed in place; nothing is returned.
    columns : iterable of str
        Column labels to remove, processed in the given order.

    Raises
    ------
    KeyError
        If a label is not present. Labels listed before it have already
        been removed at that point.
    """
    for label in columns:
        # pop() deletes the column; the returned Series is not needed
        dataframe.pop(label)

delete_cols(reduced, to_delete_less)
delete_cols(submission, to_delete_less)

# # collect garbage
# gc.collect()


#### Temperature difference

In [5]:
min_temp = [string for string in reduced.columns if 'min_temp' in string]
max_temp = [string for string in reduced.columns if 'max_temp' in string]

In [6]:
below_ex = [string for string in reduced.columns if 'days_below_30F' in string]
above_ex = [string for string in reduced.columns if 'days_above_80F' in string]

In [7]:
above_ex

['days_above_80F']

In [8]:
# For every (min_temp, max_temp) column pair add a "<prefix>diff" column holding
# max - min, where <prefix> is the part of the min column name before "min".
# The two lists are paired by position, so they must be in the same order.

for frame in (reduced, submission):
    for low_col, high_col in zip(min_temp, max_temp):
        diff_col = low_col.split("min")[0] + "diff"
        frame[diff_col] = frame[high_col] - frame[low_col]

In [9]:
# "extreme_temp" = days_above_80F - days_below_30F
# (the columns collected in above_ex / below_ex)

for frame in (reduced, submission):
    for cold_col, hot_col in zip(below_ex, above_ex):
        frame["extreme_temp"] = frame[hot_col] - frame[cold_col]

In [10]:
# temp diff aug - january

# reduced["climate"] = (reduced["august_avg_temp"] - reduced["january_avg_temp"]) * reduced[['january_diff', 'february_diff',
#                                                                                            'march_diff', 'april_diff', 'may_diff', 'june_diff',
#                                                                                           'july_diff', 'august_diff', 'september_diff',
#                                                                                           'october_diff', 'november_diff', 'december_diff']].mean(axis=1)



In [11]:
# for i in reduced.columns:
#     print(i)

In [12]:
# delete_cols(reduced, min_temp)
# delete_cols(reduced, max_temp)
# delete_cols(submission, min_temp)
# delete_cols(submission, max_temp)

#### Impute variables 

In [13]:
# # take average per facility type for star_rating

# red = reduced.groupby("facility_type").mean()["energy_star_rating"]
# sub = submission.groupby("facility_type").mean()["energy_star_rating"]

# fill_dict = {}

# for i, j in zip(red.items(), sub.items()): 
#     avg = (i[1] + j[1]) / 2
#     if math.isnan(avg) == False:
#         fill_dict[i[0]] = avg
    
# nans = []

# for i, j in zip(red.items(), sub.items()): 
#     avg = (i[1] + j[1]) / 2
#     if math.isnan(avg) == False:
#             continue
#     nans.append(i[0])

# for key in nans:
#     fill_dict[key] = 10
    
# for i,j in reduced[reduced['energy_star_rating'].isna()]["facility_type"].items():
#     reduced.loc[i,'energy_star_rating'] = fill_dict[j]
    
# for i,j in submission[submission['energy_star_rating'].isna()]["facility_type"].items():
#     submission.loc[i,'energy_star_rating'] = fill_dict[j]

In [14]:
# Fill missing energy_star_rating with a fixed constant.
# Original notes: 56 "works best" (said to give the highest correlation);
# the median 76.0 and a random pick of 63 were tried and rejected.
ENERGY_STAR_FILL = 56.0
reduced['energy_star_rating'] = reduced['energy_star_rating'].fillna(ENERGY_STAR_FILL)
submission['energy_star_rating'] = submission['energy_star_rating'].fillna(ENERGY_STAR_FILL)

# Fill missing year_built with the TRAINING median in both frames, so that
# train and test rows are imputed with the same value and no statistic is
# derived from the test set.
year_built_median = reduced['year_built'].median()
reduced['year_built'] = reduced['year_built'].fillna(year_built_median)
submission['year_built'] = submission['year_built'].fillna(year_built_median)

In [15]:
# fillmedian = ["direction_max_wind_speed", "direction_peak_wind_speed", "days_with_fog", "max_wind_speed"]

# for i in fillmedian:
#     reduced[i] = reduced[i].fillna(reduced[i].median())
#     submission[i] = submission[i].fillna(submission[i].median())

In [16]:
# test which value enhances highest correlation still
# highest_corr = []

# for i in data['energy_star_rating'].unique():
#     d = pd.read_csv("/Users/charlottefelius/documents/wids2022/WIDS/data/train.csv")
#     d['energy_star_rating'] = d['energy_star_rating'].fillna(i)
#     correlations = helper_func.get_correlation(d, 'site_eui', 0)[1]
#     highest_corr.append((i, correlations[1]))

#### Create new columns

In [17]:
# below compute sum of below 30 and sum of above 80
# days_ = [string for string in reduced.columns if 'days_' in string]

In [18]:
# # below compute sum of below 30 and sum of above 80
# reduced['d_below_30_F'] = reduced[list(reduced.filter(regex='days_below_'))].sum(axis=1)
# reduced['d_above_80_F'] = reduced[list(reduced.filter(regex='days_above_'))].sum(axis=1)
# submission['d_below_30_F'] = submission[list(submission.filter(regex='days_below_'))].sum(axis=1)
# submission['d_above_80_F'] = submission[list(submission.filter(regex='days_above_'))].sum(axis=1)

In [19]:
# delete the other columns
# delete_cols(reduced, days_)
# delete_cols(submission, days_)

#### Onehot encoding

In [20]:
# onehot = ["facility_type", "State_Factor", "building_class"]
onehot = ["State_Factor", "building_class"]
#"building_class"

def onehotter(dataframe, to_onehot):
    """One-hot encode the listed columns and return the resulting frame.

    Each column named in ``to_onehot`` is replaced by indicator columns
    produced by ``pd.get_dummies`` with the column name as prefix; the new
    columns are appended on the right. The frame passed in is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the columns to encode.
    to_onehot : iterable of str
        Names of the columns to encode, processed in order.

    Returns
    -------
    pandas.DataFrame
        The encoded frame (the input object itself when ``to_onehot`` is empty).
    """
    encoded = dataframe
    for column in to_onehot:
        indicators = pd.get_dummies(encoded[column], prefix=column)
        # append the indicator columns, then discard the raw column
        widened = pd.concat([encoded, indicators], axis=1)
        encoded = widened.drop([column], axis=1)
    return encoded

In [21]:
submission = onehotter(submission, onehot)
reduced = onehotter(reduced, onehot)

#### Group facility

In [22]:
groups_facility_ = ["Mixed_Use_", "Commercial_", "Data_", "Education_", "Food_", "Health_", "Lodging_", "Warehouse_",
                   "Service_", "Retail_", "Public_Assembly_", "Public_Safety_", "Office_", "_Unit_Building"]

In [23]:
# groups_facility_ = ["Mixed_Use_", "Office_",  "Lodging_", "Service_", "Retail_", "Public_Safety_", 
#                      "_Unit_Building"]

# Coarsen facility_type: append the marker "new" right after each group
# pattern, then keep only the text before the first "new". Everything that
# followed the pattern is cut off.
# NOTE(review): the split is applied to every row on every pass, so any
# value that contains "new" on its own would be truncated as well - confirm
# no facility_type value does.
for name in groups_facility_:
    marked = name + "new"
    for frame in (reduced, submission):
        frame["facility_type"] = frame.facility_type.str.replace(name, marked)
        frame["facility_type"] = frame["facility_type"].str.split('new').str[0]

In [24]:
# find all columns with this regex, replace value by one of the new names

# for name in groups_facility_:
#     newname = name + "new"
#     reduced["facility_type"] = reduced.facility_type.str.replace(name, newname)
#     reduced["facility_type"] = reduced["facility_type"].str.split('new').str[0]


In [25]:
# also onehot encode

onehot = ["facility_type"]
reduced = onehotter(reduced, onehot)
submission = onehotter(submission, onehot)

In [26]:
# train model per state_factor
# train model per building_type
# train catboost per building_type!

# red_commercial = reduced[reduced['building_class_Commercial'] == 1]
# red_residential = reduced[reduced['building_class_Residential'] == 1]
# del red_commercial['building_class_Commercial']
# del red_residential['building_class_Residential']

# sub_commercial = submission[submission['building_class_Commercial'] == 1]
# sub_residential = submission[submission['building_class_Residential'] == 1]
# del sub_commercial['building_class_Commercial']
# del sub_residential['building_class_Residential']

#### Evaluation

In [27]:
def RMSE(original, predicted):
    """Compute, print and return the root-mean-squared error.

    Note: this definition replaces the ``RMSE`` name imported from
    ``evaluation`` at the top of the notebook.

    Parameters
    ----------
    original : iterable of numbers
        Ground-truth values.
    predicted : iterable of numbers
        Predictions, aligned by position with ``original``.

    Returns
    -------
    float
        ``sqrt(mean((original - predicted) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length. Previously a length
        mismatch was silently truncated by ``zip`` while still dividing by
        ``len(original)``, so an empty ``predicted`` reported an RMSE of 0.0.
    """
    # Materialise both inputs so len() works and values are paired by position.
    original = list(original)
    predicted = list(predicted)

    if len(original) != len(predicted):
        raise ValueError(
            f"original and predicted differ in length: {len(original)} != {len(predicted)}"
        )
    if not original:
        raise ValueError("RMSE is undefined for empty input")

    squared_errors = ((orig - pred) ** 2 for orig, pred in zip(original, predicted))
    RMSE_ = math.sqrt(math.fsum(squared_errors) / len(original))

    print(f'RMSE: {RMSE_}')

    return RMSE_

## Ensemble model

#### Divide per building_type

In [28]:
# train model per state_factor
# train model per building_type
# train catboost per building_type!

# Split train and test rows by building class. Each subset drops only its own
# (now constant) indicator column; the other class's indicator is kept.
red_commercial = reduced[reduced['building_class_Commercial'] == 1].drop(
    columns=['building_class_Commercial'])
red_residential = reduced[reduced['building_class_Residential'] == 1].drop(
    columns=['building_class_Residential'])

sub_commercial = submission[submission['building_class_Commercial'] == 1].drop(
    columns=['building_class_Commercial'])
sub_residential = submission[submission['building_class_Residential'] == 1].drop(
    columns=['building_class_Residential'])

In [29]:
# red_residential.groupby("State_Factor").count()

In [30]:
# sub_commercial.groupby("State_Factor").count()

In [31]:
models_red = [red_commercial, red_residential]
models_sub = [sub_commercial, sub_residential]

In [32]:
def get_pred(model_name):
    """Split a test frame into its feature matrix and its ``id`` column.

    The input frame is left untouched, so the function can be called more
    than once on the same frame. (It used to pop ``id`` from the caller's
    frame, which made a second call raise ``KeyError``.)

    Parameters
    ----------
    model_name : pandas.DataFrame
        Frame with an ``id`` column plus the feature columns.

    Returns
    -------
    X : pandas.DataFrame
        New frame with every column except ``id``.
    ids : pandas.Series
        The ``id`` column.

    Raises
    ------
    KeyError
        If the frame has no ``id`` column.
    """
    ids = model_name["id"]
    X = model_name.drop(columns=["id"])

    return X, ids

def get_X_y(model_name):
    """Split a training frame into features, target and ids.

    The input frame is left untouched, so the same frame can be used for
    both the prediction run and a later evaluation run. (It used to pop the
    columns from the caller's frame, so a second call raised ``KeyError``.)

    Parameters
    ----------
    model_name : pandas.DataFrame
        Frame containing ``site_eui`` (target), ``id`` and
        ``State_Factor_State_6`` in addition to the feature columns.

    Returns
    -------
    X : pandas.DataFrame
        New frame without ``site_eui``, ``id`` and ``State_Factor_State_6``.
    y : pandas.Series
        The ``site_eui`` column.
    y_id : pandas.Series
        The ``id`` column.

    Raises
    ------
    KeyError
        If any of the three columns is missing.
    """
    y = model_name['site_eui']
    y_id = model_name['id']
    # NOTE(review): State_Factor_State_6 is removed from the training features
    # only (get_pred does not remove it) - presumably because the test frame
    # has no such column; confirm.
    X = model_name.drop(columns=['site_eui', 'id', 'State_Factor_State_6'])

    return X, y, y_id

def runmodel_predict(X, y, testdata, model):
    """Fit ``model`` on ``(X, y)`` and return its predictions for ``testdata``.

    Parameters
    ----------
    X, y
        Training features and target, passed straight to ``model.fit``.
    testdata
        Rows to predict, passed straight to ``model.predict``.
    model
        Any object exposing ``fit(X, y)`` and ``predict(rows)``; it is
        fitted in place.

    Returns
    -------
    Whatever ``model.predict(testdata)`` returns.
    """
    model.fit(X, y)
    return model.predict(testdata)

def run_types_pred(data_train, data_test):
    """Train XGBoost and LightGBM per subset and predict the matching test subset.

    ``data_train`` and ``data_test`` are paired by position: the i-th test
    frame is predicted by models fitted on the i-th training frame.

    Parameters
    ----------
    data_train : iterable of pandas.DataFrame
        Training frames, each accepted by ``get_X_y``.
    data_test : iterable of pandas.DataFrame
        Test frames, each accepted by ``get_pred``.

    Returns
    -------
    resultlist_xgb : list
        XGBoost predictions, one entry per subset.
    resultlist_lgbm : list
        LightGBM predictions, one entry per subset.
    ids : list
        The ``id`` column of each test subset, in the same order.
    """
    ids = []
    resultlist_xgb = []
    resultlist_lgbm = []

    for train, test in zip(data_train, data_test):

        # XGBoost

        print("XGBoost")
        X, y, _ = get_X_y(train)
        testdata, y_id = get_pred(test)
        ids.append(y_id)

        # The sklearn-style regressor takes the frame directly, so no
        # xgb.DMatrix is built here (the one built before was never used).
        model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.2,
                max_depth = 11, alpha = 0.2, n_estimators = 1000)

        resultlist_xgb.append(runmodel_predict(X, y, testdata, model))

        # LightGBM

        print("LightGBM")
        model = lgbm.LGBMRegressor(max_depth=10, learning_rate=0.2, n_estimators=2400)
        resultlist_lgbm.append(runmodel_predict(X, y, testdata, model))

    return resultlist_xgb, resultlist_lgbm, ids

In [36]:
to_concat_xgb_t, to_concat_lgbm_t, ids = run_types_pred(models_red, models_sub)

XGBoost
LightGBM
XGBoost
LightGBM


In [37]:
ids2 = [i for si in ids for i in si]

In [38]:
# ids2 heeft id's
ids2 = [i for si in ids for i in si]
xgb = []
lgbm = []
cb = []

for i in to_concat_xgb_t:
    xgb.append(i)
    
for i in to_concat_lgbm_t:
    lgbm.append(i)
    
for i in catbooster:
    cb.append(i)

xgb = [item for sublist in xgb for item in sublist]
lgbm = [item for sublist in lgbm for item in sublist]
cb = [item for sublist in cb for item in sublist]

In [39]:
len(xgb), len(lgbm), len(ids2)

(9705, 9705, 9705)

In [40]:
avg_predicted = []

for i, j in zip(xgb, lgbm):
    avg_predicted.append((i+j)/2)

In [41]:
print(xgb[:5])
print(lgbm[:5])
print(avg_predicted[:5])
print(ids2[:5])

[220.36429, 274.92633, 218.76958, 261.40292, 246.63524]
[203.3668235484881, 225.11405530038988, 142.16586179739295, 240.01912258114322, 184.61675566156586]
[211.86555593928313, 250.02019293339805, 180.46771941188007, 250.7110230825638, 215.6259971545134]
[75757, 75758, 75759, 75760, 75761]


In [42]:
# Submission table: one row per id with the averaged prediction, ordered by id
result = pd.DataFrame({"id": ids2, "site_eui": avg_predicted}).sort_values(by=['id'])

In [43]:
# Write to CSV file

result.to_csv('/Users/charlottefelius/documents/WIDS2022/WIDS/results/xgb_lgbm6.csv', index=False, header=True)

In [44]:
# NOTE(review): `stop` is not defined in the visible notebook, so evaluating it
# raises NameError - presumably a deliberate barrier that halts "Run All"
# before the experimental cells below; confirm.
stop

NameError: name 'stop' is not defined

In [None]:
def runmodel_train(X, y, model, test_size=0.05, random_state=22):
    """Fit ``model`` on a train split and report RMSE on the held-out split.

    Parameters
    ----------
    X, y
        Features and target; split with ``train_test_split``.
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    test_size : float, default 0.05
        Share of the rows held out for evaluation (the former hard-coded value).
    random_state : int, default 22
        Seed for the split (the former hard-coded value).

    Returns
    -------
    preds
        Predictions for the held-out rows.
    y_test
        True target values of the held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("RMSE: %f" % (rmse))
    return preds, y_test

### XGboost

In [None]:
def run_types_train(datasets):
    """Evaluate XGBoost and LightGBM on a hold-out split of each dataset.

    Parameters
    ----------
    datasets : iterable of pandas.DataFrame
        Training frames, each accepted by ``get_X_y``.

    Returns
    -------
    resultlist_xgb : list of (list, list)
        Per dataset, the XGBoost hold-out predictions and the matching
        true values.
    resultlist_lgbm : list of (list, list)
        The same for LightGBM.
    """
    resultlist_xgb = []
    resultlist_lgbm = []

    for i, m in enumerate(datasets):

        # XGBoost

        print(f"Iteration {i}")
        print("XGBoost")
        X, y, _ = get_X_y(m)

        # The sklearn-style regressor takes the frame directly, so no
        # xgb.DMatrix is built here (the one built before was never used).
        model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.2,
                max_depth = 11, alpha = 0.2, n_estimators = 1000)

        predicted, original = runmodel_train(X, y, model)
        resultlist_xgb.append((list(predicted), list(original)))

        # LightGBM

        print("LightGBM")
        model = lgbm.LGBMRegressor(max_depth=10, learning_rate=0.2, n_estimators=2400)
        predicted, original = runmodel_train(X, y, model)
        resultlist_lgbm.append((list(predicted), list(original)))

    return resultlist_xgb, resultlist_lgbm

In [None]:
# Hold-out evaluation of both models on each building-class subset.
# The function is named run_types_train; `run_types` does not exist.
to_concat_xgb, to_concat_lgbm = run_types_train(models_red)

In [None]:
# Each entry of to_concat_xgb / to_concat_lgbm is a (predictions, true values)
# pair for one subset; flatten them across subsets.
predicted = [value for pair in to_concat_xgb for value in pair[0]]
predl = [value for pair in to_concat_lgbm for value in pair[0]]

# The true values are taken from the XGBoost results only.
original = [value for pair in to_concat_xgb for value in pair[1]]

# preds2[0] -> XGBoost predictions, preds2[1] -> LightGBM predictions
preds2 = [predicted, predl]

RMSE(original, predicted)
RMSE(original, predl)

In [None]:
# Ensemble check: arithmetic mean of the two models' hold-out predictions.
# Previously the loop only printed each mean and never filled avg_predicted,
# so the RMSE below was computed against an empty list.
# Ideas noted for later: expm1(mean(log1p(x))), or the weighted mean (3*i + j) / 4.

avg_predicted = [(i + j) / 2 for i, j in zip(preds2[0], preds2[1])]

RMSE(original, avg_predicted)

In [None]:
# NOTE(review): unconditional raise - presumably a deliberate barrier that
# stops "Run All" before the cells below; confirm.
raise NotImplementedError()

### LightGBM

In [None]:
# NOTE(review): in the visible notebook `X` and `y` are only assigned inside
# functions, never at notebook scope, so this cell raises NameError on a fresh
# kernel unless they are defined elsewhere - confirm where they should come from.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=22)

In [None]:
model = lgbm.LGBMRegressor(max_depth=10, learning_rate=0.2, n_estimators=2400)

In [None]:
model.fit(X_train, y_train)

In [None]:
# 37.48368551936998

predicted = model.predict(X_test)

In [None]:
RMSE(y_test, predicted)

### Catboost (Does not work on Apple M1)

In [None]:
MODEL_MAX_DEPTH = 12
MODEL_TASK_TYPE = 'GPU'
MODEL_RL = 0.025
MODEL_EVAL_METRIC ='RMSE'
MODEL_LOSS_FUNCTION = 'RMSE'
MODEL_ESR = 10
MODEL_VERBOSE = 1000
MODEL_ITERATIONS = 28000

In [None]:
# Build the CatBoost regressor from the MODEL_* settings of the configuration
# cell rather than repeating the same numbers as literals (values unchanged:
# verbose 1000, early stopping 10, iterations 28000).
model = CatBoostRegressor(
    verbose=MODEL_VERBOSE,
    early_stopping_rounds=MODEL_ESR,
    #random_state=41,
    random_seed=535,
    max_depth=MODEL_MAX_DEPTH,
    task_type=MODEL_TASK_TYPE,
    learning_rate=MODEL_RL,
    iterations=MODEL_ITERATIONS,
)


In [None]:
# Highest score RMSE: 38.672019

model.fit(X_train, y_train)
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

In [None]:
# NOTE(review): unconditional raise - presumably a deliberate barrier that
# stops "Run All" before the submission cells below; confirm.
raise NotImplementedError()

#### Fit testset, write to .csv

In [None]:
# Now fit on whole training set
model.fit(X, y)
predicted = model.predict(submission)

In [None]:
# Convert Preds to DataFrame 

# NOTE(review): `Y_test_sub` is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel - presumably it should hold
# the submission ids; confirm.
result = pd.DataFrame(Y_test_sub)
result["site_eui"] = predicted

In [None]:
# Write to CSV file

result.to_csv('/Users/charlottefelius/documents/WIDS2022/WIDS/results/attempt32.csv', index=False, header=True)

In [None]:
result