In [1]:
%config Completer.use_jedi = False

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from scipy import stats

import shap

from sklearn.preprocessing import StandardScaler

import optuna.integration.lightgbm as lgbm
import optuna


import warnings
warnings.filterwarnings('ignore')

import wandb

plt.rcParams.update({'font.size': 18})
plt.style.use('ggplot')

In [None]:
train = pd.read_csv('./widsdatathon2022/train.csv')
test = pd.read_csv('./widsdatathon2022/test.csv')

train.head()

In [None]:
print(train.State_Factor.unique())
print(test.State_Factor.unique())

In [None]:
train.describe()

In [None]:
train.info()

In [None]:
train.isnull().sum()/len(train)*100

In [None]:
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = train.select_dtypes(include=['object']).columns
cat_df = train[categorical_features]
num_df = train[numeric_features]

In [None]:
cat_df.isnull().sum()/len(train)*100

In [None]:
num_df.isnull().sum()/len(train)*100

In [None]:
numerical = ['Year_Factor', 'floor_area', 'year_built','energy_star_rating', 'ELEVATION',
             'direction_peak_wind_speed','max_wind_speed', 'days_with_fog', 'site_eui']
num_df[numerical].hist(bins=20, layout=(3,3), figsize=(9,3))
plt.tight_layout()
plt.show()

In [None]:
numerical = ['january_min_temp', 'january_avg_temp', 'january_max_temp',
             'february_min_temp', 'february_avg_temp','february_max_temp']
num_df[numerical].hist(bins=20, layout=(3,3), figsize=(9,3))
plt.tight_layout()
plt.show()

In [None]:
numerical = ['march_min_temp', 'march_avg_temp', 'march_max_temp',
             'april_min_temp', 'april_avg_temp','april_max_temp']
num_df[numerical].hist(bins=20, layout=(3,3), figsize=(9,3))
plt.tight_layout()
plt.show()

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(20, 10))
for variable, ax in zip(categorical_features, axs.flatten()):
    sns.histplot(train, x='site_eui', hue=variable, multiple='stack', ax=ax)

In [None]:
# https://www.kaggle.com/usharengaraju/wids2022-lgbm-starter-w-b
plt.figure(figsize=(25, 11))
sns.heatmap(train.isna().values, 
            cmap=['#ffd514','#ff355d'], xticklabels=train.columns)
plt.title("Missing values in training Data", size=20)

In [None]:
#code copied from https://www.kaggle.com/shrutisaxena/wids2022-starter-code
missing_columns = [col for col in train.columns if train[col].isnull().any()]
missingvalues_count =train.isna().sum()
missingValues_df = pd.DataFrame(missingvalues_count.rename('Null Values Count')).loc[missingvalues_count.ne(0)]
missingValues_df.style.background_gradient(cmap="Pastel1")

In [None]:
train.describe().style.background_gradient(cmap="Pastel1")

In [None]:
plt.figure(figsize=(15, 7))
plt.subplot(121)
sns.kdeplot(train.site_eui , color = "#ffd514")
plt.subplot(122)
sns.boxplot(train.site_eui , color = "#ff355d");

In [4]:
train = pd.read_csv('./widsdatathon2022/train.csv')
test = pd.read_csv('./widsdatathon2022/test.csv')

In [33]:
# year_built: replace with current year.
train['year_built'] = train['year_built'].replace(np.nan, 2022)
# energy_star_rating: replace with mean.
train['energy_star_rating'] = train['energy_star_rating'].\
replace(np.nan, train['energy_star_rating'].mean())
# direction_max_wind_speed, direction_peak_wind_speed, max_wind_speed: 
# replace with 1 with since 25, 50 75 percentiles are all 1.
train['direction_max_wind_speed'] = \
train['direction_max_wind_speed'].replace(np.nan, 1)
train['direction_peak_wind_speed'] = \
train['direction_peak_wind_speed'].replace(np.nan, 1)
train['max_wind_speed'] = train['max_wind_speed'].replace(np.nan, 1)
# energy_star_rating: replace with mean.
train['days_with_fog'] = train['days_with_fog'].replace(np.nan,
                                                        train['days_with_fog'].mean())
# test data
# year_built: replace with current year.
test['year_built'] = test['year_built'].replace(np.nan, 2022)
# energy_star_rating: replace with mean.
test['energy_star_rating'] = test['energy_star_rating'].\
replace(np.nan, test['energy_star_rating'].mean())
# direction_max_wind_speed, direction_peak_wind_speed, max_wind_speed: 
# replace with 1 with since 25, 50 75 percentiles are all 1.
test['direction_max_wind_speed'] = \
test['direction_max_wind_speed'].replace(np.nan, 1)
test['direction_peak_wind_speed'] = \
test['direction_peak_wind_speed'].replace(np.nan, 1)
test['max_wind_speed'] = test['max_wind_speed'].replace(np.nan, 1)
# energy_star_rating: replace with mean.
test['days_with_fog'] = test['days_with_fog'].replace(np.nan,
                                                      test['days_with_fog'].mean())

In [34]:
# handling categorical fields
from sklearn.preprocessing import LabelBinarizer
le = LabelBinarizer()
train = pd.get_dummies(train, columns=['building_class'])
test = pd.get_dummies(test, columns=['building_class'])

# cols = ['State_Factor','facility_type']
# train.drop(cols, axis=1, inplace=True)
# test.drop(cols, axis=1, inplace=True)
# train['building_class']= le.fit_transform(train['building_class']).astype("uint8")
# test['building_class']= le.fit_transform(test['building_class']).astype("uint8")
train['State_Factor']= le.fit_transform(train['State_Factor']).astype("uint8")
test['State_Factor']= le.fit_transform(test['State_Factor']).astype("uint8")
train['facility_type']= le.fit_transform(train['facility_type']).astype("uint8")
test['facility_type']= le.fit_transform(test['facility_type']).astype("uint8")

In [35]:
feat = [col for col in train.columns if col not in ["id", "site_eui"]]
X = train[feat]
y = train["site_eui"]

X_test = test[feat]

In [5]:
# https://www.kaggle.com/avinashreddykovvuri/a01-vitb-avinash-s-wids-mark-6

def nul_values(col,data):
    null_val = dict()
    for i in col:
        if (data[i].isnull().sum())>0:
            null_val[i] = (data[i].isnull().sum())
    print(null_val)
    
print("Null values in train data",end=" : ")
nul_values(train.columns,train)
print("Null values in test data",end=" : ")
nul_values(test.columns,test)

# train = train.drop(["direction_max_wind_speed", 
#                     'direction_peak_wind_speed', 'max_wind_speed'],axis=1)
# test = test.drop(["direction_max_wind_speed", 
#                   'direction_peak_wind_speed', 'max_wind_speed'],axis=1)
train['direction_max_wind_speed'] = \
train['direction_max_wind_speed'].replace(np.nan, 1)
train['direction_peak_wind_speed'] = \
train['direction_peak_wind_speed'].replace(np.nan, 1)
train['max_wind_speed'] = train['max_wind_speed'].replace(np.nan, 1)

test['direction_max_wind_speed'] = \
test['direction_max_wind_speed'].replace(np.nan, 1)
test['direction_peak_wind_speed'] = \
test['direction_peak_wind_speed'].replace(np.nan, 1)
test['max_wind_speed'] = test['max_wind_speed'].replace(np.nan, 1)


b = pd.concat([train["year_built"],train["energy_star_rating"],train['days_with_fog']],axis=1)
a = pd.concat([test["year_built"],test["energy_star_rating"],test["days_with_fog"]],axis=1)
test = test.drop(["year_built","energy_star_rating",'days_with_fog'],axis=1)
train = train.drop(["year_built","energy_star_rating",'days_with_fog'],axis=1)

from sklearn.impute import SimpleImputer
# Imputation
my_imputer = SimpleImputer()
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(a))
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(b))
# Imputation removed column names; put them back
imputed_X_test.columns = a.columns
imputed_X_train.columns = b.columns
train = pd.concat([train,imputed_X_train],axis=1)
test = pd.concat([test,imputed_X_test],axis=1)

nul_values(train.columns,train)
nul_values(test.columns,test)

def fndCatData(data):
    col_names = []
    for col in data.columns:
        if data[col].dtypes == "object":
            col_names.append(col)
    print(col_names)
# fndCatData(train)
# fndCatData(test)

from sklearn.preprocessing import LabelEncoder
train["State_Factor"] = LabelEncoder().fit_transform(train["State_Factor"])
train["building_class"] = LabelEncoder().fit_transform(train["building_class"])
train["facility_type"] = LabelEncoder().fit_transform(train["facility_type"])
test["State_Factor"] = LabelEncoder().fit_transform(test["State_Factor"])
test["building_class"] = LabelEncoder().fit_transform(test["building_class"])
test["facility_type"] = LabelEncoder().fit_transform(test["facility_type"])

train = train.drop(["id"],axis=1)
X = train.drop(["site_eui"],axis=1)
id = test['id']
test = test.drop(["id"],axis=1)
X_test = test
y = train["site_eui"]

Null values in train data : {'year_built': 1837, 'energy_star_rating': 26709, 'direction_max_wind_speed': 41082, 'direction_peak_wind_speed': 41811, 'max_wind_speed': 41082, 'days_with_fog': 45796}
Null values in test data : {'year_built': 92, 'energy_star_rating': 2254, 'direction_max_wind_speed': 8575, 'direction_peak_wind_speed': 8575, 'max_wind_speed': 8575, 'days_with_fog': 9117}
{}
{}


In [91]:
import copy
trainnames = copy.deepcopy(X)
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=42)

In [7]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(verbose=2000,
                          iterations=28000,
                          early_stopping_rounds=10,
                          random_seed=42,
                          max_depth=15,
                          task_type='GPU',
                          learning_rate=0.0025,
                          eval_metric='RMSE',
                          loss_function='RMSE'
                        )

model.fit(X_train, y_train)

0:	learn: 58.2038308	total: 361ms	remaining: 2h 48m 21s
2000:	learn: 38.2261971	total: 12m 3s	remaining: 2h 36m 44s
4000:	learn: 34.9586242	total: 24m 5s	remaining: 2h 24m 30s
6000:	learn: 32.6071055	total: 36m 9s	remaining: 2h 12m 31s
8000:	learn: 30.7575209	total: 48m 12s	remaining: 2h 30s
10000:	learn: 29.2274190	total: 1h 19s	remaining: 1h 48m 33s
12000:	learn: 27.9228297	total: 1h 12m 24s	remaining: 1h 36m 31s
14000:	learn: 26.7984989	total: 1h 24m 33s	remaining: 1h 24m 32s
16000:	learn: 25.7960259	total: 1h 36m 47s	remaining: 1h 12m 34s
18000:	learn: 24.9060015	total: 1h 49m 2s	remaining: 1h 34s
20000:	learn: 24.1054147	total: 2h 1m 13s	remaining: 48m 29s
22000:	learn: 23.3730446	total: 2h 13m 29s	remaining: 36m 23s
24000:	learn: 22.7073582	total: 2h 25m 43s	remaining: 24m 16s
26000:	learn: 22.0943359	total: 2h 38m 4s	remaining: 12m 9s
27999:	learn: 21.5228471	total: 2h 50m 29s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1a6faaa8df0>

In [116]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# param_grid = {
#    'n_estimators': [100, 200, 500],
#    'max_features': ['auto', 'sqrt', 'log2'],
#    'max_depth' : [4,5,6,7,8],
#    'criterion' :['mse', 'mae']
# }
# random_forest_tuning = RandomForestRegressor(random_state = 42)
# GSCV = GridSearchCV(estimator=random_forest_tuning, param_grid=param_grid, cv=5)
# GSCV.fit(X_train, y_train)
# GSCV.best_params_ 
model = RandomForestRegressor(n_estimators = 1000, max_features = 'auto',
                              max_depth = 450, criterion ='mse')
model.fit(X, y)

RandomForestRegressor(max_depth=450, n_estimators=1000)

In [121]:
from xgboost import XGBRegressor

model = XGBRegressor(n_estimators=1000, learning_rate=0.0025, max_depth=450)
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.0025, max_delta_step=0,
             max_depth=450, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [125]:
import lightgbm as ltb
model = ltb.LGBMRegressor(n_estimators=1000, learning_rate=0.0025,max_depth=450,subsample=0.5)
model.fit(X_train, y_train)

LGBMRegressor(learning_rate=0.0025, max_depth=450, n_estimators=1000,
              subsample=0.5)

In [8]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
y_pred_class_rf = model.predict(X_val)
# y_pred_prob_rf = model.predict_proba(X_val)


print('rmse is {:.3f}'.format(np.sqrt(mean_squared_error(y_val,y_pred_class_rf))))
# print('roc-auc is {:.3f}'.format(roc_auc_score(y_val,y_pred_prob_rf[:,1])))

rmse is 40.983


In [None]:
y_pred_class_rf

In [9]:
# preds = np.mean(np.column_stack(predictions), axis=1)
preds = model.predict(X_test)
sub = pd.DataFrame(columns = ['id', 'site_eui'])
sub['id'] = id
sub['site_eui'] = preds
sub

Unnamed: 0,id,site_eui
0,75757,218.144297
1,75758,226.979362
2,75759,174.948451
3,75760,231.524225
4,75761,216.896730
...,...,...
9700,85457,51.420409
9701,85458,53.121744
9702,85459,81.370424
9703,85460,75.563593


In [10]:
sub.to_csv("submission.csv", index=False)