In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import metrics
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Configure cufflinks to use plotly's offline mode.
cf.set_config_file(offline=True)
# Input data files are available in the "../input/" directory.

import datetime

# Suppress every warning for the rest of the session. This also hides
# deprecation and numerical warnings, so comment it out while debugging.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the train and test feature tables from pickle files in the working directory.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
train_features, test_features = map(
    pd.read_pickle,
    ('train_features.pickle', 'test_features.pickle'),
)

In [3]:
def _fractional_dayofyear(timestamps):
    """Return the day of the year plus the hour expressed as a fraction of a day.

    The timestamps are parsed once and reused for both components.
    Minutes and seconds are ignored: only ``dt.hour`` contributes to the
    fractional part.

    Parameters
    ----------
    timestamps : pandas.Series
        Values accepted by ``pd.to_datetime``.

    Returns
    -------
    pandas.Series
        ``dayofyear + hour / 24`` for every row, on the input's index.
    """
    parsed = pd.to_datetime(timestamps)
    return parsed.dt.dayofyear + parsed.dt.hour / 24


# Same feature definition for both tables so train and test stay consistent.
train_features['dayofyear'] = _fractional_dayofyear(train_features['timestamp'])
test_features['dayofyear'] = _fractional_dayofyear(test_features['timestamp'])

In [7]:
# Downsampling: rebalance the classes by pairing the anomalous rows with
# equally sized random draws of the normal rows.
anomaly_flag = train_features['anomaly']
neg = train_features.loc[anomaly_flag.eq(0)]
pos = train_features.loc[anomaly_flag.eq(1)]

print(neg.shape, pos.shape)

# Two draws with different seeds, each as large as the positive set.
# The draws are made independently (they may share rows), and the positive
# rows appear twice in the result, once next to each draw.
n_pos = len(pos)
negs1 = neg.sample(n=n_pos, random_state=10)
negs2 = neg.sample(n=n_pos, random_state=20)
df_eq = pd.concat([negs1, pos, negs2, pos])
print(df_eq.shape)

(1712198, 180) (37296, 180)
(149184, 180)


In [10]:
# Model input columns: every float/int column of the balanced frame except
# the target and two columns that are explicitly left out.
dropped_columns = ['anomaly', 'wind_direction', 'air_temperature_std_lag73']
list_variables = (
    df_eq
    .drop(columns=dropped_columns)
    .select_dtypes(include=['float', 'int'])
    .columns
    .tolist()
)

In [11]:
# Separate the balanced frame into the `anomaly` label and the selected input columns.
target = df_eq['anomaly']
features = df_eq.drop(columns=['anomaly'])[list_variables]

In [12]:
# Data split for training and validation data.
# Rows are assigned by building_id modulo 5: a remainder below 4 goes to
# training and a remainder of exactly 4 goes to validation, so all rows of a
# given building land on the same side of the split.
building_fold = features['building_id'] % 5
train_rows = building_fold < 4
val_rows = building_fold == 4

X_train, Y_train = features[train_rows], target[train_rows]
X_val, Y_val = features[val_rows], target[val_rows]

print(X_train.shape, X_val.shape)

(119520, 169) (29664, 169)


In [13]:
# Normalization: fit the scaler on the training rows only, then apply that
# same fitted transform to the validation and test inputs.
# X_train and X_val are rebound to the scaler's output from here on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
test = scaler.transform(test_features[list_variables])

In [14]:
# XGBoost modeling
xgb_model = XGBClassifier(n_estimators=100)
xgb_model.fit(X_train, Y_train)

# Column 1 of predict_proba is the predicted probability of the anomaly class.
pred_train_xgb = xgb_model.predict_proba(X_train)[:, 1]
pred_val_xgb = xgb_model.predict_proba(X_val)[:, 1]

# These scores are ROC AUC values computed from the predicted probabilities,
# not classification accuracy, so the printed labels say so.
score_train_xgb = metrics.roc_auc_score(Y_train, pred_train_xgb)
score_val_xgb = metrics.roc_auc_score(Y_val, pred_val_xgb)

print('Training ROC AUC : ', score_train_xgb)
print('Validation ROC AUC : ', score_val_xgb)

Training Accuracy :  0.9999720737697113
Validation Accuracy :  0.9809208046345569


In [15]:
# HistGradientBoosting modeling
# random_state is fixed because, with the default early_stopping='auto', the
# estimator holds out a random validation split on data sets larger than
# 10,000 samples; without a seed the scores change from run to run.
hist_model = HistGradientBoostingClassifier(random_state=0)

# Missing values are replaced with 0 before fitting and predicting; compute
# the filled arrays once and reuse them.
X_train_filled = np.nan_to_num(X_train)
X_val_filled = np.nan_to_num(X_val)
hist_model.fit(X_train_filled, Y_train)

# Column 1 of predict_proba is the predicted probability of the anomaly class.
pred_train_hist = hist_model.predict_proba(X_train_filled)[:, 1]
pred_val_hist = hist_model.predict_proba(X_val_filled)[:, 1]

# These scores are ROC AUC values computed from the predicted probabilities,
# not classification accuracy, so the printed labels say so.
score_train_hist = metrics.roc_auc_score(Y_train, pred_train_hist)
score_val_hist = metrics.roc_auc_score(Y_val, pred_val_hist)

print('Training ROC AUC : ', score_train_hist)
print('Validation ROC AUC : ', score_val_hist)

Training Accuracy :  0.9976287488860971
Validation Accuracy :  0.9798740037491442


In [16]:
# CatBoost modeling
cat_model = CatBoostClassifier()
# silent=True turns off CatBoost's training log output.
cat_model.fit(X_train, Y_train, silent=True)

# Column 1 of predict_proba is the predicted probability of the anomaly class.
pred_train_cat = cat_model.predict_proba(X_train)[:, 1]
pred_val_cat = cat_model.predict_proba(X_val)[:, 1]

# These scores are ROC AUC values computed from the predicted probabilities,
# not classification accuracy, so the printed labels say so.
score_train_cat = metrics.roc_auc_score(Y_train, pred_train_cat)
score_val_cat = metrics.roc_auc_score(Y_val, pred_val_cat)

print('Training ROC AUC : ', score_train_cat)
print('Validation ROC AUC : ', score_val_cat)

Training Accuracy :  0.9999149459520995
Validation Accuracy :  0.9797653181100545


In [17]:
# LightGBM modeling
lgb_model = lgb.LGBMClassifier(n_estimators=100)
lgb_model.fit(X_train, Y_train)

# Column 1 of predict_proba is the predicted probability of the anomaly class.
pred_train_lgb = lgb_model.predict_proba(X_train)[:, 1]
pred_val_lgb = lgb_model.predict_proba(X_val)[:, 1]

# These scores are ROC AUC values computed from the predicted probabilities,
# not classification accuracy, so the printed labels say so.
score_train_lgb = metrics.roc_auc_score(Y_train, pred_train_lgb)
score_val_lgb = metrics.roc_auc_score(Y_val, pred_val_lgb)

print('Training ROC AUC : ', score_train_lgb)
print('Validation ROC AUC : ', score_val_lgb)

Training Accuracy :  0.9980787846884552
Validation Accuracy :  0.9803649297101507


In [19]:
# Model ensembling: unweighted mean of the four models' anomaly probabilities.
ensemble_train = (pred_train_xgb + pred_train_cat + pred_train_lgb + pred_train_hist) / 4
ensemble_val = (pred_val_xgb + pred_val_cat + pred_val_lgb + pred_val_hist) / 4

# These scores are ROC AUC values computed from the blended probabilities,
# not classification accuracy, so the printed labels say so.
score_train_ensemble = metrics.roc_auc_score(Y_train, ensemble_train)
score_val_ensemble = metrics.roc_auc_score(Y_val, ensemble_val)

print('Training ROC AUC : ', score_train_ensemble)
print('Validation ROC AUC : ', score_val_ensemble)

Training Accuracy :  0.9997815543603225
Validation Accuracy :  0.9824927554297033


In [23]:
# Refit every model on all rows of the balanced set (training + validation)
# before predicting on the test set.
# No earlier cell defines X_all, so a fresh "Restart & Run All" would raise a
# NameError here. Build it with the already-fitted scaler so the refit data is
# scaled exactly like `test`.
X_all = scaler.transform(features)

lgb_model.fit(X_all, target)
xgb_model.fit(X_all, target)
cat_model.fit(X_all, target, silent=True)
# The HistGradientBoosting model is trained and queried on NaN-filled inputs,
# matching how it was fitted during validation.
hist_model.fit(np.nan_to_num(X_all), target)

# Unweighted blend: each model contributes a quarter of its anomaly probability.
predictions = (
    xgb_model.predict_proba(test)[:, 1] / 4
    + lgb_model.predict_proba(test)[:, 1] / 4
    + cat_model.predict_proba(test)[:, 1] / 4
    + hist_model.predict_proba(np.nan_to_num(test))[:, 1] / 4
)

In [24]:
# Build the submission: start from the sample file and fill in the blended probabilities.
ss = pd.read_csv('../input/energy-anomaly-detection/sample_submission.csv')
ss['anomaly'] = predictions

# Post-processing overrides, applied in this order (a later rule wins over an earlier one).
# The row masks come from test_features and are applied to ss through .loc;
# this assumes both frames share the same row index -- TODO confirm.
reading_is_one = test_features['meter_reading'] == 1.0
ss.loc[reading_is_one, 'anomaly'] = 1

day = test_features['dayofyear']
building = test_features['building_id']
# dayofyear is day + hour/24, so a value of exactly 1 is hour 0 of day 1.
first_hour_other_buildings = (day == 1) & ((building > 145) | (building < 105))
ss.loc[first_hour_other_buildings, 'anomaly'] = 0

# Values above 366.9583 correspond to hour 23 of day 366.
ss.loc[day > 366.9583, 'anomaly'] = 0

ss.to_csv('Submission_lgb_xgb.csv', index=False)
ss

Unnamed: 0,row_id,anomaly
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
...,...,...
1800562,1800562,0.0
1800563,1800563,0.0
1800564,1800564,0.0
1800565,1800565,0.0
