In [1]:
%cd ../..

/home/batman/kaggle/kaggle-ashrae-DSR


In [2]:
from pathlib import Path
Path.cwd()

PosixPath('/home/batman/kaggle/kaggle-ashrae-DSR')

In [3]:
# Location of the project's utility package, relative to the current working directory
# (the `%cd ../..` cell above is what moves the kernel there).
PATH_UTILS_PACKAGE = Path("./src/utils")
# Fail fast with a readable message when the notebook is started from the wrong folder;
# the `src.utils` imports further down would otherwise fail less clearly.
assert PATH_UTILS_PACKAGE.exists(), "Can't find {}".format(PATH_UTILS_PACKAGE)

## LOGGING

In [4]:
# Logging
# =============================================================================
import datetime
import logging
import sys

# Record layout and timestamp layout shared by every message.
FORMAT = "%(asctime)s : %(message)s"
DATE_FMT = "%Y-%m-%d %H:%M:%S"
formatter = logging.Formatter(fmt=FORMAT, datefmt=DATE_FMT)

# A single stderr handler carrying the formatter above.
handler = logging.StreamHandler(stream=sys.stderr)
handler.setFormatter(formatter)

# Root logger: INFO level and *only* our handler. Replacing the handler list
# (rather than appending) keeps re-runs of this cell from duplicating output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

logging.info("Logging started")
logging.info("Kernel started {}".format(datetime.datetime.now()))



2019-11-24 13:47:38 : Logging started
2019-11-24 13:47:38 : Kernel started 2019-11-24 13:47:38.974256


## IMPORTS

In [7]:
import os
from pathlib import Path

In [8]:
import src.utils.ashrae_transformers as trfs

In [None]:
# FIXME: this import was left unfinished (`from src.utils import` with no names), which is
# a SyntaxError and stops "Run All" at this cell. It is disabled until the intended names
# are filled in; the `src.utils` modules used by this notebook are imported in the next cell.

In [6]:
# Our local imports
import src.utils.ashrae_transformers as trfs
from src.utils.utility_classes import Map

In [9]:
dir(trfs)

['BaseEstimator',
 'FeatureSelector',
 'TemporalTransformer',
 'TransformerMixin',
 'TypeSelector',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__']

In [None]:
# Standard imports
# =============================================================================
import os
from pathlib import Path
import sys
import zipfile
import gc
import time
from pprint import pprint
from functools import reduce
from collections import defaultdict
import json
import yaml
import inspect
import gc
import random

In [None]:
# Basic imports
# =============================================================================
import tqdm

In [None]:
# ML imports
# =============================================================================
import numpy as np
print('numpy {} as np'.format(np.__version__))
import pandas as pd
print('pandas {} as pd'.format(pd.__version__))
from sklearn_pandas import DataFrameMapper
import sklearn as sk
print('sklearn {} as sk'.format(sk.__version__))

import sklearn.preprocessing
import sklearn.metrics
import sklearn.linear_model
import sklearn.pipeline
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_extraction
import sklearn.decomposition
import sklearn.compose
import sklearn.utils

# Models
import lightgbm as lgb
print("lightgbm", lgb.__version__)
import xgboost as xgb
print("xgboost", xgb.__version__)
# from catboost import CatBoostClassifier
import catboost as catb
print("catboost", catb.__version__)

In [None]:
# Plotting
# =============================================================================
import matplotlib as mpl
print('matplotlib {} as mpl'.format(mpl.__version__))
import matplotlib.pyplot as plt
print('matplotlib.pyplot as plt'.format())
import seaborn as sns
print('seaboarn {} as sns'.format(sns.__version__))

## SETTINGS

In [None]:
# Notebook-wide configuration, grouped into three namespaces:
#   SETTINGS.data    - where inputs are read from / outputs are written to
#   SETTINGS.model   - cross-validation and boosting parameters
#   SETTINGS.control - run-mode switches
SETTINGS = Map()
SETTINGS.data = Map()
SETTINGS.model = Map()

# Both paths are anchored at the current working directory.
SETTINGS.data.path_data_root = Path.cwd().joinpath('data', 'feather')
SETTINGS.data.use_ucf = True
SETTINGS.data.path_output = Path.cwd().joinpath('output')

SETTINGS.model.folds = 5          # n_splits of the KFold splitter
SETTINGS.model.num_rounds = 1000  # boosting rounds handed to the LightGBM training calls

SETTINGS.control = Map()
SETTINGS.control.debug = False    # True -> each training loop stops after its first fold

logging.info("Settings:")
pprint(SETTINGS)

## LOAD DATA

In [None]:
logging.info(" *** Step 2: Load data *** ")

# Every input is a feather file located directly under SETTINGS.data.path_data_root.

# Train -- the log line also reports distinct buildings and in-memory size (MB).
train_df = pd.read_feather(SETTINGS.data.path_data_root.joinpath('train.feather'))
logging.info(
    "Loaded: train_df {} with {} buildings, {:0.1f} MB".format(
        train_df.shape,
        train_df['building_id'].nunique(),
        train_df.memory_usage().sum() / 1024 ** 2,
    )
)
r1 = train_df.head()

# Test -- same summary as for train.
test_df = pd.read_feather(SETTINGS.data.path_data_root.joinpath('test.feather'))
logging.info(
    "Loaded: test_df {} with {} buildings, {:0.1f} MB".format(
        test_df.shape,
        test_df['building_id'].nunique(),
        test_df.memory_usage().sum() / 1024 ** 2,
    )
)
r2 = test_df.head()

# Weather train
weather_train_df = pd.read_feather(SETTINGS.data.path_data_root.joinpath('weather_train.feather'))
logging.info("Loaded: weather_train_df {}".format(weather_train_df.shape))
r = weather_train_df.head()

# Weather test
weather_test_df = pd.read_feather(SETTINGS.data.path_data_root.joinpath('weather_test.feather'))
logging.info("Loaded: weather_test_df {}".format(weather_test_df.shape))

# Meta -- re-indexed by building_id once loaded (the column itself is dropped).
building_meta_df = pd.read_feather(SETTINGS.data.path_data_root.joinpath('building_metadata.feather'))
logging.info("Loaded: building_meta_df {}".format(building_meta_df.shape))
r = building_meta_df.head()
building_meta_df = building_meta_df.set_index('building_id', drop=True)

# Sample
# sample_submission = pd.read_feather(os.path.join(SETTINGS.data.path_data_root, 'sample_submission.feather'))
# logging.info("Loaded: sample_submission {}".format(sample_submission.shape))
# sample_submission = reduce_mem_usage(sample_submission)

In [None]:
# train_merge = train_df_data.merge(building_meta_df, on='building_id', how='left')
# train_merge.memory_usage().sum() / 1024 ** 2
# test_merge = test_df_data.merge(building_meta_df, on='building_id', how='left')
# test_merge.memory_usage().sum() / 1024 ** 2
#
# train_df = train_merge.merge(weather_train_df, on=['site_id', 'timestamp'], how='left')
# train_df.memory_usage().sum() / 1024 ** 2
# r = train_df.head()
#
# test_df = test_merge.merge(weather_test_df, on=['site_id', 'timestamp'], how='left')
# test_df.memory_usage().sum() / 1024 ** 2
# r = test_df.head()

# train2.info()

# del

In [None]:
def preprocess_date_time_cols(df):
    """Add calendar columns derived from ``df['timestamp']`` to ``df`` in place.

    Columns written, in this order: ``date``, ``hour``, ``weekend``, ``month``,
    ``dayofweek``. One INFO message is logged after each column is added.

    Args:
        df: DataFrame with a datetime-like ``timestamp`` column.

    Returns:
        The same ``df`` object that was passed in (mutated).
    """
    # (new column, attribute of the `.dt` accessor it is read from)
    # NOTE(review): 'weekend' is filled from `.dt.weekday`, i.e. it holds the day number
    # and not a Saturday/Sunday flag -- confirm that this is intended.
    derived_columns = (
        ('date', 'date'),
        ('hour', 'hour'),
        ('weekend', 'weekday'),
        ('month', 'month'),
        ('dayofweek', 'dayofweek'),
    )
    for column, accessor_attr in derived_columns:
        df[column] = getattr(df['timestamp'].dt, accessor_attr)
        logging.info("Added {} column".format(column))

    return df

In [None]:
logging.info("Adding basic time features to train and test")
# preprocess_date_time_cols adds the columns to the frame it is given and returns it,
# so the rebinding below simply keeps the names pointing at the enriched frames.
train_df, test_df = preprocess_date_time_cols(train_df), preprocess_date_time_cols(test_df)

In [None]:
# The model is trained on log1p(meter_reading) = ln(1 + x), which is defined for zero
# readings; predictions are mapped back with expm1 when the submission is filled.
train_df['meter_reading_log1p'] = np.log1p(train_df['meter_reading'])

# The test frame is what the predictions are generated for, so it normally carries no
# 'meter_reading' column; transform it only when present instead of raising a KeyError.
if 'meter_reading' in test_df.columns:
    test_df['meter_reading_log1p'] = np.log1p(test_df['meter_reading'])

logging.info("Added meter_reading_log1p [ln(1+x)] column")

In [None]:
# Disabled block (`if 0:` never executes), kept for reference: test-set feature
# engineering, weather interpolation, memory reduction and a training-row filter.
# NOTE(review): re-enabling it at this position would fail -- `building_median` is only
# computed in a later cell, and `reduce_mem_usage` is not defined or imported anywhere
# in the visible notebook.
if 0:
    # test_df['building_mean'] = test_df['building_id'].map(building_mean)
    test_df['building_median'] = test_df['building_id'].map(building_median)
    # test_df['building_min'] = test_df['building_id'].map(building_min)
    # test_df['building_max'] = test_df['building_id'].map(building_max)
    # test_df['building_std'] = test_df['building_id'].map(building_std)

    print('preprocessing weather...')
    # Per-site interpolation of missing weather values, in both directions.
    weather_test_df = weather_test_df.groupby('site_id').apply(lambda group: group.interpolate(limit_direction='both'))
    # Remaining NaN counts per site (the result is not stored or displayed here).
    weather_test_df.groupby('site_id').apply(lambda group: group.isna().sum())

    # add_lag_feature(weather_test_df, window=3)
    # add_lag_feature(weather_test_df, window=72)

    print('reduce mem usage...')
    reduce_mem_usage(test_df, use_float16=True)
    reduce_mem_usage(weather_test_df, use_float16=True)

    gc.collect()



    logging.info("Removing building site=0".format())
    # Bare expression: selects the site 0 metadata rows but does nothing with them.
    building_meta_df[building_meta_df.site_id == 0]
    # Drops meter-0 rows of buildings with id <= 104 up to 2016-05-20.
    train_df = train_df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')


# Runs unconditionally: marks the start of the feature-engineering step in the log.
logging.info(" *** Step 3: Feature engineering *** ".format())

# Data preprocessing

Now let's build a GBDT (Gradient Boosted Decision Tree) model to predict `meter_reading_log1p`. I will use LightGBM in this notebook.

# Add time feature

Some features introduced in https://www.kaggle.com/ryches/simple-lgbm-solution by @ryches

Features that are likely predictive:

#### Weather

- time of day
- holiday
- weekend
- cloud_coverage + lags
- dew_temperature + lags
- precip_depth + lags
- sea_level_pressure + lags
- wind_direction + lags
- wind_speed + lags

#### Train

- max, mean, min, std of the specific building historically



However, we should be careful about adding time features: since we have only one year of training data,
including `date` would make the model overfit to the training data.

What about `month`? It would be better to check its effect on performance with cross-validation.
To keep the model robust, I am not using it in this kernel.



In [None]:
# print(SETTINGS)
# sort train. i dont know it is best
# if use_ucf:
#     train_df = train_df.sort_values('month')
#     train_df = train_df.reset_index()

In [None]:
logging.info("Adding building_median log1p feature")

# Per-building median of the log1p target, stored as float16.
# (The analogous mean / min / max / std features are not computed.)
building_median = (
    train_df
    .groupby('building_id')['meter_reading_log1p']
    .median()
    .astype(np.float16)
)

# Broadcast the per-building value back onto every training row.
train_df['building_median'] = train_df['building_id'].map(building_median)

logging.info("Added building_median log1p feature")

In [None]:
#building_mean.head()

# Fill Nan value in weather dataframe by interpolation


The weather data has a lot of NaNs!

I tried to fill these values by **interpolating** the data.

In [None]:
# weather_train_df.head()

In [None]:
# weather_train_df.describe()

In [None]:
# weather_train_df.isna().sum()

In [None]:
# weather_train_df.shape

In [None]:
logging.info("Interpolating over NaNs in weather_train_df")
# weather_train_df.groupby('site_id').apply(lambda group: group.isna().sum())

# Interpolate within each site so values never bleed from one site into another;
# limit_direction='both' also fills NaNs at the start and end of a site's series.
# group_keys=False keeps the original row index: without it, newer pandas versions
# prepend 'site_id' as an index level, which then clashes with the 'site_id' column
# when this frame is merged on it in create_X_y.
weather_train_df = weather_train_df.groupby('site_id', group_keys=False).apply(
    lambda group: group.interpolate(limit_direction='both')
)

In [None]:
# weather_train_df.groupby('site_id').apply(lambda group: group.isna().sum())

It seems the number of NaNs has been reduced by `interpolate`, but some properties never appear for specific `site_id`s, so NaNs remain for those features.

## lags

Adding some lag features.

In [None]:
def add_lag_feature(weather_df, window=3):
    """Add rolling-window weather statistics to ``weather_df`` in place.

    For every weather column, a trailing window of ``window`` rows is taken within
    each ``site_id`` and its mean / max / min / std are written to new float16
    columns named ``{col}_{stat}_lag{window}``.

    The statistics are aligned to ``weather_df`` by its own row labels, so the frame
    does not need to be sorted by site or carry a fresh 0..n-1 index. (Previously
    the result was renumbered in group-sorted order and then assigned, which
    silently misplaced values for any other row order.)

    Args:
        weather_df: DataFrame with a ``site_id`` column and the seven weather columns
            listed in ``cols``; row labels are expected to be unique.
        window: window length in rows. ``min_periods=0`` means shorter windows at the
            start of a site are still evaluated (std over a single row is NaN).

    Returns:
        None. ``weather_df`` is modified in place.
    """
    cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
    rolled = weather_df.groupby('site_id')[cols].rolling(window=window, min_periods=0)

    def _aligned(stat_frame):
        # Rolling on a groupby yields a (site_id, original label) MultiIndex; dropping
        # the site level restores the original row labels for label-based assignment.
        return stat_frame.reset_index(level=0, drop=True).astype(np.float16)

    lag_mean = _aligned(rolled.mean())
    lag_max = _aligned(rolled.max())
    lag_min = _aligned(rolled.min())
    lag_std = _aligned(rolled.std())

    for col in cols:
        weather_df[f'{col}_mean_lag{window}'] = lag_mean[col]
        weather_df[f'{col}_max_lag{window}'] = lag_max[col]
        weather_df[f'{col}_min_lag{window}'] = lag_min[col]
        weather_df[f'{col}_std_lag{window}'] = lag_std[col]

In [None]:
# skip lag feature to save memory
#add_lag_feature(weather_train_df, window=3)
#add_lag_feature(weather_train_df, window=72)

In [None]:
weather_train_df.head()

In [None]:
# Replace the primary_use labels by small integer codes (numbered in order of first
# appearance) to reduce memory on merge.
logging.info("Convert primary_use to Categorical type")
primary_use_list = building_meta_df['primary_use'].unique()
primary_use_dict = dict(zip(primary_use_list, range(len(primary_use_list))))
print('primary_use_dict: ', primary_use_dict)
building_meta_df['primary_use'] = building_meta_df['primary_use'].map(primary_use_dict)
gc.collect()

In [None]:
# Shrink the three frames used for training before they are merged.
# NOTE(review): `reduce_mem_usage` is neither defined nor imported in the visible notebook
# cells, so on a fresh kernel this cell raises NameError -- confirm where it should come
# from (presumably a helper that downcasts column dtypes; `use_float16` suggests so).
reduce_mem_usage(train_df, use_float16=True)
reduce_mem_usage(building_meta_df, use_float16=True)
reduce_mem_usage(weather_train_df, use_float16=True)

In [None]:
building_meta_df.head()

In [None]:

# Columns handed to LightGBM as categorical features ('meter' is left out because one
# model is trained per meter type).
category_cols = [
    'building_id',
    'site_id',
    'primary_use',
]

# Numeric features; create_X_y / create_X select `feature_cols + category_cols`.
feature_cols = [
    # building metadata
    'square_feet',
    'year_built',
    # time features ('month' and 'dayofweek' are computed but not used)
    'hour',
    'weekend',
    # per-building target statistic
    'building_median',
    # weather
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]


#     'air_temperature_mean_lag72',
#     'air_temperature_max_lag72', 'air_temperature_min_lag72',
#     'air_temperature_std_lag72', 'cloud_coverage_mean_lag72',
#     'dew_temperature_mean_lag72', 'precip_depth_1_hr_mean_lag72',
#     'sea_level_pressure_mean_lag72', 'wind_direction_mean_lag72',
#     'wind_speed_mean_lag72', 'air_temperature_mean_lag3',
#     'air_temperature_max_lag3',
#     'air_temperature_min_lag3', 'cloud_coverage_mean_lag3',
#     'dew_temperature_mean_lag3',
#     'precip_depth_1_hr_mean_lag3', 'sea_level_pressure_mean_lag3',
#     'wind_direction_mean_lag3', 'wind_speed_mean_lag3']

# Train model

To win a Kaggle competition, how you evaluate your model is important.
What kind of cross-validation strategy is suitable for this competition? This is time-series data, so it is better to consider time-based splitting.

However, this notebook is a simple tutorial, so I will proceed with KFold splitting without shuffling, so that at least near-term data is not included in validation.

In [None]:
def create_X_y(train_df, target_meter):
    """Build the training matrix and target vector for one meter type.

    Rows of ``train_df`` whose ``meter`` equals ``target_meter`` are left-joined with
    the module-level ``building_meta_df`` (on ``building_id``) and ``weather_train_df``
    (on ``site_id`` and ``timestamp``).

    Args:
        train_df: training rows with at least ``meter``, ``building_id``,
            ``timestamp`` and ``meter_reading_log1p``.
        target_meter: meter type to keep.

    Returns:
        ``(X_train, y_train)``: a DataFrame restricted to
        ``feature_cols + category_cols`` and the ``meter_reading_log1p`` values as
        an array, both in the row order of the filtered ``train_df``.
    """
    meter_rows = train_df.loc[train_df['meter'] == target_meter]
    merged = (
        meter_rows
        .merge(building_meta_df, on='building_id', how='left')
        .merge(weather_train_df, on=['site_id', 'timestamp'], how='left')
    )
    return merged[feature_cols + category_cols], merged['meter_reading_log1p'].values

In [None]:
# folds = 5
seed = 666
shuffle = False
# Unshuffled KFold keeps each validation fold a contiguous block of rows.
# random_state only has an effect when shuffle=True, and recent scikit-learn releases
# raise a ValueError when it is set while shuffle=False, so pass it only when shuffling.
kf = sk.model_selection.KFold(
    n_splits=SETTINGS.model.folds,
    shuffle=shuffle,
    random_state=seed if shuffle else None,
)
# Running sum of (per-meter MSE * number of rows); normalised in the OOF score cell.
oof_total = 0

# Train model by each meter type

In [None]:
# --- Meter 0: build the training matrix and the out-of-fold prediction buffer ---
target_meter = 0
X_train, y_train = create_X_y(train_df, target_meter=target_meter)
y_valid_pred_total = np.zeros(len(X_train))  # filled fold by fold in the training loop
gc.collect()
print('target_meter', target_meter, X_train.shape)

# Positional indices of the categorical columns (printed for reference; the training
# loop passes the column names to fit_lgbm instead).
cat_features = list(map(X_train.columns.get_loc, category_cols))
print('cat_features', cat_features)

logging.info(" *** Step 4: Build model *** ")

In [None]:
def fit_lgbm(train, val, devices=(-1,), seed=None, cat_features=None, num_rounds=1500, lr=0.1, bf=0.1):
    """Train a LightGBM regressor with early stopping and predict the validation set.

    Args:
        train: ``(X_train, y_train)`` pair used for fitting.
        val: ``(X_valid, y_valid)`` pair used for early stopping and for the returned
            predictions.
        devices: sequence of device ids; only the first entry is read. ``-1`` trains
            on CPU, any other value is passed to LightGBM as ``gpu_device_id``.
        seed: value stored under the ``seed`` training parameter.
        cat_features: forwarded as ``categorical_feature`` to both datasets.
        num_rounds: maximum number of boosting rounds. (This argument used to be
            ignored in favour of the global ``SETTINGS.model.num_rounds``.)
        lr: learning rate.
        bf: bagging fraction (bagging is performed every 5 iterations).

    Returns:
        ``(model, y_pred_valid, log)``: the trained booster, its predictions for
        ``X_valid`` at the best iteration, and a dict with the best training and
        validation scores.
    """
    X_train, y_train = train
    X_valid, y_valid = val
    metric = 'l2'
    params = {'num_leaves': 31,
              'objective': 'regression',
              #               'max_depth': -1,
              'learning_rate': lr,
              "boosting": "gbdt",
              "bagging_freq": 5,
              "bagging_fraction": bf,
              "feature_fraction": 0.9,
              "metric": metric,
              #               "verbosity": -1,
              #               'reg_alpha': 0.1,
              #               'reg_lambda': 0.3
              }

    # Only the first requested device is used; -1 keeps LightGBM's default (CPU).
    device = devices[0]
    if device != -1:
        print(f'using gpu device_id {device}...')
        params.update({'device': 'gpu', 'gpu_device_id': device})

    params['seed'] = seed

    early_stop = 20    # stop after this many rounds without validation improvement
    verbose_eval = 20  # print the evaluation metrics every 20 rounds

    d_train = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
    d_valid = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_features)
    # With default names the two sets are reported as 'training' and 'valid_1'.
    watchlist = [d_train, d_valid]

    print('training LGB:')
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword arguments of older
    # LightGBM releases (4.0 replaced them with callbacks) -- adjust when upgrading.
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)

    # predictions
    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)

    print('best_score', model.best_score)
    # The keys say 'mae' for backward compatibility, but the values are the `metric`
    # ('l2', i.e. mean squared error) scores.
    log = {'train/mae': model.best_score['training'][metric],
           'valid/mae': model.best_score['valid_1'][metric]}
    return model, y_pred_valid, log

In [None]:
# One model per fold for meter 0; each fold's validation predictions are written into
# their slots of y_valid_pred_total.
models0 = []
for train_idx, valid_idx in kf.split(X_train, y_train):
    train_data = (X_train.iloc[train_idx], y_train[train_idx])
    valid_data = (X_train.iloc[valid_idx], y_train[valid_idx])
    print('train', len(train_idx), 'valid', len(valid_idx))

    model, y_pred_valid, log = fit_lgbm(
        train_data,
        valid_data,
        cat_features=category_cols,
        num_rounds=SETTINGS.model.num_rounds,
        lr=0.05,
        bf=0.7,
    )
    y_valid_pred_total[valid_idx] = y_pred_valid
    models0.append(model)
    gc.collect()

    # Debug runs stop after the first fold.
    if SETTINGS.control.debug:
        break

In [None]:
# Overlay the distributions of the targets and the out-of-fold predictions.
sns.distplot(y_train)
sns.distplot(y_valid_pred_total)

# Mean squared error on the log1p scale; weighted by the row count so the per-meter
# scores can be combined into an overall score later.
oof0 = sk.metrics.mean_squared_error(y_train, y_valid_pred_total)
oof_total = oof_total + oof0 * len(y_train)

del X_train, y_train
gc.collect()

In [None]:
def plot_feature_importance(model):
    """Draw a horizontal bar chart of ``model.feature_importance()``.

    The values are labelled with ``feature_cols + category_cols`` (the column order
    used to build the training matrices) and sorted by ascending importance.

    Args:
        model: object exposing ``feature_importance()``.
    """
    labels = feature_cols + category_cols
    importance_df = pd.DataFrame(
        model.feature_importance(),
        index=labels,
        columns=['importance'],
    )
    importance_df = importance_df.sort_values('importance')

    fig, ax = plt.subplots(figsize=(8, 8))
    importance_df.plot.barh(ax=ax)
    fig.show()

In [None]:
# --- Meter 1: build the training matrix, train one model per fold, score out-of-fold ---
target_meter = 1
X_train, y_train = create_X_y(train_df, target_meter=target_meter)
y_valid_pred_total = np.zeros(len(X_train))  # out-of-fold predictions, filled per fold
gc.collect()
print('target_meter', target_meter, X_train.shape)

# Positional indices of the categorical columns (printed for reference only;
# fit_lgbm is given the column names).
cat_features = list(map(X_train.columns.get_loc, category_cols))
print('cat_features', cat_features)

models1 = []
for train_idx, valid_idx in kf.split(X_train, y_train):
    train_data = (X_train.iloc[train_idx], y_train[train_idx])
    valid_data = (X_train.iloc[valid_idx], y_train[valid_idx])
    print('train', len(train_idx), 'valid', len(valid_idx))

    model, y_pred_valid, log = fit_lgbm(
        train_data,
        valid_data,
        cat_features=category_cols,
        num_rounds=SETTINGS.model.num_rounds,
        lr=0.05,
        bf=0.5,
    )
    y_valid_pred_total[valid_idx] = y_pred_valid
    models1.append(model)
    gc.collect()

    # Debug runs stop after the first fold.
    if SETTINGS.control.debug:
        break

# Overlay the distributions of the targets and the out-of-fold predictions.
sns.distplot(y_train)
sns.distplot(y_valid_pred_total)

# MSE on the log1p scale, weighted by row count for the overall score.
oof1 = sk.metrics.mean_squared_error(y_train, y_valid_pred_total)
oof_total = oof_total + oof1 * len(y_train)

del X_train, y_train
gc.collect()

In [None]:
# --- Meter 2: build the training matrix, train one model per fold, score out-of-fold ---
target_meter = 2
X_train, y_train = create_X_y(train_df, target_meter=target_meter)
y_valid_pred_total = np.zeros(len(X_train))  # out-of-fold predictions, filled per fold

gc.collect()
print('target_meter', target_meter, X_train.shape)

# Positional indices of the categorical columns (printed for reference only;
# fit_lgbm is given the column names).
cat_features = list(map(X_train.columns.get_loc, category_cols))
print('cat_features', cat_features)

models2 = []
for train_idx, valid_idx in kf.split(X_train, y_train):
    train_data = (X_train.iloc[train_idx], y_train[train_idx])
    valid_data = (X_train.iloc[valid_idx], y_train[valid_idx])
    print('train', len(train_idx), 'valid', len(valid_idx))

    model, y_pred_valid, log = fit_lgbm(
        train_data,
        valid_data,
        cat_features=category_cols,
        num_rounds=SETTINGS.model.num_rounds,
        lr=0.05,
        bf=0.8,
    )
    y_valid_pred_total[valid_idx] = y_pred_valid
    models2.append(model)
    gc.collect()

    # Debug runs stop after the first fold.
    if SETTINGS.control.debug:
        break

# Overlay the distributions of the targets and the out-of-fold predictions.
sns.distplot(y_train)
sns.distplot(y_valid_pred_total)

# MSE on the log1p scale, weighted by row count for the overall score.
oof2 = sk.metrics.mean_squared_error(y_train, y_valid_pred_total)
oof_total = oof_total + oof2 * len(y_train)

del X_train, y_train
gc.collect()

In [None]:
# --- Meter 3: build the training matrix, train one model per fold, score out-of-fold ---
target_meter = 3
X_train, y_train = create_X_y(train_df, target_meter=target_meter)
y_valid_pred_total = np.zeros(len(X_train))  # out-of-fold predictions, filled per fold

gc.collect()
print('target_meter', target_meter, X_train.shape)

# Positional indices of the categorical columns (printed for reference only;
# fit_lgbm is given the column names).
cat_features = list(map(X_train.columns.get_loc, category_cols))
print('cat_features', cat_features)

models3 = []
for train_idx, valid_idx in kf.split(X_train, y_train):
    train_data = (X_train.iloc[train_idx], y_train[train_idx])
    valid_data = (X_train.iloc[valid_idx], y_train[valid_idx])
    print('train', len(train_idx), 'valid', len(valid_idx))

    model, y_pred_valid, log = fit_lgbm(
        train_data,
        valid_data,
        cat_features=category_cols,
        num_rounds=SETTINGS.model.num_rounds,
        lr=0.03,
        bf=0.9,
    )
    y_valid_pred_total[valid_idx] = y_pred_valid
    models3.append(model)
    gc.collect()

    # Debug runs stop after the first fold.
    if SETTINGS.control.debug:
        break

# Overlay the distributions of the targets and the out-of-fold predictions.
sns.distplot(y_train)
sns.distplot(y_valid_pred_total)

# MSE on the log1p scale, weighted by row count for the overall score.
oof3 = sk.metrics.mean_squared_error(y_train, y_valid_pred_total)
oof_total = oof_total + oof3 * len(y_train)

del X_train, y_train
gc.collect()

# OOF SCOREs

In [None]:
# oofN is the mean squared error of meter N's out-of-fold predictions against the log1p
# target, so its square root is a root-mean-squared error on the log1p scale.
print ('oof score meter0 =', np.sqrt(oof0))
print ('oof score meter1 =', np.sqrt(oof1))
print ('oof score meter2 =', np.sqrt(oof2))
print ('oof score meter3 =', np.sqrt(oof3))
# oof_total accumulated (MSE * row count) per meter; dividing by the number of training
# rows gives the pooled MSE before the square root is taken.
print ('oof score total  =', np.sqrt(oof_total / len(train_df)))

# Prediction on test data

In [None]:
# Free the training-only frames before test-time inference.
# building_meta_df is deliberately kept: create_X() still merges the test rows with it,
# and deleting it here made every create_X() call fail with a NameError.
del train_df, weather_train_df
gc.collect()

In [None]:
def create_X(test_df, target_meter):
    """Build the test feature matrix for one meter type.

    Rows of ``test_df`` whose ``meter`` equals ``target_meter`` are left-joined with
    the module-level ``building_meta_df`` (on ``building_id``) and ``weather_test_df``
    (on ``site_id`` and ``timestamp``).

    ``building_median`` is part of ``feature_cols`` but is only ever added to the
    training frame, so when the merged test rows lack it, it is filled in from the
    module-level ``building_median`` series (indexed by building id) instead of
    failing with a KeyError at column selection.

    Args:
        test_df: test rows with at least ``meter``, ``building_id`` and ``timestamp``.
        target_meter: meter type to keep.

    Returns:
        DataFrame restricted to ``feature_cols + category_cols``, in the row order of
        the filtered ``test_df``.
    """
    target_test_df = test_df[test_df['meter'] == target_meter]
    target_test_df = target_test_df.merge(building_meta_df, on='building_id', how='left')
    target_test_df = target_test_df.merge(weather_test_df, on=['site_id', 'timestamp'], how='left')

    # The merges produced a new frame, so adding a column here never touches test_df.
    if 'building_median' in feature_cols and 'building_median' not in target_test_df.columns:
        target_test_df['building_median'] = target_test_df['building_id'].map(building_median)

    X_test = target_test_df[feature_cols + category_cols]
    return X_test

In [None]:
def pred(X_test, models, batch_size=1000000):
    """Average the predictions of ``models`` over ``X_test``, predicting in row batches.

    Args:
        X_test: feature frame; sliced by row position into chunks of ``batch_size``.
        models: trained models exposing ``predict(X, num_iteration=...)`` and a
            ``best_iteration`` attribute.
        batch_size: maximum number of rows per ``predict`` call.

    Returns:
        1-D array with one value per row of ``X_test``: the mean of all models'
        predictions.
    """
    n_rows = X_test.shape[0]
    # Ceiling division: number of batches needed to cover every row.
    iterations = -(-n_rows // batch_size)
    print('iterations', iterations)

    y_test_pred_total = np.zeros(n_rows)
    for i, model in enumerate(models):
        print(f'predicting {i}-th model')
        for k in tqdm.tqdm(range(iterations)):
            start = k * batch_size
            stop = start + batch_size
            # Accumulate this model's predictions for the current batch of rows.
            y_test_pred_total[start:stop] += model.predict(
                X_test[start:stop], num_iteration=model.best_iteration
            )

    # Turn the per-model sum into a mean.
    y_test_pred_total /= len(models)
    return y_test_pred_total

In [None]:
%%time
# Meter 0: build the test features, then average the fold models' predictions.
X_test = create_X(test_df, target_meter=0)
gc.collect()

# y_test0 is still on the log1p scale; it is mapped back when the submission is filled.
y_test0 = pred(X_test, models0)

sns.distplot(y_test0)

# Release the feature matrix before the next meter is processed.
del X_test
gc.collect()

In [None]:
%%time
# Meter 1: build the test features, then average the fold models' predictions.
X_test = create_X(test_df, target_meter=1)
gc.collect()

# y_test1 is still on the log1p scale; it is mapped back when the submission is filled.
y_test1 = pred(X_test, models1)
sns.distplot(y_test1)

# Release the feature matrix before the next meter is processed.
del X_test
gc.collect()

In [None]:
%%time
# Meter 2: build the test features, then average the fold models' predictions.
X_test = create_X(test_df, target_meter=2)
gc.collect()

# y_test2 is still on the log1p scale; it is mapped back when the submission is filled.
y_test2 = pred(X_test, models2)
sns.distplot(y_test2)

# Release the feature matrix before the next meter is processed.
del X_test
gc.collect()

In [None]:
# Meter 3: build the test features, then average the fold models' predictions.
X_test = create_X(test_df, target_meter=3)
gc.collect()

# y_test3 is still on the log1p scale; it is mapped back when the submission is filled.
y_test3 = pred(X_test, models3)
sns.distplot(y_test3)

# Release the feature matrix now that the last meter has been predicted.
del X_test
gc.collect()

In [None]:
# The submission template is not loaded anywhere earlier in the notebook (its load in the
# data-loading cell is commented out), so using it here raised a NameError. Read it from
# the same data folder as the other inputs.
# NOTE(review): the file name follows the commented-out load -- confirm it exists there.
sample_submission = pd.read_feather(SETTINGS.data.path_data_root / 'sample_submission.feather')

# Predictions are on the log1p scale; expm1 maps them back to meter readings.
# The boolean masks come from test_df, so this assumes sample_submission has the same
# row order and index as test_df -- TODO confirm.
sample_submission.loc[test_df['meter'] == 0, 'meter_reading'] = np.expm1(y_test0)
sample_submission.loc[test_df['meter'] == 1, 'meter_reading'] = np.expm1(y_test1)
sample_submission.loc[test_df['meter'] == 2, 'meter_reading'] = np.expm1(y_test2)
sample_submission.loc[test_df['meter'] == 3, 'meter_reading'] = np.expm1(y_test3)

In [None]:
# to_csv does not create missing folders, so make sure the output directory exists.
SETTINGS.data.path_output.mkdir(parents=True, exist_ok=True)
sample_submission.to_csv(SETTINGS.data.path_output / 'submission.csv', index=False, float_format='%.4f')

# Replace to UCF data
# Disabled block (`if 0:` never executes), kept for reference: it overwrites predictions
# with values read from a 'site0.pkl' file and scores the model against them.
# NOTE(review): `ucf_root` is not defined anywhere in the visible notebook, so the block
# cannot run as-is. `pd.read_pickle` executes arbitrary code from the file -- only
# re-enable it for a pickle from a trusted source.
if 0:
    # %%
    # Running sum of (MSE on the log1p scale * row count) over all replaced series.
    leak_score = 0

    leak_df = pd.read_pickle(ucf_root / 'site0.pkl')
    leak_df['meter_reading'] = leak_df.meter_reading_scraped
    leak_df.drop(['meter_reading_original', 'meter_reading_scraped'], axis=1, inplace=True)
    leak_df.fillna(0, inplace=True)
    # Keep only rows after 2016.
    leak_df = leak_df[leak_df.timestamp.dt.year > 2016]
    leak_df.loc[leak_df.meter_reading < 0, 'meter_reading'] = 0  # remove large negative values

    # Clip negative predictions so that log1p below is well defined.
    sample_submission.loc[sample_submission.meter_reading < 0, 'meter_reading'] = 0

    for bid in leak_df.building_id.unique():
        temp_df = leak_df[(leak_df.building_id == bid)]
        for m in temp_df.meter.unique():
            # v0: current predictions, v1: values from leak_df, for this building and meter.
            v0 = sample_submission.loc[(test_df.building_id == bid) & (test_df.meter == m), 'meter_reading'].values
            v1 = temp_df[temp_df.meter == m].meter_reading.values

            leak_score += sk.metrics.mean_squared_error(np.log1p(v0), np.log1p(v1)) * len(v0)

            # Overwrite the predictions with the leak_df values.
            sample_submission.loc[(test_df.building_id == bid) & (test_df.meter == m), 'meter_reading'] = temp_df[
                temp_df.meter == m].meter_reading.values

    # %%
    if not SETTINGS.control.debug:
        sample_submission.to_csv('submission_ucf_replaced.csv', index=False, float_format='%.4f')

    # %%
    sample_submission.head()

    # %%
    np.log1p(sample_submission['meter_reading']).hist()

    # %% [markdown]
    # # UCF score

    # %%
    # Root of the pooled MSE (log1p scale) over all leak_df rows.
    print('UCF score = ', np.sqrt(leak_score / len(leak_df)))

    # %%
    # Feature importance of the second fold's model for each meter.
    plot_feature_importance(models0[1])

    # %%
    plot_feature_importance(models1[1])

    # %%
    plot_feature_importance(models2[1])

    # %%
    plot_feature_importance(models3[1])

    # %% [markdown]
    # # References
    #
    # These kernels inspired me to write this kernel, thank you for sharing!
    #
    #  - https://www.kaggle.com/rishabhiitbhu/ashrae-simple-eda
    #  - https://www.kaggle.com/isaienkov/simple-lightgbm
    #  - https://www.kaggle.com/ryches/simple-lgbm-solution