In [159]:
import sys
sys.path.insert(0, '../../')

%load_ext autoreload
%autoreload 2

from pathlib import Path
import numpy as np
import pandas as pd
from src.utils import build_pipeline
from src.utils import ashrae_transformers

%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [160]:
!ls ../../

Pipfile          README.md        [1m[36mnotebooks[m[m        [1m[36msrc[m[m
Pipfile.lock     [1m[36mdata[m[m             requirements.txt


In [161]:
train = pd.read_feather('../../data/feather/train_merged.feather')
test = pd.read_feather('../../data/feather/test_merged.feather')

In [162]:
# Site 0 remove missing meter readings from Site 0
train = train[~((train.site_id==0) & 
                (train.meter==0) & 
                (train.building_id <= 104) & 
                (train.timestamp < "2016-05-21"))]

In [163]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold, KFold
from tqdm import notebook as tqdm
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

In [164]:
X = train.drop(columns='meter_reading')
y = train['meter_reading'].map(np.log1p)

if the following error appears

```error: 'i' format requires -2147483648 <= number <= 2147483647```

probably joblib multiprocessing fails to pickle data and send between processes, which means you have to stay with n_jobs=1 (till a solution is available)

To change the n_jobs=1, do it the following way if you import the pipeline from another script

`build_pipeline.full_pipeline.n_jobs = 1`

In [30]:
NFOLDS = 10
folds  = KFold(n_splits=NFOLDS, shuffle=False, random_state=123)

oof_preds = np.zeros((len(train), 1))
test_preds = np.zeros((len(test), 1))

for fold_, (train_index, val_index) in enumerate(folds.split(X)):
    train_X = X.iloc[train_index]
    val_X = X.iloc[val_index]
    train_y = y.iloc[train_index]
    val_y = y.iloc[val_index]
    
    lgbm = LGBMRegressor(metric='rmse')
    
    model_pipeline = Pipeline(steps=[('full_pipeline', build_pipeline.full_pipeline), 
                                      ('model', lgbm)])
    print(f'Model {fold_} fitting...')
    model_pipeline.fit(train_X, train_y)
    print(f'Model {fold_} fitted.')
    val_pred = np.expm1(model_pipeline.predict(val_X))
    
    rmse = np.sqrt(mean_squared_error(val_y, val_pred))
    
    print(f'Fold: {fold_} , RMSE: {rmse}')
    
    test_fold_pred = np.expm1(model_pipeline.predict(test))
    
    oof_preds[val_index,:] = val_pred.reshape((-1, 1))
    
    test_preds += test_fold_pred.reshape((-1, 1))
    
test_preds /= NFOLDS

Model 0 fitting...
[FeatureUnion]  (step 1 of 4) Processing categorical_pipeline1, total=  32.7s
[FeatureUnion]  (step 2 of 4) Processing categorical_pipeline2, total=   0.2s
[FeatureUnion]  (step 3 of 4) Processing numerical_pipeline, total=  11.7s
[FeatureUnion]  (step 4 of 4) Processing temporal_pipeline, total=   8.4s
Model 0 fitted.
Fold: 0 , RMSE: 1270.929131607815
Model 1 fitting...
[FeatureUnion]  (step 1 of 4) Processing categorical_pipeline1, total=  30.7s
[FeatureUnion]  (step 2 of 4) Processing categorical_pipeline2, total=   0.2s
[FeatureUnion]  (step 3 of 4) Processing numerical_pipeline, total=  12.2s
[FeatureUnion]  (step 4 of 4) Processing temporal_pipeline, total=   8.5s
Model 1 fitted.
Fold: 1 , RMSE: 1407.3912480937154
Model 2 fitting...
[FeatureUnion]  (step 1 of 4) Processing categorical_pipeline1, total=  38.8s
[FeatureUnion]  (step 2 of 4) Processing categorical_pipeline2, total=   0.2s
[FeatureUnion]  (step 3 of 4) Processing numerical_pipeline, total=  13.1s
[

In [146]:
def metric(y_pred, y_true):
    return np.sqrt(np.mean(np.power(np.log(y_pred + 1) - np.log(y_true + 1), 2)))

In [33]:
metric(np.clip(a = np.squeeze(oof_preds), a_min=0, a_max=None), np.expm1(y).values)

1.3649332379397958

In [34]:
from datetime import datetime

In [71]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

p = Path('../../data/output/')

oof_prediction_file = p / f"oof_{timestamp}_{model_pipeline.steps[-1][1].__class__.__name__}.csv"
test_prediction_file = p / f"test_{timestamp}_{model_pipeline.steps[-1][1].__class__.__name__}.csv"

p.mkdir(parents=True, exist_ok=True)


In [74]:
oof_preds = np.clip(a = np.squeeze(oof_preds), a_min=0, a_max=None)
test_preds = np.clip(a = np.squeeze(test_preds), a_min=0, a_max=None)

submission = pd.DataFrame({'row_id':test.index, 'meter_reading':test_preds})
submission.to_csv(test_prediction_file, index=False)

oof_prediction = pd.DataFrame({'row_id':X.index, 'meter_reading':oof_preds})
oof_prediction.to_csv(oof_prediction_file, index=False)

In [75]:
X.site_id.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

In [165]:
from tqdm import tqdm_notebook as tqdm

In [166]:
oof_preds = np.zeros((len(train), 1))
test_preds = np.zeros((len(test), 1))

models = {}

for site_id in tqdm(range(16), desc='site_id'):
    
    X_site = X[X['site_id']==site_id]
    y_site = y.loc[X_site.index]
    test_site = test[test['site_id']==site_id]
    
    models[site_id] = []
    rmse_scores = []

    NFOLDS = 10
    folds  = KFold(n_splits=NFOLDS, shuffle=False, random_state=123)

    print(f"{NFOLDS} CV for site_id: {site_id}")
    
    for fold_, (train_index, val_index) in enumerate(folds.split(X_site)):
        train_X = X_site.iloc[train_index]
        val_X = X_site.iloc[val_index]
        train_y = y_site.iloc[train_index]
        val_y = y_site.iloc[val_index]

        lgbm = LGBMRegressor(metric='rmse')

        model_pipeline = Pipeline(steps=[('full_pipeline', build_pipeline.full_pipeline), 
                                          ('model', lgbm)])
  
        model_pipeline.fit(train_X, train_y)
        val_pred = np.expm1(model_pipeline.predict(val_X))

        # Scoring
        rmse = np.sqrt(mean_squared_error(val_y, val_pred))
        print(f'\tFold: {fold_} , RMSE: {rmse}')

        test_fold_pred = np.expm1(model_pipeline.predict(test.iloc[test_site.index]))
        oof_preds[val_index,:] = val_pred.reshape((-1, 1))
        test_preds[test_site.index] += test_fold_pred.reshape((-1, 1))
        
        models[site_id].append(model_pipeline)
        
        rmse_scores.append(rmse)
    
    print(f"{NFOLDS} CV for site_id: {site_id} fitted. RMSE: {np.array(rmse_scores).mean()}+/-{np.array(rmse_scores).std()} ")

test_preds /= NFOLDS

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, description='site_id', max=16, style=ProgressStyle(description_width='init…

10 CV for site_id: 0
	Fold: 0 , RMSE: 2039.5935254492513
	Fold: 1 , RMSE: 1522.4074615414725
	Fold: 2 , RMSE: 1454.3190227329305
	Fold: 3 , RMSE: 1434.7190825058829
	Fold: 4 , RMSE: 1590.2727473306527
	Fold: 5 , RMSE: 1471.2856686521634
	Fold: 6 , RMSE: 1498.9110554651854
	Fold: 7 , RMSE: 956.7738980044031
	Fold: 8 , RMSE: 853.8726907832279
	Fold: 9 , RMSE: 987.550797595622
10 CV for site_id: 0 fitted. RMSE: 1380.9705950060793+/-337.7107896656341 
10 CV for site_id: 1
	Fold: 0 , RMSE: 223.87341236635635
	Fold: 1 , RMSE: 223.87875978681265
	Fold: 2 , RMSE: 221.1479561662237
	Fold: 3 , RMSE: 213.03385784933775
	Fold: 4 , RMSE: 207.1700323808346
	Fold: 5 , RMSE: 206.82593831641253
	Fold: 6 , RMSE: 202.47334192992363
	Fold: 7 , RMSE: 203.7675150484371
	Fold: 8 , RMSE: 199.03094022029248
	Fold: 9 , RMSE: 218.62416959306205
10 CV for site_id: 1 fitted. RMSE: 211.98259236576928+/-8.874347567262191 
10 CV for site_id: 2
	Fold: 0 , RMSE: 219.4968764873154
	Fold: 1 , RMSE: 220.68191624538898
	Fo

In [168]:
def metric(y_pred, y_true):
    return np.sqrt(np.mean(np.power(np.log(y_pred + 1) - np.log(y_true + 1), 2)))

metric(np.clip(a = np.squeeze(oof_preds), a_min=0, a_max=None), np.expm1(y).values)

4.495986217539851