In [1]:
import pandas as pd
import numpy as np
import random
import feather
import gc
import json
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import lightgbm as lgb
from sklearn.model_selection import KFold
import sys

# Make the project-local `utility` module importable.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this directory exists.
sys.path.insert(0, "/home/jupyter/kaggle/energy/src")
import utility

# Let pandas display up to 100 rows / 100 columns before truncating.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [19]:
# Global random seed; also forwarded to LightGBM through lgb_params['seed'].
SEED = 42
# NOTE(review): the name ends in a lowercase "l" ("LOCAl_TEST"), probably a typo
# for LOCAL_TEST. It is not referenced anywhere else in the visible notebook.
LOCAl_TEST = True
utility.set_seed(SEED)
# Name of the target column in the training frame.
TARGET = 'meter_reading'

In [3]:
# Base train / test frames
train_df = pd.read_feather(f'{utility.CREATED_FEATURE_DIR}/train_data_cleaned_site_0.feather')
test_df = pd.read_feather(f'{utility.CREATED_DATA_DIR}/test_merged.feather')

# Pre-computed feature frames that accompany the base frames
train_features_df = pd.read_feather(f'{utility.CREATED_FEATURE_DIR}/train_features.feather')
test_features_df = pd.read_feather(f'{utility.CREATED_FEATURE_DIR}/test_features.feather')

# Quick size check of all four frames
print(*(frame.shape for frame in (train_df, test_df, train_features_df, test_features_df)))

# The minute and second columns are not used as features; discard them
unused_time_columns = ['minute', 'second']
train_features_df = train_features_df.drop(columns=unused_time_columns)
test_features_df = test_features_df.drop(columns=unused_time_columns)


.labels was deprecated in version 0.24.0. Use .codes instead.



(19822322, 16) (41697600, 16) (19822322, 25) (41697600, 25)


In [20]:
# Trigger garbage collection through the project helper (see utility.trigger_gc).
utility.trigger_gc()

2760


In [5]:
# Label Encode primary_use
# Presumably the base frames supply the raw values and the feature frames receive
# the encoded column; the logged message names it `primary_use_label`.
# TODO confirm against utility.do_label_encoding.
train_features_df, test_features_df = utility.do_label_encoding(train_df, 
                                                        test_df, 
                                                        train_features_df, 
                                                        test_features_df, 
                                                        ['primary_use'], 'label')

LabelEncoder of feature [primary_use] is saved at [primary_use_label]


In [13]:
# Interaction features that are label-encoded below.
# NOTE(review): a differently spelled list, `cat_fetaures`, is defined later and
# is the one passed to LightGBM; the two names are easy to confuse.
cat_features = ['site_id_building_id', 'site_building_meter_id',
       'site_building_meter_id_usage', 'site_id_meter', 'building_id_meter',
       'site_id_primary_use', 'building_id_primary_use', 'meter_primary_use']

# The feature frames are passed both as source and as destination here, and no
# suffix argument is given (unlike the primary_use call).
train_features_df, test_features_df = utility.do_label_encoding(train_features_df, 
                                                            test_features_df, 
                                                            train_features_df, 
                                                            test_features_df, 
                                                            cat_features)

LabelEncoder the feature [site_id_building_id]
LabelEncoder the feature [site_building_meter_id]
LabelEncoder the feature [site_building_meter_id_usage]
LabelEncoder the feature [site_id_meter]
LabelEncoder the feature [building_id_meter]
LabelEncoder the feature [site_id_primary_use]
LabelEncoder the feature [building_id_primary_use]
LabelEncoder the feature [meter_primary_use]


In [14]:
# Convert boolean variables to 0 or 1
# Column names with bool dtype, taken from the training feature frame only.
# NOTE(review): "fetaure" is a misspelling of "feature"; the name is reused below.
bool_fetaure_list = train_features_df.select_dtypes('bool').columns

def convert_to_int(df, feature_names):
    """Cast the given columns of ``df`` to integers, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated and also returned.
    feature_names : iterable of str
        Columns to cast (e.g. boolean flags, which become 0/1). An empty
        iterable leaves ``df`` untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    for feature_name in feature_names:
        # Replace the column outright. `df.loc[:, name] = ...` writes into the
        # existing column on newer pandas, so a bool column is not guaranteed
        # to come out with an integer dtype.
        df[feature_name] = df[feature_name].astype('int')
    return df

In [15]:
# Cast the boolean flag columns of the training feature frame to integers.
train_features_df = convert_to_int(train_features_df, bool_fetaure_list)

# The column list was derived from the training frame; the test frame is assumed
# to contain the same columns.
test_features_df = convert_to_int(test_features_df, bool_fetaure_list)

In [16]:
# Put the generated features next to the base columns (column-wise concat).
# NOTE(review): axis=1 aligns on the index, so both frames are assumed to share
# the same row index — confirm.
train_merged = pd.concat([train_df, train_features_df], axis=1)
print(f'Shape of train_merged {train_merged.shape}')

# The inputs are no longer needed once merged; drop them to free memory.
del train_df, train_features_df
utility.trigger_gc()

test_merged = pd.concat([test_df, test_features_df], axis=1)
# Fixed label: this line previously reported the test frame as "train_merged".
print(f'Shape of test_merged {test_merged.shape}')

del test_df, test_features_df
utility.trigger_gc()

Shape of train_merged (19822322, 40)
2630
Shape of train_merged (41697600, 40)
0


### Feature Engineering

In [17]:
# List every column available after the merge (used to build `predictors` below).
train_merged.columns

Index(['site_id', 'building_id', 'timestamp', 'meter', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 'meter_reading',
       'year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek',
       'dayofyear', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end', 'hour',
       'site_id_building_id', 'site_building_meter_id',
       'site_building_meter_id_usage', 'site_id_meter', 'building_id_meter',
       'site_id_primary_use', 'building_id_primary_use', 'meter_primary_use',
       'building_age', 'primary_use_label'],
      dtype='object')

In [20]:
# Model inputs: every column of train_merged except `timestamp`, the raw
# `primary_use` string and the target `meter_reading`.
predictors = ['site_id', 'building_id', 'meter',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 
       'year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek',
       'dayofyear', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end', 'hour',
       'site_id_building_id', 'site_building_meter_id',
       'site_building_meter_id_usage', 'site_id_meter', 'building_id_meter',
       'site_id_primary_use', 'building_id_primary_use', 'meter_primary_use',
       'building_age', 'primary_use_label']

# Subset of `predictors` handed to LightGBM as categorical: everything except
# square_feet, the weather measurements and building_age.
# NOTE(review): the name is misspelled ("fetaures") and differs from the earlier
# `cat_features` list; later cells reference this exact spelling.
cat_fetaures = ['site_id', 'building_id', 'meter',
       'year_built', 'floor_count', 
       'year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek',
       'dayofyear', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end', 'hour',
       'site_id_building_id', 'site_building_meter_id',
       'site_building_meter_id_usage', 'site_id_meter', 'building_id_meter',
       'site_id_primary_use', 'building_id_primary_use', 'meter_primary_use', 
       'primary_use_label']

# Hyper-parameters copied from Konstatien's kernel.
# `n_estimators` and `early_stopping_rounds` are set inside the params dict, which
# is why LightGBM logs "Found `...` in params. Will use it instead of argument".
lgb_params = {
                'objective':'regression',
                'boosting_type':'gbdt',
                'metric':'rmse',
                'n_jobs':-1,
                'learning_rate':0.05,
                'num_leaves': 2**8,
                'max_depth':-1,
                'tree_learner':'serial',
                'colsample_bytree': 0.9,
                'subsample_freq':1,
                'subsample':0.5,
                'n_estimators':2000,
                'max_bin':255,
                'verbose':-1,
                'seed': SEED,
                'early_stopping_rounds':100, 
                }

In [21]:
def train_model(training, validation, predictors, target, params, categorical_feature=None, test_X=None):
    """Fit one LightGBM model on ``training`` and score it on ``validation``.

    The target is transformed with ``np.log1p`` before fitting, so the model
    and the reported score both live on the log scale.

    Parameters
    ----------
    training, validation : pandas.DataFrame
        Frames containing the ``predictors`` columns and the ``target`` column.
    predictors : list of str
        Feature columns fed to the model.
    target : str
        Name of the target column.
    params : dict
        LightGBM parameters passed to ``lgb.train``.
    categorical_feature : list of str, optional
        Columns LightGBM should treat as categorical. When empty or ``None``
        the argument is not forwarded to ``lgb.train``.
    test_X : optional
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    (bst, valid_score)
        The trained booster and the RMSE between ``log1p(validation[target])``
        and the booster's predictions on the validation predictors.
        The pair is now returned even when ``test_X`` is supplied; previously
        that path returned ``None`` and broke ``bst, score = train_model(...)``.
    """
    train_X = training[predictors]
    train_Y = np.log1p(training[target])
    validation_X = validation[predictors]
    validation_Y = np.log1p(validation[target])

    print(f'Shape of train_X : {train_X.shape}')
    print(f'Shape of train_Y : {train_Y.shape}')
    print(f'Shape of validation_X : {validation_X.shape}')
    print(f'Shape of validation_Y : {validation_Y.shape}')

    dtrain = lgb.Dataset(train_X, label=train_Y)
    dvalid = lgb.Dataset(validation_X, validation_Y)

    print("Training model!")
    # Build the keyword arguments once so lgb.train is called in a single place.
    train_kwargs = {'valid_sets': [dvalid], 'verbose_eval': 100}
    if categorical_feature:
        # Report the list actually passed in, not the notebook-level global.
        print(f'List of categorical features considered : {categorical_feature}')
        train_kwargs['categorical_feature'] = categorical_feature
    bst = lgb.train(params, dtrain, **train_kwargs)

    # NOTE(review): in the recorded run the RMSE recomputed here (1.2241) differs
    # from the best validation RMSE LightGBM logged (1.13328); the cause is not
    # established in this notebook and should be investigated.
    valid_prediction = bst.predict(validation_X)
    valid_score = np.sqrt(metrics.mean_squared_error(validation_Y, valid_prediction))
    print(f'Validation Score {valid_score}')

    return bst, valid_score

In [38]:
def make_prediction(df_train_X, df_train_Y, df_test_X, params, categorical_feature=None, n_splits=5):
    """Train one LightGBM model per K-fold split and average the test predictions.

    Parameters
    ----------
    df_train_X : pandas.DataFrame
        Training features.
    df_train_Y : pandas.Series
        Training target. Test predictions are passed through ``np.expm1``, so
        this is expected to be on the ``log1p`` scale.
    df_test_X : pandas.DataFrame
        Test features; predicted from its ``.values`` array.
    params : dict
        LightGBM parameters passed to ``lgb.train``.
    categorical_feature : list of str, optional
        Forwarded to both ``lgb.Dataset`` objects of every fold.
    n_splits : int, default 5
        Number of contiguous (unshuffled) folds.

    Returns
    -------
    dict
        ``yoof``          out-of-fold predictions (same scale as ``df_train_Y``),
        ``prediction``    test predictions, ``expm1``-transformed and averaged over folds,
        ``oof_score``     RMSE of ``yoof`` against ``df_train_Y`` (rounded to 5 digits),
        ``cv_scores``     per-fold out-of-fold RMSE,
        ``avg_cv_scores`` / ``std_cv_scores``  mean and std of ``cv_scores`` (rounded).

    Notes
    -----
    The out-of-fold part of each split is also the validation set used for
    early stopping, so the reported fold scores are not from untouched data.
    """
    yoof = np.zeros(len(df_train_X))
    yhat = np.zeros(len(df_test_X))
    cv_scores = []

    # Unshuffled folds. `random_state` is intentionally not passed: it has no
    # effect without shuffling, and scikit-learn >= 0.24 raises ValueError when
    # it is combined with shuffle=False.
    kf = KFold(n_splits=n_splits, shuffle=False)

    for fold, (in_index, oof_index) in enumerate(kf.split(df_train_X, df_train_Y), start=1):
        print(f'fold {fold} of {n_splits}')
        X_in, X_oof = df_train_X.iloc[in_index], df_train_X.iloc[oof_index]
        y_in, y_oof = df_train_Y.iloc[in_index], df_train_Y.iloc[oof_index]

        lgb_train = lgb.Dataset(X_in, y_in, categorical_feature=categorical_feature)
        lgb_eval = lgb.Dataset(X_oof, y_oof, reference=lgb_train, categorical_feature=categorical_feature)

        model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_train, lgb_eval],
            verbose_eval=50
        )

        # Release the in-fold data before predicting on the large test frame.
        del lgb_train, lgb_eval, in_index, X_in, y_in
        gc.collect()

        print('Training completed')
        yoof[oof_index] = model.predict(X_oof)
        print('OOF Prediction completed.')
        prediction = model.predict(df_test_X.values)
        print('Shape of prediction', prediction.shape)
        # Accumulate on the original scale; divided by n_splits after the loop.
        yhat += np.expm1(prediction)
        print('Prediction completed')
        cv_oof_score = np.sqrt(metrics.mean_squared_error(y_oof, yoof[oof_index]))
        print(f'CV OOF Score for fold {fold} is {cv_oof_score}')
        cv_scores.append(cv_oof_score)

        del oof_index, X_oof, y_oof
        gc.collect()

    yhat /= n_splits

    oof_score = round(np.sqrt(metrics.mean_squared_error(df_train_Y, yoof)), 5)
    avg_cv_scores = round(sum(cv_scores) / len(cv_scores), 5)
    std_cv_scores = round(np.array(cv_scores).std(), 5)

    print(f'Combined OOF score : {oof_score}')
    print(f'Average of {fold} folds OOF score {avg_cv_scores}')
    print(f'std of {fold} folds OOF score {std_cv_scores}')

    return {
        'yoof': yoof,
        'prediction': yhat,
        'oof_score': oof_score,
        'cv_scores': cv_scores,
        'avg_cv_scores': avg_cv_scores,
        'std_cv_scores': std_cv_scores,
    }

### Validation 1 : 50% Training : 50% Holdout split without shuffle

In [22]:
# Hold out half of the rows for validation (an unshuffled split, per the section title).
training, validation = utility.get_data_splits_by_fraction(train_merged, valid_fraction=0.5)

Splitting the data into train and holdout with validation fraction 0.5...
Shape of the training data (9911161, 40) 
Shape of the validation data (9911161, 40)


In [23]:
# Fit on the training half and score on the holdout half, with LightGBM categorical handling.
bst, valid_score = train_model(training, validation, predictors, TARGET, params=lgb_params, categorical_feature=cat_fetaures)

Shape of train_X : (9911161, 37)
Shape of train_Y : (9911161,)
Shape of validation_X : (9911161, 37)
Shape of validation_Y : (9911161,)
Training model!
List of categorical features considered : ['site_id', 'building_id', 'meter', 'year_built', 'floor_count', 'year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek', 'dayofyear', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', 'is_year_end', 'hour', 'site_id_building_id', 'site_building_meter_id', 'site_building_meter_id_usage', 'site_id_meter', 'building_id_meter', 'site_id_primary_use', 'building_id_primary_use', 'meter_primary_use', 'primary_use_label']



Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument


categorical_feature in Dataset is overridden.
New categorical_feature is ['building_id', 'building_id_meter', 'building_id_primary_use', 'day', 'dayofweek', 'dayofyear', 'floor_count', 'hour', 'is_month_end', 'is_month_start', 'is_quarter_end', 'is_quarter_start', 'is_year_end', 'is_year_start', 'meter', 'meter_primary_use', 'month', 'primary_use_label', 'quarter', 'site_building_meter_id', 'site_building_meter_id_usage', 'site_id', 'site_id_building_id', 'site_id_meter', 'site_id_primary_use', 'weekofyear', 'year', 'year_built']



Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.17246
[200]	valid_0's rmse: 1.14124
[300]	valid_0's rmse: 1.13661
[400]	valid_0's rmse: 1.13476
[500]	valid_0's rmse: 1.13399
[600]	valid_0's rmse: 1.13396
[700]	valid_0's rmse: 1.13343
Early stopping, best iteration is:
[680]	valid_0's rmse: 1.13328
Validation Score 1.2241440724700703


In [24]:
# Drop the split frames and the fitted booster before the next experiment.
del training, validation, bst, valid_score
gc.collect()

282

### Validation 2 : Train on first 4 months, skip next 4 months, Test on last 4 months

In [25]:
# Training on 1st four months
train_months = [1, 2, 3, 4]
# Holdout on last four months
# (months 5-8 appear in neither list, leaving a gap between train and holdout)
validation_months = [9, 10, 11, 12]

training, validation = utility.get_data_splits_by_month(train_merged, 
                                                        train_months=train_months, 
                                                        validation_months=validation_months)

Splitting the data into train and holdout based on months...
Training months [1, 2, 3, 4]
Validation months [9, 10, 11, 12]
Shape of the training data (6134847, 40) 
Shape of the validation data (6857643, 40)


In [36]:
# Fit on the training months and score on the holdout months.
# NOTE(review): the recorded output of this cell is a NameError for `training`;
# the cell appears to have been run after the variables were deleted, so it needs
# a clean top-to-bottom re-run.
bst, valid_score = train_model(training, 
                               validation, 
                               predictors, 
                               TARGET, 
                               params=lgb_params, 
                               categorical_feature=cat_fetaures)

NameError: name 'training' is not defined

In [28]:
# Drop everything created for the month-based validation.
# NOTE(review): the recorded output is a NameError for `train_months`, i.e. this
# cell was run when that name did not exist; it fails unless the cells above
# ran first in the same kernel session.
del train_months, validation_months, training, validation, bst, valid_score
gc.collect()

NameError: name 'train_months' is not defined

### Make Prediction on test data

In [29]:
# Full training matrix, log1p-transformed target and the test matrix.
train_X = train_merged[predictors]
train_Y = np.log1p(train_merged[TARGET])
test_X = test_merged[predictors]
# Kept separately so the submission can still be keyed once test_merged is deleted.
test_row_id = test_merged['row_id']

print(f'Size of train_X {train_X.shape}')
print(f'Size of train_Y {train_Y.shape}')
print(f'Size of test_X {test_X.shape}')
print(f'Size of test_row_id {test_row_id.shape}')

Size of train_X (19822322, 37)
Size of train_Y (19822322,)
Size of test_X (41697600, 37)
Size of test_row_id (41697600,)


In [30]:
# Show the first rows of the training matrix as a sanity check.
train_X.head()

Unnamed: 0,site_id,building_id,meter,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,year,month,quarter,weekofyear,day,dayofweek,dayofyear,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,hour,site_id_building_id,site_building_meter_id,site_building_meter_id_usage,site_id_meter,building_id_meter,site_id_primary_use,building_id_primary_use,meter_primary_use,building_age,primary_use_label
0,0,53,0,87742,1971,99,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,54,63,63,0,1607,3,937,6,48,6
1,1,105,0,50623,1800,5,3.8,,2.4,,1020.900024,240.0,3.1,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,556,1076,1076,20,94,50,66,0,219,0
2,1,106,0,5374,1800,4,3.8,,2.4,,1020.900024,240.0,3.1,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,557,1077,1077,20,105,50,77,0,219,0
3,1,106,3,5374,1800,4,3.8,,2.4,,1020.900024,240.0,3.1,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,557,1078,1078,21,106,50,77,44,219,0
4,1,107,0,97532,2005,10,3.8,,2.4,,1020.900024,240.0,3.1,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,558,1079,1079,20,128,50,88,0,14,0


In [31]:
# The merged frames are no longer referenced directly; delete the names.
del train_merged, test_merged
gc.collect()

74

In [39]:
# 2-fold run of the full train/predict loop (used for submission_5).
result_dict = make_prediction(train_X, train_Y, test_X, params=lgb_params, categorical_feature=cat_fetaures, n_splits=2)

fold 1 of 2
Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.822757	valid_1's rmse: 1.27012
[100]	training's rmse: 0.667553	valid_1's rmse: 1.2472
[150]	training's rmse: 0.603522	valid_1's rmse: 1.24179
[200]	training's rmse: 0.574068	valid_1's rmse: 1.24214
Early stopping, best iteration is:
[141]	training's rmse: 0.611831	valid_1's rmse: 1.24069
Training completed
OOF Prediction completed.
Shape of prediction (41697600,)
Prediction completed
CV OOF Score for fold 1 is 1.2423634744343752
fold 2 of 2



Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument


Using categorical_feature in Dataset.



Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.810822	valid_1's rmse: 1.23689
[100]	training's rmse: 0.660822	valid_1's rmse: 1.17246
[150]	training's rmse: 0.602191	valid_1's rmse: 1.14727
[200]	training's rmse: 0.576548	valid_1's rmse: 1.14124
[250]	training's rmse: 0.560649	valid_1's rmse: 1.13836
[300]	training's rmse: 0.548958	valid_1's rmse: 1.13661
[350]	training's rmse: 0.540094	valid_1's rmse: 1.13607
[400]	training's rmse: 0.531355	valid_1's rmse: 1.13476
[450]	training's rmse: 0.524106	valid_1's rmse: 1.13419
[500]	training's rmse: 0.517497	valid_1's rmse: 1.13399
[550]	training's rmse: 0.511686	valid_1's rmse: 1.13385
[600]	training's rmse: 0.505975	valid_1's rmse: 1.13396
[650]	training's rmse: 0.500485	valid_1's rmse: 1.13354
[700]	training's rmse: 0.49599	valid_1's rmse: 1.13343
[750]	training's rmse: 0.491436	valid_1's rmse: 1.13334
Early stopping, best iteration is:
[680]	training's rmse: 0.497684	valid_1's rmse: 1.13328
Training 

In [40]:
# Number of entries returned by make_prediction.
len(result_dict)

6

In [42]:
# Inspect the OOF predictions, averaged test predictions and CV scores.
result_dict

{'yoof': array([4.39619864, 4.15479624, 1.06703505, ..., 0.78703716, 5.32853859,
        1.35591889]),
 'prediction': array([196.56280576,  86.37350022,   7.5068043 , ...,   0.84989317,
        190.17161884,   3.13273441]),
 'oof_score': 1.23325,
 'cv_scores': [1.2423634744343752, 1.224060455383794],
 'avg_cv_scores': 1.23321,
 'std_cv_scores': 0.00915}

In [52]:
# Fold-averaged test predictions (already back-transformed with expm1).
predictions = result_dict['prediction']

# Load the submission template with pandas, matching how the other feather files
# in this notebook are read.
submission_df = pd.read_feather(f'{utility.CREATED_DATA_DIR}/submission.feather')

# Bracket assignment always writes a column; attribute assignment would silently
# create a plain attribute if the column were missing from the template.
submission_df['row_id'] = test_row_id
submission_df['meter_reading'] = predictions

In [53]:
# Preview the first rows only. Calling to_csv() without a path would serialise
# the entire frame into one string as the cell output.
submission_df.head()

Unnamed: 0,row_id,meter_reading
0,0,196.562806
1,1,86.3735
2,2,7.506804
3,3,261.432631
4,4,1089.27463


In [54]:
# Write the gzip-compressed submission file without the index column.
submission_df.to_csv('submission_5.csv.gz', index=False, compression='gzip')

In [55]:
! /home/jupyter/.local/bin/kaggle competitions submit -c ashrae-energy-prediction -f submission_5.csv.gz -m "Cleaned site 0 data, added few interaction features, used cat encode from LGBM, 2 split CV"

100%|██████████| 473M/473M [00:10<00:00, 46.3MB/s] 
Successfully submitted to ASHRAE - Great Energy Predictor III

### Same prediction but with 5 folds

In [56]:
# Same pipeline with 5 folds; overwrites the 2-fold result_dict.
result_dict = make_prediction(train_X, train_Y, test_X, params=lgb_params, categorical_feature=cat_fetaures, n_splits=5)

fold 1 of 5



Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument


Using categorical_feature in Dataset.



Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.865321	valid_1's rmse: 1.12377
[100]	training's rmse: 0.715484	valid_1's rmse: 1.03269
[150]	training's rmse: 0.647954	valid_1's rmse: 1.00359
[200]	training's rmse: 0.616143	valid_1's rmse: 0.973678
[250]	training's rmse: 0.59628	valid_1's rmse: 0.965345
[300]	training's rmse: 0.583232	valid_1's rmse: 0.961298
[350]	training's rmse: 0.572794	valid_1's rmse: 0.958464
[400]	training's rmse: 0.563994	valid_1's rmse: 0.956909
[450]	training's rmse: 0.555862	valid_1's rmse: 0.95656
[500]	training's rmse: 0.549219	valid_1's rmse: 0.955996
[550]	training's rmse: 0.542776	valid_1's rmse: 0.955136
[600]	training's rmse: 0.537401	valid_1's rmse: 0.955112
[650]	training's rmse: 0.532124	valid_1's rmse: 0.954812
[700]	training's rmse: 0.527028	valid_1's rmse: 0.954376
[750]	training's rmse: 0.522735	valid_1's rmse: 0.95433
[800]	training's rmse: 0.518483	valid_1's rmse: 0.954037
[850]	training's rmse: 0.514339	v


Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument


Using categorical_feature in Dataset.



Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.88017	valid_1's rmse: 1.04366
[100]	training's rmse: 0.716121	valid_1's rmse: 0.997873
[150]	training's rmse: 0.647574	valid_1's rmse: 0.985472
[200]	training's rmse: 0.61396	valid_1's rmse: 0.979925
[250]	training's rmse: 0.594097	valid_1's rmse: 0.976737
[300]	training's rmse: 0.579365	valid_1's rmse: 0.974877
[350]	training's rmse: 0.56744	valid_1's rmse: 0.973232
[400]	training's rmse: 0.558216	valid_1's rmse: 0.972015
[450]	training's rmse: 0.549897	valid_1's rmse: 0.971424
[500]	training's rmse: 0.54283	valid_1's rmse: 0.970599
[550]	training's rmse: 0.536259	valid_1's rmse: 0.970044
[600]	training's rmse: 0.530241	valid_1's rmse: 0.969486
[650]	training's rmse: 0.524683	valid_1's rmse: 0.969054
[700]	training's rmse: 0.519372	valid_1's rmse: 0.968658
[750]	training's rmse: 0.515264	valid_1's rmse: 0.968379
[800]	training's rmse: 0.510645	valid_1's rmse: 0.968183
[850]	training's rmse: 0.506218	


Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument


Using categorical_feature in Dataset.



Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.870649	valid_1's rmse: 1.09633
[100]	training's rmse: 0.716353	valid_1's rmse: 0.999069
[150]	training's rmse: 0.649105	valid_1's rmse: 0.973469
[200]	training's rmse: 0.615011	valid_1's rmse: 0.965474
[250]	training's rmse: 0.596326	valid_1's rmse: 0.962981
[300]	training's rmse: 0.58266	valid_1's rmse: 0.961624
[350]	training's rmse: 0.570926	valid_1's rmse: 0.960325
[400]	training's rmse: 0.562496	valid_1's rmse: 0.959767
[450]	training's rmse: 0.554565	valid_1's rmse: 0.959441
[500]	training's rmse: 0.547224	valid_1's rmse: 0.958989
[550]	training's rmse: 0.54109	valid_1's rmse: 0.958706
[600]	training's rmse: 0.53491	valid_1's rmse: 0.957938
[650]	training's rmse: 0.528921	valid_1's rmse: 0.958069
[700]	training's rmse: 0.523254	valid_1's rmse: 0.957791
Shape of prediction (41697600,)
Prediction completed
CV OOF Score for fold 3 is 1.1375314575151576
fold 4 of 5



Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument


Using categorical_feature in Dataset.



Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.858164	valid_1's rmse: 1.23243
[100]	training's rmse: 0.710911	valid_1's rmse: 1.16078
[150]	training's rmse: 0.645053	valid_1's rmse: 1.13926
[200]	training's rmse: 0.613005	valid_1's rmse: 1.13117
[250]	training's rmse: 0.595005	valid_1's rmse: 1.12678
[300]	training's rmse: 0.581913	valid_1's rmse: 1.12397
[350]	training's rmse: 0.571355	valid_1's rmse: 1.12136
[400]	training's rmse: 0.562219	valid_1's rmse: 1.12076
[450]	training's rmse: 0.554083	valid_1's rmse: 1.11981
[700]	training's rmse: 0.521834	valid_1's rmse: 1.11806
[750]	training's rmse: 0.517607	valid_1's rmse: 1.11812
Early stopping, best iteration is:
[657]	training's rmse: 0.525985	valid_1's rmse: 1.11791
Training completed
OOF Prediction completed.
Shape of prediction (41697600,)
Prediction completed
CV OOF Score for fold 4 is 1.2138541656302122
fold 5 of 5



Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument


Using categorical_feature in Dataset.



Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.872992	valid_1's rmse: 1.08154
[100]	training's rmse: 0.709507	valid_1's rmse: 1.02638
[150]	training's rmse: 0.644461	valid_1's rmse: 1.01824
[200]	training's rmse: 0.612542	valid_1's rmse: 1.01917
[250]	training's rmse: 0.592708	valid_1's rmse: 1.01891
Early stopping, best iteration is:
[157]	training's rmse: 0.63929	valid_1's rmse: 1.01762
Training completed
OOF Prediction completed.
Shape of prediction (41697600,)
Prediction completed
CV OOF Score for fold 5 is 1.1324071557572988
Combined OOF score : 1.13372
Average of 5 folds OOF score 1.13028
std of 5 folds OOF score 0.08817


In [57]:
# Fold-averaged test predictions from the 5-fold run.
predictions = result_dict['prediction']

# Read the template with pandas, as for the other feather files in this notebook.
# The recorded run of this cell, which used the standalone feather reader,
# logged a ".labels was deprecated" warning.
submission_df = pd.read_feather(f'{utility.CREATED_DATA_DIR}/submission.feather')

# Bracket assignment always writes a column; attribute assignment would silently
# create a plain attribute if the column were missing from the template.
submission_df['row_id'] = test_row_id
submission_df['meter_reading'] = predictions


.labels was deprecated in version 0.24.0. Use .codes instead.



In [58]:
# Write the gzip-compressed 5-fold submission file without the index column.
submission_df.to_csv('submission_6.csv.gz', index=False, compression='gzip')

In [2]:
! /home/jupyter/.local/bin/kaggle competitions submit -c ashrae-energy-prediction -f submission_6.csv.gz -m "Submission 5 + 5 splits"

100%|██████████| 473M/473M [00:08<00:00, 58.1MB/s] 
Successfully submitted to ASHRAE - Great Energy Predictor III