In [10]:
import pandas as pd
import numpy as np
import random
import feather
import gc
import json
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import lightgbm as lgb
from sklearn.model_selection import KFold
import sys

sys.path.insert(0, "/home/jupyter/kaggle/energy/src")
import utility

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [37]:
SEED = 42
LOCAl_TEST = True
utility.set_seed(SEED)
TARGET = 'meter_reading'

In [12]:
# Gather the original data frames
train_df = pd.read_feather(f'{utility.CREATED_FEATURE_DIR}/train_data_cleaned_site_0.feather')
test_df = pd.read_feather(f'{utility.CREATED_DATA_DIR}/test_merged.feather')

# Gather the generated features
train_features_df = pd.read_feather(f'{utility.CREATED_FEATURE_DIR}/train_features.feather')
test_features_df = pd.read_feather(f'{utility.CREATED_FEATURE_DIR}/test_features.feather')

print(train_df.shape, test_df.shape, train_features_df.shape, test_features_df.shape)
#print(train_df.shape, test_df.shape)

# Remove the minute and second related features
train_features_df.drop(['minute', 'second'], axis=1, inplace=True)
test_features_df.drop(['minute', 'second'], axis=1, inplace=True)

(19822322, 16) (41697600, 16) (19822322, 25) (41697600, 25)


In [14]:
utility.trigger_gc()

22


In [None]:
# Label Encode primary_use
train_features_df, test_features_df = utility.do_label_encoding(train_df, 
                                                        test_df, 
                                                        train_features_df, 
                                                        test_features_df, 
                                                        ['primary_use'], 'label')

LabelEncoder of feature [primary_use] is saved at [primary_use_label]


In [16]:
cat_features = ['site_id_building_id', 'site_building_meter_id',
       'site_building_meter_id_usage', 'site_id_meter', 'building_id_meter',
       'site_id_primary_use', 'building_id_primary_use', 'meter_primary_use']

train_features_df, test_features_df = utility.do_label_encoding(train_features_df, 
                                                            test_features_df, 
                                                            train_features_df, 
                                                            test_features_df, 
                                                            cat_features)

LabelEncoder the feature [site_id_building_id]
LabelEncoder the feature [site_building_meter_id]
LabelEncoder the feature [site_building_meter_id_usage]
LabelEncoder the feature [site_id_meter]
LabelEncoder the feature [building_id_meter]
LabelEncoder the feature [site_id_primary_use]
LabelEncoder the feature [building_id_primary_use]
LabelEncoder the feature [meter_primary_use]


In [17]:
# Convert boolean variables to 0 or 1
bool_fetaure_list = train_features_df.select_dtypes('bool').columns



In [22]:
train_features_df = utility.convert_to_int(train_features_df, bool_fetaure_list)

test_features_df = utility.convert_to_int(test_features_df, bool_fetaure_list)

In [23]:
train_merged = pd.concat([train_df, train_features_df], axis=1)
print(f'Shape of train_merged {train_merged.shape}')

del train_df, train_features_df
utility.trigger_gc()

test_merged = pd.concat([test_df, test_features_df], axis=1)
print(f'Shape of train_merged {test_merged.shape}')

del test_df, test_features_df
utility.trigger_gc()

Shape of train_merged (19822322, 40)
1487
Shape of train_merged (41697600, 40)
0


### Feature Engineering

In [24]:
train_merged.columns

Index(['site_id', 'building_id', 'timestamp', 'meter', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 'meter_reading',
       'year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek',
       'dayofyear', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end', 'hour',
       'site_id_building_id', 'site_building_meter_id',
       'site_building_meter_id_usage', 'site_id_meter', 'building_id_meter',
       'site_id_primary_use', 'building_id_primary_use', 'meter_primary_use',
       'building_age', 'primary_use_label'],
      dtype='object')

In [25]:
# Remove time stamp and meter reading
predictors = ['site_id', 'building_id', 'meter',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 
       'year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek',
       'dayofyear', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end', 'hour',
       'site_id_building_id', 'site_building_meter_id',
       'site_building_meter_id_usage', 'site_id_meter', 'building_id_meter',
       'site_id_primary_use', 'building_id_primary_use', 'meter_primary_use',
       'building_age', 'primary_use_label']

# No cat features
# cat_fetaures = ['site_id', 'building_id', 'meter',
#        'year_built', 'floor_count', 
#        'year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek',
#        'dayofyear', 'is_month_start', 'is_month_end', 'is_quarter_start',
#        'is_quarter_end', 'is_year_start', 'is_year_end', 'hour',
#        'site_id_building_id', 'site_building_meter_id',
#        'site_building_meter_id_usage', 'site_id_meter', 'building_id_meter',
#        'site_id_primary_use', 'building_id_primary_use', 'meter_primary_use', 
#        'primary_use_label']

#Copied the params from Konstatien's kernel
lgb_params = {
                'objective':'regression',
                'boosting_type':'gbdt',
                'metric':'rmse',
                'n_jobs':-1,
                'learning_rate':0.05,
                'num_leaves': 2**8,
                'max_depth':-1,
                'tree_learner':'serial',
                'colsample_bytree': 0.9,
                'subsample_freq':1,
                'subsample':0.5,
                'n_estimators':2000,
                'max_bin':255,
                'verbose':-1,
                'seed': SEED,
                'early_stopping_rounds':100, 
                }

### Validation 1 : 50% Training : 50% Holdout split without shuffle

In [26]:
training, validation = utility.get_data_splits_by_fraction(train_merged, valid_fraction=0.5)

Splitting the data into train and holdout with validation fraction 0.5...
Shape of the training data (9911161, 40) 
Shape of the validation data (9911161, 40)


In [27]:
bst, valid_score = utility.train_model(training, validation, predictors, TARGET, params=lgb_params)

Shape of train_X : (9911161, 37)
Shape of train_Y : (9911161,)
Shape of validation_X : (9911161, 37)
Shape of validation_Y : (9911161,)
Training model!



Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.27745
[200]	valid_0's rmse: 1.24682
[300]	valid_0's rmse: 1.2395
[400]	valid_0's rmse: 1.23646
[500]	valid_0's rmse: 1.23587
Early stopping, best iteration is:
[449]	valid_0's rmse: 1.23495
Validation Score 1.2349522481491462


In [29]:
del training, validation, bst, valid_score
gc.collect()

NameError: name 'training' is not defined

### Validation 2 : Train on first 4 months, skip next 4 months, Test on last 4 months

In [30]:
# Training on 1st four months
train_months = [1, 2, 3, 4]
# Holdout on last four months
validation_months = [9, 10, 11, 12]

training, validation = utility.get_data_splits_by_month(train_merged, 
                                                        train_months=train_months, 
                                                        validation_months=validation_months)

Splitting the data into train and holdout based on months...
Training months [1, 2, 3, 4]
Validation months [9, 10, 11, 12]
Shape of the training data (6134847, 40) 
Shape of the validation data (6857643, 40)


In [31]:
bst, valid_score = utility.train_model(training, 
                               validation, 
                               predictors, 
                               TARGET, 
                               params=lgb_params)

Shape of train_X : (6134847, 37)
Shape of train_Y : (6134847,)
Shape of validation_X : (6857643, 37)
Shape of validation_Y : (6857643,)
Training model!



Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.51817
[200]	valid_0's rmse: 1.50913
Early stopping, best iteration is:
[154]	valid_0's rmse: 1.50159
Validation Score 1.501587432653443


In [32]:
del train_months, validation_months, training, validation, bst, valid_score
gc.collect()

246

### Make Prediction on test data

In [33]:
train_X = train_merged[predictors]
train_Y = np.log1p(train_merged[TARGET])
test_X = test_merged[predictors]
test_row_id = test_merged['row_id']

print(f'Size of train_X {train_X.shape}')
print(f'Size of train_Y {train_Y.shape}')
print(f'Size of test_X {test_X.shape}')
print(f'Size of test_row_id {test_row_id.shape}')

Size of train_X (19822322, 37)
Size of train_Y (19822322,)
Size of test_X (41697600, 37)
Size of test_row_id (41697600,)


In [34]:
train_X.head()

Unnamed: 0,site_id,building_id,meter,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,year,month,quarter,weekofyear,day,dayofweek,dayofyear,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,hour,site_id_building_id,site_building_meter_id,site_building_meter_id_usage,site_id_meter,building_id_meter,site_id_primary_use,building_id_primary_use,meter_primary_use,building_age,primary_use_label
0,0,53,0,87742,1971,99,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,54,63,63,0,1607,3,937,6,48,6
1,1,105,0,50623,1800,5,3.8,,2.4,,1020.900024,240.0,3.1,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,556,1076,1076,20,94,50,66,0,219,0
2,1,106,0,5374,1800,4,3.8,,2.4,,1020.900024,240.0,3.1,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,557,1077,1077,20,105,50,77,0,219,0
3,1,106,3,5374,1800,4,3.8,,2.4,,1020.900024,240.0,3.1,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,557,1078,1078,21,106,50,77,44,219,0
4,1,107,0,97532,2005,10,3.8,,2.4,,1020.900024,240.0,3.1,2016,1,1,53,1,4,1,1,0,1,0,1,0,0,558,1079,1079,20,128,50,88,0,14,0


In [35]:
del train_merged, test_merged
gc.collect()

22

In [40]:
def make_prediction(df_train_X, df_train_Y, df_test_X, params, categorical_feature=None, n_splits=5):
    yoof = np.zeros(len(df_train_X))
    yhat = np.zeros(len(df_test_X))
    cv_scores = []
    result_dict = {}
    
    kf = KFold(n_splits=n_splits, random_state=SEED, shuffle=False)

    fold = 0
    for in_index, oof_index in kf.split(df_train_X, df_train_Y):
        fold += 1
        print(f'fold {fold} of {n_splits}')
        X_in, X_oof = df_train_X.iloc[in_index], df_train_X.iloc[oof_index]
        y_in, y_oof = df_train_Y.iloc[in_index], df_train_Y.iloc[oof_index]
        
        if categorical_feature:
            lgb_train = lgb.Dataset(X_in, y_in, categorical_feature=categorical_feature)
            lgb_eval = lgb.Dataset(X_oof, y_oof, reference=lgb_train, categorical_feature=categorical_feature)
        else:
            lgb_train = lgb.Dataset(X_in, y_in)
            lgb_eval = lgb.Dataset(X_oof, y_oof, reference=lgb_train)
        
        model = lgb.train(
            params,
            lgb_train,
            valid_sets = [lgb_train, lgb_eval],
            verbose_eval = 50
        )   
        
        del lgb_train, lgb_eval, in_index, X_in, y_in 
        gc.collect()
        
        print('Training completed')
        yoof[oof_index] = model.predict(X_oof)
        print('OOF Prediction completed.')
        prediction = model.predict(df_test_X.values)
        print('Shape of prediction', prediction.shape)
        yhat += np.expm1(prediction)
        print('Prediction completed')
        cv_oof_score = np.sqrt(metrics.mean_squared_error(y_oof, yoof[oof_index]))
        print(f'CV OOF Score for fold {fold} is {cv_oof_score}')
        cv_scores.append(cv_oof_score)
        
        del oof_index, X_oof, y_oof
        gc.collect()

    yhat /= n_splits

    oof_score = round(np.sqrt(metrics.mean_squared_error(df_train_Y, yoof)), 5)
    avg_cv_scores = round(sum(cv_scores)/len(cv_scores), 5)
    std_cv_scores = round(np.array(cv_scores).std(), 5)

    print(f'Combined OOF score : {oof_score}')
    print(f'Average of {fold} folds OOF score {avg_cv_scores}')
    print(f'std of {fold} folds OOF score {std_cv_scores}')
    
    result_dict['yoof'] = yoof
    result_dict['prediction'] = yhat
    result_dict['oof_score'] = oof_score
    result_dict['cv_scores'] = cv_scores
    result_dict['avg_cv_scores'] = avg_cv_scores
    result_dict['std_cv_scores'] = std_cv_scores
    
    return result_dict

In [41]:
SEED= 42
result_dict = make_prediction(train_X, train_Y, test_X, params=lgb_params, n_splits=2)

fold 1 of 2



Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 1.08064	valid_1's rmse: 1.39217
[100]	training's rmse: 0.900642	valid_1's rmse: 1.32154
[150]	training's rmse: 0.8126	valid_1's rmse: 1.30329
[200]	training's rmse: 0.759432	valid_1's rmse: 1.29535
[250]	training's rmse: 0.723463	valid_1's rmse: 1.29132
[300]	training's rmse: 0.695121	valid_1's rmse: 1.29066
[350]	training's rmse: 0.674813	valid_1's rmse: 1.29032
[400]	training's rmse: 0.657783	valid_1's rmse: 1.29088
Early stopping, best iteration is:
[331]	training's rmse: 0.681802	valid_1's rmse: 1.2898
Training completed
OOF Prediction completed.
Shape of prediction (41697600,)
Prediction completed
CV OOF Score for fold 1 is 1.2898028146827905
fold 2 of 2



Found `n_estimators` in params. Will use it instead of argument


Found `early_stopping_rounds` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 1.05793	valid_1's rmse: 1.32977
[100]	training's rmse: 0.883487	valid_1's rmse: 1.27746
[150]	training's rmse: 0.798166	valid_1's rmse: 1.25987
[200]	training's rmse: 0.751229	valid_1's rmse: 1.24673
[250]	training's rmse: 0.719529	valid_1's rmse: 1.24165
[300]	training's rmse: 0.69635	valid_1's rmse: 1.23941
[350]	training's rmse: 0.674947	valid_1's rmse: 1.23691
[400]	training's rmse: 0.660671	valid_1's rmse: 1.2364
[450]	training's rmse: 0.645606	valid_1's rmse: 1.23508
[500]	training's rmse: 0.631892	valid_1's rmse: 1.23583
Early stopping, best iteration is:
[449]	training's rmse: 0.645839	valid_1's rmse: 1.2349
Training completed
OOF Prediction completed.
Shape of prediction (41697600,)
Prediction completed
CV OOF Score for fold 2 is 1.2349035395707257
Combined OOF score : 1.26265
Average of 2 folds OOF score 1.26235
std of 2 folds OOF score 0.02745


In [43]:
len(result_dict)

6

In [44]:
result_dict

{'yoof': array([2.77492473, 4.18114844, 0.95238502, ..., 0.81422228, 4.85963582,
        1.49403132]),
 'prediction': array([128.62145318,  55.29981104,  13.48442946, ...,   1.52492067,
        131.53845171,   3.53290853]),
 'oof_score': 1.26265,
 'cv_scores': [1.2898028146827905, 1.2349035395707257],
 'avg_cv_scores': 1.26235,
 'std_cv_scores': 0.02745}

In [45]:
predictions = result_dict['prediction']

submission_df =  feather.read_dataframe(f'{utility.CREATED_DATA_DIR}/submission.feather')

submission_df.row_id = test_row_id
submission_df.meter_reading = predictions


.labels was deprecated in version 0.24.0. Use .codes instead.



In [None]:
submission_df.to_csv()

In [None]:
submission_df.to_csv('submission_7.csv.gz', index=False, compression='gzip')

In [None]:
! /home/jupyter/.local/bin/kaggle competitions submit -c ashrae-energy-prediction -f submission_7.csv.gz -m "Instead of LGBM taking care of cat variables, I am doing vanilla Label Encoding"