In [10]:
import pandas as pd
import numpy as np
import feather
import gc
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Show up to 100 rows and 100 columns when a frame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [11]:
# Directory containing the feather files this notebook reads
# (train_merged.feather, test_merged.feather, submission.feather).
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
CREATED_DATA_DIR = '/home/jupyter/kaggle/energy/data/read_only_feather/v1'

def read_data(data_dir):
    '''
    Load the merged train and test frames from feather files.

    data_dir : Directory that contains train_merged.feather and test_merged.feather.

    Returns a (train_df, test_df) tuple and prints the shape of each frame.
    '''
    print('Reading Data...')
    train_path = f'{data_dir}/train_merged.feather'
    test_path = f'{data_dir}/test_merged.feather'

    # Train is read first, then test - same order as the shape report below.
    train_df, test_df = (feather.read_dataframe(path) for path in (train_path, test_path))

    print(f'Shape of train_df : {train_df.shape}')
    print(f'Shape of test_df : {test_df.shape}')
    return train_df, test_df


def create_date_features(df, feature_name):
    '''
    Create new features related to dates

    The columns are added to ``df`` itself (the frame is modified in place)
    and the same frame is returned for convenience.

    df : The complete dataframe
    feature_name : Name of the feature of date type which needs to be decomposed.

    Added columns:
      uint32 : year, month, quarter, weekofyear, day, dayofweek, dayofyear,
               hour, minute, second
      bool   : is_month_start, is_month_end, is_quarter_start, is_quarter_end,
               is_year_start, is_year_end
    '''
    dates = df[feature_name].dt

    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0.
    # isocalendar().week gives the same ISO-8601 week number; fall back to the
    # old accessor only on pandas versions that predate isocalendar().
    if hasattr(dates, 'isocalendar'):
        week_of_year = dates.isocalendar().week
    else:
        week_of_year = dates.weekofyear

    df['year'] = dates.year.astype('uint32')
    df['month'] = dates.month.astype('uint32')
    df['quarter'] = dates.quarter.astype('uint32')
    df['weekofyear'] = week_of_year.astype('uint32')

    df['day'] = dates.day.astype('uint32')
    df['dayofweek'] = dates.dayofweek.astype('uint32')
    df['dayofyear'] = dates.dayofyear.astype('uint32')

    # Boundary flags are kept as booleans.
    for flag in ('is_month_start', 'is_month_end',
                 'is_quarter_start', 'is_quarter_end',
                 'is_year_start', 'is_year_end'):
        df[flag] = getattr(dates, flag)

    df['hour'] = dates.hour.astype('uint32')
    df['minute'] = dates.minute.astype('uint32')
    df['second'] = dates.second.astype('uint32')

    return df

In [14]:
train_df, test_df = read_data(CREATED_DATA_DIR)

train_ordered_column_names = ['site_id', 'building_id', 'timestamp', 'meter',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'meter_reading']

# Include row_id. All columns except meter_reading.
# Derived from the train list so the two orderings cannot drift apart.
test_ordered_column_names = ['row_id'] + [
    name for name in train_ordered_column_names if name != 'meter_reading'
]

# Order the column names in convenient order
train_df = train_df[train_ordered_column_names]
test_df = test_df[test_ordered_column_names]
print(f'Shape of train : {train_df.shape}')
# The second report describes the test frame (it was previously labelled "train").
print(f'Shape of test : {test_df.shape}')

Reading Data...


  labels, = index.labels


Shape of train_df : (20216100, 16)
Shape of test_df : (41697600, 16)
Shape of train : (20216100, 16)
Shape of train : (41697600, 16)


In [15]:
# Force a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

22

### Feature Creation

In [18]:
%%time
# Sort train and test based on time
train_df.sort_values(['timestamp', 'site_id', 'building_id'], inplace=True)
test_df.sort_values(['timestamp', 'site_id', 'building_id'], inplace=True)

# Do label encoding for the String type of feature
feature_name = 'primary_use'
lb = LabelEncoder()
lb.fit(list(train_df[feature_name].values) + list(test_df[feature_name].values))
train_df[feature_name] = lb.transform(list(train_df[feature_name].values))
test_df[feature_name] = lb.transform(list(test_df[feature_name].values))

# Add date related features.
train_df = create_date_features(train_df, 'timestamp')
test_df = create_date_features(test_df, 'timestamp')

# Remove time stamp and meter reading
predictors = ['site_id', 'building_id', 'meter', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed',
       'year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek',
       'dayofyear', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end', 'hour', 'minute',
       'second']
target =  'meter_reading'

CPU times: user 3min 16s, sys: 13.8 s, total: 3min 30s
Wall time: 2min 20s


In [20]:
# Feature matrices for both splits; the target is modelled in log1p space.
train_X, test_X = train_df[predictors], test_df[predictors]
train_Y = np.log1p(train_df[target])

for label, data in (('train_X', train_X), ('train_Y', train_Y), ('test_X', test_X)):
    print(f'Shape of {label} : {data.shape}')

Shape of train_X : (20216100, 30)
Shape of train_Y : (20216100,)
Shape of test_X : (41697600, 30)


In [21]:
%time
reg = lgb.LGBMRegressor(n_jobs=16)
reg.fit(train_X, train_Y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=16, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [22]:
# The fitted model is all that is needed from here on; drop the training
# frames and run a collection pass before predicting on the test set.
del train_X, train_Y, train_df
gc.collect()

3164

In [23]:
%time
prediction = reg.predict(test_X)

In [25]:
%time
prediction = np.expm1(prediction)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs


In [27]:
submission_df = feather.read_dataframe(f'{CREATED_DATA_DIR}/submission.feather')

# `prediction` is a plain array in the positional order of the sorted test_df.
# Assigning the row_id *Series* would align on index labels instead, which can
# pair row ids with predictions that belong to other rows. Copy the raw values
# so both columns follow the same positional order.
submission_df['row_id'] = test_df['row_id'].values

submission_df['meter_reading'] = prediction

  labels, = index.labels


In [31]:
# Write the submission without the index column so only the frame's own columns are saved.
submission_df.to_csv('submission_2.csv', index=False)

In [34]:
# Long listing of the working directory, human-readable sizes, newest first.
!ls -lht

total 1.5G
-rw-r--r-- 1 jupyter jupyter  12K Oct 25 04:30 submission_2.ipynb
-rw-r--r-- 1 jupyter jupyter 441M Oct 25 04:29 submission_2.zip
-rw-r--r-- 1 jupyter jupyter 1.1G Oct 25 04:28 submission_2.csv


In [33]:
# Compress the CSV into submission_2.zip, the file that gets uploaded.
! zip submission_2.zip submission_2.csv

  adding: submission_2.csv (deflated 59%)


In [35]:
# Delete the uncompressed CSV; only the zip is submitted.
! rm -rf submission_2.csv

In [37]:
# Upload the zip through the Kaggle CLI: -c competition, -f file, -m submission message.
# NOTE(review): the CLI is invoked via an absolute, user-specific path.
! /home/jupyter/.local/bin/kaggle competitions submit -c ashrae-energy-prediction -f submission_2.zip -m "On top of submission_1 just applied log1p on meter_reading"

100%|██████████| 441M/441M [00:19<00:00, 24.2MB/s] 
Successfully submitted to ASHRAE - Great Energy Predictor III