In [1]:
import pandas as pd
import numpy as np
import feather
import gc
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Let pandas render up to 100 rows and 100 columns before truncating a frame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [2]:
# Directory holding the pre-built feather files this notebook reads
# (train_merged.feather, test_merged.feather and submission.feather).
CREATED_DATA_DIR = '/home/jupyter/kaggle/energy/data/read_only_feather/v1'

In [3]:
def read_data(data_dir):
    '''
    Load the merged train and test frames from feather files.

    data_dir : Directory containing train_merged.feather and test_merged.feather.

    Prints the shape of each frame and returns the tuple (train_df, test_df).
    '''
    train_path = f'{data_dir}/train_merged.feather'
    test_path = f'{data_dir}/test_merged.feather'

    train_df = feather.read_dataframe(train_path)
    test_df = feather.read_dataframe(test_path)

    # Report the sizes right away so a wrong or truncated file is noticed early.
    for message in (f'Shape of train_df : {train_df.shape}',
                    f'Shape of test_df : {test_df.shape}'):
        print(message)

    return train_df, test_df


def create_date_features(df, feature_name):
    '''
    Decompose a datetime column into calendar and clock features.

    The new columns are written onto ``df`` itself (the input frame is
    modified in place) and the same object is returned.

    df : The complete dataframe
    feature_name : Name of the datetime column which needs to be decomposed.

    Columns added (in this order):
      uint32  : year, month, quarter, weekofyear, day, dayofweek, dayofyear
      boolean : is_month_start, is_month_end, is_quarter_start,
                is_quarter_end, is_year_start, is_year_end
      uint32  : hour, minute, second
    '''
    dates = df[feature_name].dt

    # ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in 2.0.
    # ``isocalendar().week`` is its replacement and gives the same ISO week
    # number; fall back to the old accessor on pandas versions that predate it.
    if hasattr(dates, 'isocalendar'):
        week_of_year = dates.isocalendar().week
    else:
        week_of_year = dates.weekofyear

    calendar_parts = [
        ('year', dates.year),
        ('month', dates.month),
        ('quarter', dates.quarter),
        ('weekofyear', week_of_year),
        ('day', dates.day),
        ('dayofweek', dates.dayofweek),
        ('dayofyear', dates.dayofyear),
    ]
    boundary_flags = [
        'is_month_start', 'is_month_end',
        'is_quarter_start', 'is_quarter_end',
        'is_year_start', 'is_year_end',
    ]
    clock_parts = [
        ('hour', dates.hour),
        ('minute', dates.minute),
        ('second', dates.second),
    ]

    for column, values in calendar_parts:
        df[column] = values.astype('uint32')

    # The period-boundary indicators are kept as booleans.
    for column in boundary_flags:
        df[column] = getattr(dates, column)

    for column, values in clock_parts:
        df[column] = values.astype('uint32')

    return df

In [4]:
train_df, test_df = read_data(CREATED_DATA_DIR)

# Column order for the training frame; the target (meter_reading) comes last.
train_ordered_column_names = ['site_id', 'building_id', 'timestamp', 'meter',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'meter_reading']

# Test frame: row_id first, then every training column except meter_reading.
# Derived from the train list so the two orderings cannot drift apart.
test_ordered_column_names = ['row_id'] + [
    name for name in train_ordered_column_names if name != 'meter_reading'
]

# Order the column names in convenient order
train_df = train_df[train_ordered_column_names]
test_df = test_df[test_ordered_column_names]
print(f'Shape of train : {train_df.shape}')
# The second message previously also said "train" although it reports test_df.
print(f'Shape of test : {test_df.shape}')

  labels, = index.labels


Shape of train_df : (20216100, 16)
Shape of test_df : (41697600, 16)
Shape of train : (20216100, 16)
Shape of train : (41697600, 16)


In [6]:
# Run a garbage-collection pass now; the cell displays the value it returns.
gc.collect()

22

### Feature Creation

In [7]:
# Put both frames in chronological order, breaking ties by site and then building.
# The sort is done in place, so each frame keeps its original index labels.
sort_keys = ['timestamp', 'site_id', 'building_id']
for frame in (train_df, test_df):
    frame.sort_values(sort_keys, inplace=True)

In [46]:
# predictors = ['site_id', 'building_id', 'timestamp', 'meter', 'primary_use',
#        'square_feet', 'year_built', 'floor_count', 'air_temperature',
#        'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
#        'sea_level_pressure', 'wind_direction', 'wind_speed']
# target = 'meter_reading'

# train_index = train_df.shape[0]
# combined_df = pd.concat([train_df[predictors], test_df[predictors]], axis=0)

# print(f'Shape of combined_df : {combined_df.shape}')

# del combined_df
# gc.collect()

In [8]:
# Label-encode the string feature so it is stored as integer codes.
feature_name = 'primary_use'
lb = LabelEncoder()
# The encoder only needs the distinct labels, so fit on the unique values of
# train and test together instead of two full-length Python lists.
lb.fit(np.concatenate([
    np.asarray(train_df[feature_name].unique()),
    np.asarray(test_df[feature_name].unique()),
]))
# Transform the column arrays directly; the list() copies were unnecessary.
train_df[feature_name] = lb.transform(np.asarray(train_df[feature_name]))
test_df[feature_name] = lb.transform(np.asarray(test_df[feature_name]))

In [9]:
# Expand the 'timestamp' column of each frame into calendar/clock features.
train_df = create_date_features(df=train_df, feature_name='timestamp')
test_df = create_date_features(df=test_df, feature_name='timestamp')

In [10]:
# Model inputs: every column except the raw timestamp and the target.
# Columns that were present in the loaded frames.
metadata_columns = ['site_id', 'building_id', 'meter', 'primary_use',
                    'square_feet', 'year_built', 'floor_count']
weather_columns = ['air_temperature', 'cloud_coverage', 'dew_temperature',
                   'precip_depth_1_hr', 'sea_level_pressure',
                   'wind_direction', 'wind_speed']
# Columns produced by create_date_features.
date_columns = ['year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek',
                'dayofyear', 'is_month_start', 'is_month_end',
                'is_quarter_start', 'is_quarter_end', 'is_year_start',
                'is_year_end', 'hour', 'minute', 'second']

predictors = metadata_columns + weather_columns + date_columns
target = 'meter_reading'

In [11]:
# Feature matrices for both frames, plus the training target vector.
train_X = train_df.loc[:, predictors]
test_X = test_df.loc[:, predictors]
train_Y = train_df.loc[:, target]

# Sanity check: the two feature matrices must have the same number of columns.
print(f'Shape of train_X : {train_X.shape}')
print(f'Shape of train_Y : {train_Y.shape}')
print(f'Shape of test_X : {test_X.shape}')

Shape of train_X : (20216100, 30)
Shape of train_Y : (20216100,)
Shape of test_X : (41697600, 30)


In [12]:
%who DataFrame

test_X	 test_df	 train_X	 train_df	 


In [13]:
%who

CREATED_DATA_DIR	 LabelEncoder	 create_date_features	 feather	 feature_name	 gc	 lb	 lgb	 np	 
pd	 predictors	 read_data	 target	 test_X	 test_df	 test_ordered_column_names	 train_X	 train_Y	 
train_df	 train_ordered_column_names	 


In [14]:
# Baseline LightGBM regressor: only the thread count is set, everything else
# is left at the library defaults. Trained on the full training set, with no
# validation split.
reg = lgb.LGBMRegressor(n_jobs=16)
reg.fit(X=train_X, y=train_Y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=16, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [16]:
# The training frames are not used after fitting; drop the names so their
# memory can be reclaimed before predicting on the test set.
del train_X
del train_Y
del train_df
gc.collect()

5715

In [None]:
# Score the test features; the result follows the row order of test_X.
prediction = reg.predict(X=test_X)

In [21]:
# Display the predicted values for a quick visual check.
prediction

array([  78.46021408,   78.46021408,   78.46021408,   78.46021408,
        321.19454048,   78.46021408,   47.35351131,  321.19454048,
       1914.56117302,  150.95286668])

In [27]:
# Load the submission template; both of its columns are overwritten below.
submission_df = feather.read_dataframe(f'{CREATED_DATA_DIR}/submission.feather')

# `prediction` is a plain array in the current row order of test_df, so row_id
# has to be taken in that same positional order. Assigning the Series itself
# would align it on index labels instead; test_df was re-sorted in place, so
# its labels are permuted and predictions could be paired with the wrong row_id.
row_ids = test_df['row_id'].values
if not (len(submission_df) == len(row_ids) == len(prediction)):
    raise ValueError(
        'Row count mismatch: '
        f'template={len(submission_df)}, test={len(row_ids)}, '
        f'prediction={len(prediction)}'
    )

submission_df['row_id'] = row_ids
submission_df['meter_reading'] = prediction

In [29]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission_1.csv', index=False)

In [30]:
# Preview the first five rows of the submission.
submission_df.head(n=5)

Unnamed: 0,row_id,meter_reading
0,0,78.460214
1,1,78.460214
2,2,78.460214
3,3,78.460214
4,4,321.19454


In [32]:
! /home/jupyter/.local/bin/kaggle competitions submit -c ashrae-energy-prediction -f submission_1.csv -m "1st baseline sub with just day features, LGB, No validation"

100%|██████████| 1.05G/1.05G [00:34<00:00, 32.5MB/s] 
Successfully submitted to ASHRAE - Great Energy Predictor III