In [1]:
import pandas as pd
import numpy as np
import random
import feather
import gc
import json
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import lightgbm as lgb
from sklearn.model_selection import KFold

# Let pandas display up to 100 rows / 100 columns before truncating the output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [2]:
# Directory that read_data() loads train_merged.feather / test_merged.feather from.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
CREATED_DATA_DIR = '/home/jupyter/kaggle/energy/data/read_only_feather/v1'

def set_seed(seed=0):
    """
    Seed the global random number generators used in this notebook.

    seed : Integer seed handed to both Python's `random` module and NumPy's
           legacy global generator.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


def read_data(data_dir):
    """
    Load the merged train and test frames from feather files.

    data_dir : Directory containing train_merged.feather and test_merged.feather.

    Returns a (train_df, test_df) tuple and prints the shape of each frame.
    """
    print('Reading Data...')
    train_path = f'{data_dir}/train_merged.feather'
    test_path = f'{data_dir}/test_merged.feather'

    # Train is read before test, as in the original ordering.
    train_df = feather.read_dataframe(train_path)
    test_df = feather.read_dataframe(test_path)

    for label, frame in (('train_df', train_df), ('test_df', test_df)):
        print(f'Shape of {label} : {frame.shape}')
    return train_df, test_df


def create_date_features(df, feature_name):
    '''
    Create new features related to dates.

    df : The complete dataframe. It is modified in place (new columns are
         added) and also returned.
    feature_name : Name of the datetime-typed column which needs to be decomposed.

    Adds the columns: year, month, quarter, weekofyear, day, dayofweek,
    dayofyear, is_month_start, is_month_end, is_quarter_start, is_quarter_end,
    is_year_start, is_year_end and hour.
    '''
    print('Creating date related fetaures...')
    dates = df[feature_name].dt

    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
    # Prefer the ISO-calendar accessor when it exists and fall back to the
    # legacy attribute on older pandas, so the function works on both.
    if hasattr(dates, 'isocalendar'):
        week = dates.isocalendar().week
    else:
        week = dates.weekofyear

    df['year'] = dates.year.astype('uint16')
    df['month'] = dates.month.astype('uint8')
    df['quarter'] = dates.quarter.astype('uint8')
    df['weekofyear'] = week.astype('uint8')

    df['day'] = dates.day.astype('uint16')
    df['dayofweek'] = dates.dayofweek.astype('uint8')
    df['dayofyear'] = dates.dayofyear.astype('uint16')

    # Boolean calendar-boundary flags are stored as returned by pandas.
    df['is_month_start'] = dates.is_month_start
    df['is_month_end'] = dates.is_month_end
    df['is_quarter_start'] = dates.is_quarter_start
    df['is_quarter_end'] = dates.is_quarter_end
    df['is_year_start'] = dates.is_year_start
    df['is_year_end'] = dates.is_year_end

    df['hour'] = dates.hour.astype('uint8')
    return df


def get_data_splits_by_fraction(dataframe, valid_fraction=0.1):
    """
    Creating holdout set from the train data based on fraction.

    The split is positional (no shuffling): the leading rows become the
    training part and the trailing `valid_fraction` of the rows become the
    holdout.

    dataframe : Sliceable frame to split.
    valid_fraction : Share of rows (between 0 and 1) assigned to the holdout.

    Returns a (train, validation) tuple.
    Raises ValueError if valid_fraction is outside [0, 1].
    """
    if not 0 <= valid_fraction <= 1:
        raise ValueError(f'valid_fraction must be within [0, 1], got {valid_fraction}')

    print(f'Splitting the data into train and holdout with validation fraction {valid_fraction}...')
    valid_size = int(len(dataframe) * valid_fraction)
    # The holdout is the *last* valid_size rows; previously the two parts were
    # swapped, so valid_fraction actually controlled the training size.
    split_at = len(dataframe) - valid_size
    train = dataframe[:split_at]
    validation = dataframe[split_at:]
    print(f'Shape of the training data {train.shape} ')
    print(f'Shape of the validation data {validation.shape}')
    return train, validation


def get_data_splits_by_month(dataframe, train_months, validation_months):
    """
    Build a training part and a holdout part by selecting rows on `month`.

    dataframe : Frame that has a `month` column.
    train_months : Month values whose rows go to the training part.
    validation_months : Month values whose rows go to the holdout part.

    Returns a (training, validation) tuple; rows whose month is in neither
    list are dropped from both.
    """
    print(f'Splitting the data into train and holdout based on months...')
    print(f'Training months {train_months}')
    print(f'Validation months {validation_months}')

    month_values = dataframe.month
    in_training = month_values.isin(train_months)
    in_validation = month_values.isin(validation_months)

    training = dataframe[in_training]
    validation = dataframe[in_validation]

    print(f'Shape of the training data {training.shape} ')
    print(f'Shape of the validation data {validation.shape}')
    return training, validation


def train_model(training, validation, predictors, taget, params, test_X=None):
    """
    Fit a LightGBM model on `training` and score it on `validation`.

    The target is transformed with log1p before fitting, so the reported
    validation score is an RMSE on the log1p scale.

    training : Frame with the rows to fit on.
    validation : Frame with the holdout rows, passed to LightGBM as the
                 validation set and used for the final score.
    predictors : List of feature column names.
    taget : Name of the target column. The misspelled parameter name is kept
            so existing callers keep working; the body now uses this argument
            instead of silently reading a global `target`.
    params : LightGBM parameter dictionary handed to lgb.train.
    test_X : Currently unused; accepted for interface compatibility.

    Returns a (bst, valid_score) tuple regardless of whether test_X is given.
    """
    train_X = training[predictors]
    train_Y = np.log1p(training[taget])
    validation_X = validation[predictors]
    validation_Y = np.log1p(validation[taget])

    print(f'Shape of train_X : {train_X.shape}')
    print(f'Shape of train_Y : {train_Y.shape}')
    print(f'Shape of validation_X : {validation_X.shape}')
    print(f'Shape of validation_Y : {validation_Y.shape}')

    dtrain = lgb.Dataset(train_X, label=train_Y)
    dvalid = lgb.Dataset(validation_X, validation_Y)

    print("Training model!")
    bst = lgb.train(params, dtrain, valid_sets=[dvalid], verbose_eval=100)

    valid_prediction = bst.predict(validation_X)
    valid_score = np.sqrt(metrics.mean_squared_error(validation_Y, valid_prediction))
    print(f'Validation Score {valid_score}')

    # Always return the model and score: the previous `test_X is not None`
    # branch returned None, which broke callers that unpack two values.
    return bst, valid_score


def make_prediction(df_train_X, df_train_Y, df_test_X, params, n_splits=5):
    """
    Run K-fold cross validation with LightGBM and average the test predictions.

    df_train_X : Training features (DataFrame; rows are selected with .iloc).
    df_train_Y : Training target (Series). Test predictions are passed through
                 expm1, so this is presumably already on the log1p scale —
                 confirm against the caller.
    df_test_X : Test features (DataFrame).
    params : LightGBM parameter dictionary handed to lgb.train.
    n_splits : Number of contiguous (unshuffled) folds.

    Returns a dict with:
        yoof          - out-of-fold predictions, on the same scale as df_train_Y
        prediction    - mean over folds of expm1(test prediction)
        oof_score     - RMSE of yoof against df_train_Y (rounded to 5 places)
        cv_scores     - per-fold RMSE on the out-of-fold rows
        avg_cv_scores - mean of cv_scores (rounded to 5 places)
        std_cv_scores - standard deviation of cv_scores (rounded to 5 places)
    """
    yoof = np.zeros(len(df_train_X))
    yhat = np.zeros(len(df_test_X))
    cv_scores = []
    result_dict = {}

    # With shuffle=False the folds are deterministic, so random_state has no
    # effect. Passing one raises ValueError on scikit-learn >= 0.24 and also
    # made this function depend on a global SEED; it is therefore omitted.
    kf = KFold(n_splits=n_splits, shuffle=False)

    for fold, (in_index, oof_index) in enumerate(kf.split(df_train_X, df_train_Y), start=1):
        print(f'fold {fold} of {n_splits}')
        X_in, X_oof = df_train_X.iloc[in_index].values, df_train_X.iloc[oof_index].values
        y_in, y_oof = df_train_Y.iloc[in_index].values, df_train_Y.iloc[oof_index].values

        lgb_train = lgb.Dataset(X_in, y_in)
        lgb_eval = lgb.Dataset(X_oof, y_oof, reference=lgb_train)

        model = lgb.train(
            params,
            lgb_train,
            valid_sets = [lgb_train, lgb_eval],
            verbose_eval = 50,
        )

        # Release the in-fold copies before predicting to keep peak memory down.
        del lgb_train, lgb_eval, in_index, X_in, y_in
        gc.collect()

        print('Training completed')
        yoof[oof_index] = model.predict(X_oof)
        print('OOF Prediction completed.')
        prediction = model.predict(df_test_X.values)
        print('Shape of prediction', prediction.shape)
        # Test predictions are accumulated after inverting the log1p transform.
        yhat += np.expm1(prediction)
        print('Prediction completed')
        cv_oof_score = np.sqrt(metrics.mean_squared_error(y_oof, yoof[oof_index]))
        print(f'CV OOF Score for fold {fold} is {cv_oof_score}')
        cv_scores.append(cv_oof_score)

        del oof_index, X_oof, y_oof
        gc.collect()

    # Turn the accumulated sum into the mean over folds.
    yhat /= n_splits

    oof_score = round(np.sqrt(metrics.mean_squared_error(df_train_Y, yoof)), 5)
    avg_cv_scores = round(sum(cv_scores)/len(cv_scores), 5)
    std_cv_scores = round(np.array(cv_scores).std(), 5)

    print(f'Combined OOF score : {oof_score}')
    print(f'Average of {fold} folds OOF score {avg_cv_scores}')
    print(f'std of {fold} folds OOF score {std_cv_scores}')

    result_dict['yoof'] = yoof
    result_dict['prediction'] = yhat
    result_dict['oof_score'] = oof_score
    result_dict['cv_scores'] = cv_scores
    result_dict['avg_cv_scores'] = avg_cv_scores
    result_dict['std_cv_scores'] = std_cv_scores

    return result_dict



In [3]:
# Global seed: passed to set_seed() below and used as the LightGBM 'seed' parameter.
SEED = 42
# NOTE(review): not referenced anywhere in the visible notebook; the name ends in a
# lower-case "l" (LOCAl_TEST) — confirm the intended spelling before relying on it.
LOCAl_TEST = True
set_seed(SEED)
# Name of the column to predict.
target = 'meter_reading'

In [4]:
%%time
train_df, test_df = read_data(CREATED_DATA_DIR)

# Train columns in a convenient order, with the target last.
train_ordered_column_names = [
    'site_id', 'building_id', 'timestamp', 'meter',
    'primary_use', 'square_feet', 'year_built', 'floor_count',
    'air_temperature', 'cloud_coverage', 'dew_temperature',
    'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
    'wind_speed', 'meter_reading',
]

# Test columns: row_id first, then every train column except meter_reading.
test_ordered_column_names = [
    'row_id', 'site_id', 'building_id', 'timestamp', 'meter',
    'primary_use', 'square_feet', 'year_built', 'floor_count',
    'air_temperature', 'cloud_coverage', 'dew_temperature',
    'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
    'wind_speed',
]

# Re-select the columns in the order defined above.
train_df = train_df[train_ordered_column_names]
test_df = test_df[test_ordered_column_names]
print(f'Shape of train : {train_df.shape}')
print(f'Shape of test : {test_df.shape}')

# Order both frames chronologically, then by site and building.
print('Sorting values based on timestamp, site_id, building_id...')
train_df = train_df.sort_values(['timestamp', 'site_id', 'building_id'])
test_df = test_df.sort_values(['timestamp', 'site_id', 'building_id'])

Reading Data...


  labels, = index.labels


Shape of train_df : (20216100, 16)
Shape of test_df : (41697600, 16)
Shape of train : (20216100, 16)
Shape of test : (41697600, 16)
Sorting values based on timestamp, site_id, building_id...
CPU times: user 55.4 s, sys: 21.8 s, total: 1min 17s
Wall time: 3min 28s


### Feature Engineering

1. Create date features
2. LabelEncoding for primary_use

In [5]:
%%time
# Do label encoding for the String type of feature
feature_name = 'primary_use'
print('Label Encoding...')
lb = LabelEncoder()
# Fit on the distinct categories from train and test only. The encoder learns
# the same classes as fitting on every row, without first concatenating two
# Python lists that together hold one string per train/test row.
lb.fit(list(set(train_df[feature_name].unique()) | set(test_df[feature_name].unique())))
train_df[feature_name] = lb.transform(list(train_df[feature_name].values))
test_df[feature_name] = lb.transform(list(test_df[feature_name].values))
print('Label Encoding completed...')

# Add date related features.
train_df = create_date_features(train_df, 'timestamp')
test_df = create_date_features(test_df, 'timestamp')

Label Encoding...
Label Encoding completed...
Creating date related fetaures...
Creating date related fetaures...
CPU times: user 4min 37s, sys: 1min 45s, total: 6min 23s
Wall time: 3min 30s


In [6]:
# Remove time stamp and meter reading
# Feature columns fed to the model: the raw columns (minus timestamp and the
# target) followed by the date features added by create_date_features().
predictors = ['site_id', 'building_id', 'meter', 'primary_use',
       'square_feet', 'year_built', 'floor_count', 'air_temperature',
       'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed',
       'year', 'month', 'quarter', 'weekofyear', 'day', 'dayofweek',
       'dayofyear', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end', 'hour']

#Copied the params from Konstatien's kernel
# This dictionary is passed unchanged to lgb.train() by train_model() and
# make_prediction().
lgb_params = {
                'objective':'regression',
                'boosting_type':'gbdt',
                'metric':'rmse',
                'n_jobs':-1,
                'learning_rate':0.05,
                'num_leaves': 2**8,
                'max_depth':-1,
                'tree_learner':'serial',
                'colsample_bytree': 0.9,
                'subsample_freq':1,
                'subsample':0.5,
                'n_estimators':2000,
                'max_bin':255,
                'verbose':-1,
                'seed': SEED,
                'early_stopping_rounds':100, 
                }

### Validation 1 : 50% Training : 50% Holdout split without shuffle

In [7]:
# Positional 50/50 split of the time-sorted train_df (no shuffling).
training, validation = get_data_splits_by_fraction(train_df, valid_fraction=0.5)

Splitting the data into train and holdout with validation fraction 0.5...
Shape of the training data (10108050, 30) 
Shape of the validation data (10108050, 30)


In [8]:
# Fit on the training part and report the RMSE on the holdout part.
bst, valid_score = train_model(training, validation, predictors, target, params=lgb_params)

Shape of train_X : (10108050, 28)
Shape of train_Y : (10108050,)
Shape of validation_X : (10108050, 28)
Shape of validation_Y : (10108050,)
Training model!




Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.59343
Early stopping, best iteration is:
[54]	valid_0's rmse: 1.56869
Validation Score 1.5686904753390896


In [9]:
# Drop the Validation 1 objects and force a garbage-collection pass to free memory.
del training, validation, bst, valid_score
gc.collect()

135

### Validation 2 : Train on first 4 months, skip next 4 months, validate on last 4 months

In [10]:
# Training on 1st four months
train_months = [1, 2, 3, 4]
# Holdout on last four months
validation_months = [9, 10, 11, 12]

# Rows from months 5-8 are in neither list and are therefore left out of both parts.
training, validation = get_data_splits_by_month(train_df, train_months=train_months, validation_months=validation_months)

Splitting the data into train and holdout based on months...
Training months [1, 2, 3, 4]
Validation months [9, 10, 11, 12]
Shape of the training data (6465489, 30) 
Shape of the validation data (6857643, 30)


In [11]:
# Fit on the training months and report the RMSE on the holdout months.
bst, valid_score = train_model(training, validation, predictors, target, params=lgb_params)

Shape of train_X : (6465489, 28)
Shape of train_Y : (6465489,)
Shape of validation_X : (6857643, 28)
Shape of validation_Y : (6857643,)
Training model!




Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.70705
Early stopping, best iteration is:
[37]	valid_0's rmse: 1.6958
Validation Score 1.695804545268327


In [12]:
# Drop the Validation 2 objects and force a garbage-collection pass to free memory.
del train_months, validation_months, training, validation, bst, valid_score
gc.collect()

373

### Make Prediction on test data

In [13]:
# Model inputs for the final fit: features, log1p-transformed target, test
# features, and the test row_id column kept alongside test_X.
train_X = train_df[predictors]
train_Y = np.log1p(train_df[target])
test_X = test_df[predictors]
test_row_id = test_df['row_id']

print(f'Size of train_X {train_X.shape}')
print(f'Size of train_Y {train_Y.shape}')
print(f'Size of test_X {test_X.shape}')
print(f'Size of test_row_id {test_row_id.shape}')

Size of train_X (20216100, 28)
Size of train_Y (20216100,)
Size of test_X (41697600, 28)
Size of test_row_id (41697600,)


In [14]:
# The full frames are no longer needed once the model inputs have been extracted.
del train_df, test_df
gc.collect()

44

In [None]:
# 5-fold cross validation on the full training set; test predictions are averaged over folds.
result_dict = make_prediction(train_X, train_Y, test_X, params=lgb_params, n_splits=5)

fold 1 of 5




Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 1.1718	valid_1's rmse: 1.30494
[100]	training's rmse: 0.984889	valid_1's rmse: 1.13393
[150]	training's rmse: 0.898141	valid_1's rmse: 1.08127
[200]	training's rmse: 0.84526	valid_1's rmse: 1.04139
[250]	training's rmse: 0.806716	valid_1's rmse: 1.01566
[300]	training's rmse: 0.779948	valid_1's rmse: 0.997234
[350]	training's rmse: 0.755243	valid_1's rmse: 0.983163
[400]	training's rmse: 0.736672	valid_1's rmse: 0.975835
[450]	training's rmse: 0.721366	valid_1's rmse: 0.966542
[500]	training's rmse: 0.704898	valid_1's rmse: 0.959113
[550]	training's rmse: 0.691294	valid_1's rmse: 0.955199
[600]	training's rmse: 0.679722	valid_1's rmse: 0.950524
[650]	training's rmse: 0.67059	valid_1's rmse: 0.947532
[700]	training's rmse: 0.660253	valid_1's rmse: 0.944575
[750]	training's rmse: 0.651208	valid_1's rmse: 0.941595
[800]	training's rmse: 0.643628	valid_1's rmse: 0.939291
[850]	training's rmse: 0.636312	vali



Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 1.16839	valid_1's rmse: 1.32043
[100]	training's rmse: 0.985335	valid_1's rmse: 1.19483
[150]	training's rmse: 0.89868	valid_1's rmse: 1.14604
[200]	training's rmse: 0.837092	valid_1's rmse: 1.11511
[250]	training's rmse: 0.797243	valid_1's rmse: 1.09898
[300]	training's rmse: 0.766515	valid_1's rmse: 1.08743
[350]	training's rmse: 0.744	valid_1's rmse: 1.07756


In [None]:
# Number of entries in the dictionary returned by make_prediction().
len(result_dict)

In [None]:
# make_prediction() returns a plain dict, so the averaged test predictions must
# be read by key; attribute access (result_dict.prediction) raises AttributeError.
predictions = result_dict['prediction']