Training and Predicting
===========================================================

#### Forked From Kaggle - Andy Harless - XGBoost, LightGBM, OLS and NN
Public Score
0.0643646

1. Global Hyper Parameters
---------------------------------------

In [1]:
FUDGE_FACTOR = 1.1200  # Multiply forecasts by this

XGB_WEIGHT = 0.6200
BASELINE_WEIGHT = 0.0100
OLS_WEIGHT = 0.0620
NN_WEIGHT = 0.0800

XGB1_WEIGHT = 0.8000  # Weight of first in combination of two XGB models

BASELINE_PRED = 0.0115   # Baseline based on mean of training data, per Oleg

2. Import Packages
-----------------------------

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,Normalizer
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,make_scorer
from sklearn.model_selection import cross_val_score,KFold
import random
import datetime as dt
from collections import OrderedDict
import copy

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

Using Theano backend.
Using cuDNN version 5110 on context None
Preallocating 9011/11264 Mb (0.800000) on cuda0
Mapped name None to device cuda0: GeForce GTX 1080 Ti (0000:06:00.0)


3. Preparation
--------------------------------

In [3]:
month_train=range(1,13)
month_test=range(10,13)
category_features=['propertyzoningdesc','taxdelinquencyyear','propertycountylandusecode','airconditioningtypeid', 'architecturalstyletypeid', 'buildingclasstypeid', 'buildingqualitytypeid', 'decktypeid', 'fips', 'hashottuborspa', 'heatingorsystemtypeid', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'propertylandusetypeid', 'regionidcounty', 'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid', 'numberofstories', 'fireplaceflag', 'taxdelinquencyflag']+['basementsqft', 'finishedfloor1squarefeet', 'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet50', 'finishedsquarefeet6', 'fireplacecnt', 'garagecarcnt', 'garagetotalsqft', 'poolsizesum', 'yardbuildingsqft17', 'yardbuildingsqft26']
numerical_features=[ 'yearbuilt', 'assessmentyear','bathroomcnt', 'bedroomcnt', 'calculatedbathnbr', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'fullbathcnt', 'lotsizesquarefeet', 'roomcnt', 'unitcnt', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxamount']
extra_features=['latitude', 'longitude',   'rawcensustractandblock', 'regionidcity', 'regionidzip',  'censustractandblock']
prediction_dates=['2016-10-15','2016-11-15','2016-12-15','2017-10-15','2017-11-15','2017-12-15']
prediction_columns=['201610','201611','201612','201710','201711','201712']
prop_lightgbm_path='Data/prop_clean.csv'
prop_xgboost_path='Data/prop_clean.csv'
prop_ols_path='Data/prop_clean.csv'
prop_nn_path='Data/prop_clean.csv'
train_path='Data/train.csv'
sample_path='Data/bak/sample_submission.csv'

In [4]:
def loader(properties_path,drop_list,outlier_bound=(None,None),parse_date=False,labelencode=False,onehotencode=False,test_only=False):
    print( "\nReading data from disk ...")
    print("   Read properties file ...")
    properties = pd.read_csv(properties_path)
    print("   ...")
    if onehotencode:
        print("   One Hot Encoding ...")
        properties=pd.get_dummies(properties, columns=category_features, drop_first=False)
        print("   ...")
    if labelencode:
        print("   Label Encoding ...")
        for c in category_features:
            le=LabelEncoder()
            le.fit(properties[c])
            properties[c]=le.transform(properties[c])
        print("   ...")
    print("   Read training file ...")
    train = pd.read_csv(train_path, parse_dates=["transactiondate"])
    print("   ...")
    train_properties = train.merge(properties, how='left', on='parcelid')
    if parse_date:
        train_properties["transactiondate_year"] = train_properties["transactiondate"].dt.year
        train_properties["transactiondate_month"] = train_properties["transactiondate"].dt.month
    if outlier_bound[0] is not None:
        train_properties=train_properties[train_properties.logerror > outlier_bound[0]]
    if outlier_bound[1] is not None:
        train_properties=train_properties[train_properties.logerror < outlier_bound[1]]
    train_properties=train_properties[train_properties['transactiondate'].dt.month.isin(month_train)]
    train_drop_list=copy.copy(drop_list)
    train_drop_list_all=copy.copy(train_drop_list)
    for dropitem in train_drop_list_all:
        if dropitem not in train_properties.columns.tolist():
            train_drop_list.remove(dropitem)
    y_train = train_properties['logerror']
    x_train = train_properties.drop(train_drop_list, axis=1)
    print(x_train.shape, y_train.shape)
    print("\nPrepare for prediction ...")
    print("   Read sample file ...")
    sample = pd.read_csv(sample_path)
    sample['parcelid'] = sample['ParcelId']
    print("   ...")
    print("   Merge with property data ...")
    if test_only:
        test_properties = train.merge(properties, how='left', on='parcelid')
        test_properties=test_properties[test_properties['transactiondate'].dt.month.isin(month_test)]
    else:
        test_properties = sample.merge(properties, on='parcelid', how='left')
    parcelid=test_properties['parcelid']
    if parse_date:
        test_properties["transactiondate_year"] = 2016.5
        test_properties["transactiondate_month"] = np.mean(month_test)
    test_drop_list=copy.copy(drop_list)+['ParcelId','201610','201611','201612','201710','201711','201712']
    test_drop_list_all=copy.copy(test_drop_list)
    for dropitem in test_drop_list_all:
        if dropitem not in test_properties.columns.tolist():
            test_drop_list.remove(dropitem)
    x_test=test_properties.drop(test_drop_list, axis=1)
    print(x_test.shape)
    return x_train,y_train,x_test,parcelid

In [5]:
def normalize(x_test):
    ssr=StandardScaler()
    ssr.fit(x_test)
    return ssr

4. LightGBM
-------------------------

### Process Data for LightGBM

In [7]:
print( "\nProcessing data for LightGBM ..." )

x_train,y_train,x_test,parcelid=loader(prop_lightgbm_path,['parcelid', 'logerror', 'transactiondate'],labelencode=True)
features=category_features+numerical_features+extra_features
d_train = lgb.Dataset(x_train[features], label=y_train, categorical_feature=category_features,free_raw_data=False)


Processing data for LightGBM ...

Reading data from disk ...
   Read properties file ...
   ...
   Label Encoding ...
   ...
   Read training file ...
   ...
((167888, 57), (167888L,))

Prepare for prediction ...
   Read sample file ...
   ...
   Merge with property data ...
(2985217, 57)


### Set LightGBM Hyper Parameters

In [8]:
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.345    # feature_fraction (small values => use very different submodels)
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3

np.random.seed(0)
random.seed(0)

### Cross Validation For LightGBM

In [9]:
print("\nCross validing LightGBM model ...")

score=lgb.cv(params, d_train, 430, metrics='mae', categorical_feature=category_features)

print('Cross Validated Mean Absolute Error :     %s'%(str(score.values()[0][-1])))
print('Cross Validated Mean Absolute Error Std :     %s'%(np.mean(score.values()[1][-1])))


Cross validing LightGBM model ...




Cross Validated Mean Absolute Error :     0.0623537715374
Cross Validated Mean Absolute Error Std :     0.00883101016585


### Run LightGBM

In [10]:
print("\nFitting LightGBM model ...")

clf = lgb.train(params, d_train, 430, categorical_feature=category_features)


Fitting LightGBM model ...


### LightGBM Prediction

In [11]:
print("\nStart LightGBM prediction ...")

lgb_pred = clf.predict(x_test)

print( "\nUnadjusted LightGBM predictions:" )
print( pd.DataFrame(lgb_pred).head() )


Start LightGBM prediction ...

Unadjusted LightGBM predictions:
          0
0  0.027944
1  0.029488
2  0.029634
3  0.028672
4  0.029576


### Clean for LightGBM

In [12]:
del x_train,y_train,x_test,parcelid,d_train,score
gc.collect()

136

5. XGBoost
-----------------------------

### Process Data For XGBoost

In [13]:
print( "\nProcessing data for XGBoost ...")

x_train,y_train,x_test,parcelid=loader(prop_lightgbm_path,['parcelid', 'logerror', 'transactiondate'],outlier_bound=(-0.4,0.419),labelencode=True)
features=category_features+numerical_features+extra_features
y_mean = np.mean(y_train)
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)


Processing data for XGBoost ...

Reading data from disk ...
   Read properties file ...
   ...
   Label Encoding ...
   ...
   Read training file ...
   ...
((164477, 57), (164477L,))

Prepare for prediction ...
   Read sample file ...
   ...
   Merge with property data ...
(2985217, 57)


### Set XGBoost Hyper Parameters

In [14]:
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}
num_boost_rounds = 250

### Cross Validation For XGBoost

In [15]:
print("\nCross validing XGBoost model ...")

score=xgb.cv(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds, nfold=5, metrics='mae')

print('Cross Validated Mean Absolute Error :     %s'%(str(score.iloc[-1,0])))
print('Cross Validated Mean Absolute Error Std :     %s'%(str(score.iloc[-1,1])))


Cross validing XGBoost model ...
Cross Validated Mean Absolute Error :     0.0525704
Cross Validated Mean Absolute Error Std :     0.000451096708035


### Run XGBoost

In [16]:
print("\nFitting XGBoost model ...")

model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


Fitting XGBoost model ...


### XGBoost Prediction

In [17]:
print( "\nPredicting with XGBoost ...")

xgb_pred1 = model.predict(dtest)

print( "\nFirst XGBoost predictions:" )
print( pd.DataFrame(xgb_pred1).head() )


Predicting with XGBoost ...

First XGBoost predictions:
          0
0  0.013468
1 -0.006351
2 -0.045269
3 -0.009459
4 -0.046619


### Set XGBoost Hyper Parameters Again

In [18]:
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}
num_boost_rounds = 150

### Cross Validation For XGBoost  Again

In [19]:
print("\nCross validing LightGBM model ...")

score=xgb.cv(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds, nfold=5, metrics='mae')

print('Cross Validated Mean Absolute Error :     %s'%(str(score.iloc[-1,0])))
print('Cross Validated Mean Absolute Error Std :     %s'%(str(score.iloc[-1,1])))


Cross validing LightGBM model ...
Cross Validated Mean Absolute Error :     0.0526002
Cross Validated Mean Absolute Error Std :     0.000467388660539


### Run XGBoost Again

In [20]:
print( "\nTraining XGBoost again ...")

model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)


Training XGBoost again ...


### XGBoost Prediction Again

In [21]:
print( "\nPredicting with XGBoost again ...")

xgb_pred2 = model.predict(dtest)

print( "\nSecond XGBoost predictions:" )
print( pd.DataFrame(xgb_pred2).head() )


Predicting with XGBoost again ...

Second XGBoost predictions:
          0
0  0.029393
1 -0.003237
2 -0.051331
3 -0.001733
4 -0.039052


### Combine Twice XGBoost Predictions

In [22]:
xgb_pred = XGB1_WEIGHT*xgb_pred1 + (1-XGB1_WEIGHT)*xgb_pred2

print( "\nCombined XGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )


Combined XGBoost predictions:
          0
0  0.016653
1 -0.005728
2 -0.046481
3 -0.007914
4 -0.045106


### Clean for XGBoost

In [23]:
del x_train,y_train,x_test,parcelid,dtrain,dtest
gc.collect()

279

6. Neural Network
----------------------

### Process Data For Neural Network

In [83]:
print( "\n\nProcessing data for Neural Network ...")

x_train,y_train,x_test,parcelid=loader(prop_lightgbm_path,['parcelid', 'logerror','transactiondate']+extra_features,parse_date=True,onehotencode=True)
ssr=normalize(x_test)
x_train=ssr.transform(x_train)

print(x_train.shape)
len_x=x_train.shape[1]



Processing data for Neural Network ...

Reading data from disk ...
   Read properties file ...
   ...
   One Hot Encoding ...
   ...
   Read training file ...
   ...
((167888, 177), (167888L,))

Prepare for prediction ...
   Read sample file ...
   ...
   Merge with property data ...
(2985217, 177)
(167888L, 177L)


### Set Neural Network

In [25]:
print("\nSetting up neural network model...")
nn = Sequential()
nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = len_x))
nn.add(PReLU())
nn.add(Dropout(.4))
nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.5))
nn.add(Dense(units = 26, kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(1, kernel_initializer='normal'))
nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))


Setting up neural network model...


### Cross Validation For NN

In [95]:
kf=KFold(n_splits=5)
loss=[]
for train_index, test_index in kf.split(x_train,y_train):
    nncvhistory=nn.fit(np.array(x_train[train_index]), np.array(y_train[train_index]), batch_size = 4096, epochs = 35, verbose=0,validation_data=(np.array(x_train[test_index]), np.array(y_train[test_index])))
    loss.append(nncvhistory.history['val_loss'][-1])
    print('   ...')

print('Cross Validated Mean Absolute Error :     %s'%(str(np.mean(loss))))

   ...
   ...
   ...
   ...
   ...
Cross Validated Mean Absolute Error :     0.0670682084726


### Run Neural Network

In [97]:
print("\nFitting neural network model...")

nnhistory=nn.fit(np.array(x_train), np.array(y_train), batch_size = 4096, epochs = 70, verbose=0)

print(nnhistory.history['loss'][-1])


Fitting neural network model...
0.0674957613229


### Neural Network Prediction

In [28]:
print("\nPredicting with neural network model...")

preds_dictionary=OrderedDict()
preds_dictionary['parcelid']=parcelid
for i,date in enumerate(prediction_columns):
    x_test_i=x_test.copy()
    x_test_i["transactiondate_year"] = date[:4]
    x_test_i["transactiondate_month"] = date[4:]
    x_test_i=ssr.transform(x_test_i)
    preds_dictionary[date]=nn.predict(x_test_i).flatten()
    print "   ..."
    del x_test_i
nn_pred = pd.DataFrame(preds_dictionary)

print( "\nNeural Network predictions:" )
print( pd.DataFrame(nn_pred).head() )


Predicting with neural network model...
   ...
   ...
   ...
   ...
   ...
   ...

Neural Network predictions:
   parcelid    201610    201611    201612    201710    201711    201712
0  10754147  0.010996  0.015159  0.020081  0.015015  0.019944  0.025643
1  10759547  0.040656  0.046502  0.053228  0.056032  0.062591  0.069539
2  10843547  0.055088  0.061783  0.068511  0.068065  0.074808  0.081741
3  10859147  0.050606  0.056030  0.061424  0.058380  0.063728  0.069067
4  10879947  0.010137  0.015866  0.022031  0.022216  0.027066  0.032445


### Clean for NN

In [29]:
del x_train,y_train,x_test,parcelid,preds_dictionary
gc.collect()

2858

7. OLS
-------------------

### Process Data For OLS

In [78]:
print( "\n\nProcessing data for OLS ...")
x_train,y_train,x_test,parcelid=loader(prop_lightgbm_path,['parcelid', 'logerror','transactiondate']+extra_features,parse_date=True,onehotencode=True)
ssr=normalize(x_test)
x_train=ssr.transform(x_train)

print(x_train.shape)



Processing data for OLS ...

Reading data from disk ...
   Read properties file ...
   ...
   One Hot Encoding ...
   ...
   Read training file ...
   ...
((167888, 177), (167888L,))

Prepare for prediction ...
   Read sample file ...
   ...
   Merge with property data ...
(2985217, 177)
(167888L, 177L)


### Set OLS

In [31]:
np.random.seed(17)
random.seed(17)

### Cross Validation For OLS

In [32]:
print("\nCross validing OLS model ...")

reg = LinearRegression(n_jobs=-1)
score=cross_val_score(reg,x_train, y_train, scoring=make_scorer(mean_absolute_error), cv=5)

print('Cross Validated Mean Absolute Error :     %s'%(str(np.mean(score))))


Cross validing OLS model ...
Cross Validated Mean Absolute Error :     0.0698411270332


### Run OLS

In [33]:
print("\nFitting OLS...")

reg = LinearRegression(n_jobs=-1)
reg.fit(x_train, y_train)


Fitting OLS...


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

### OLS Prediction

In [34]:
print("\nPredicting with OLS model...")

preds_dictionary=OrderedDict()
preds_dictionary['parcelid']=parcelid
for i,date in enumerate(prediction_columns):
    x_test_i=x_test.copy()
    x_test_i["transactiondate_year"] = date[:4]
    x_test_i["transactiondate_month"] = date[4:]
    x_test_i=ssr.transform(x_test_i)
    preds_dictionary[date]=reg.predict(x_test_i)
    print "   ..."
    del x_test_i
ols_pred = pd.DataFrame(preds_dictionary)

print( "\nOLS predictions:" )
print( pd.DataFrame(ols_pred).head() )


Predicting with OLS model...
   ...
   ...
   ...
   ...
   ...
   ...

OLS predictions:
   parcelid        201610        201611        201612        201710  \
0  10754147  5.206299e-02  5.243874e-02  5.281448e-02  5.705261e-02   
1  10759547  6.622478e+11  6.622478e+11  6.622478e+11  6.622478e+11   
2  10843547  9.053040e-02  9.090614e-02  9.128189e-02  9.552002e-02   
3  10859147  5.348206e-02  5.385780e-02  5.423355e-02  5.847168e-02   
4  10879947  6.269073e-02  6.306648e-02  6.344223e-02  6.768036e-02   

         201711        201712  
0  5.742836e-02  5.780411e-02  
1  6.622478e+11  6.622478e+11  
2  9.589577e-02  9.627151e-02  
3  5.884743e-02  5.922318e-02  
4  6.805611e-02  6.843185e-02  


### Clean for OLS

In [35]:
del x_train,y_train,x_test,preds_dictionary
gc.collect()

218

8. Combine And Save
----------------------------------

### Combine Predictionsm

In [76]:
pred0.shape

(2985217L, 6L)

In [81]:
print( "\nCombining XGBoost, LightGBM, and baseline predicitons ..." )
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT - NN_WEIGHT - OLS_WEIGHT 
lgb_weight0 = lgb_weight / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 =  BASELINE_WEIGHT / (1 - OLS_WEIGHT)
nn_weight0 = NN_WEIGHT / (1 - OLS_WEIGHT)
pred0 = baseline_weight0*BASELINE_PRED
pred0 += lgb_weight0*lgb_pred
pred0 += xgb_weight0*xgb_pred

print( "\nCombined XGB/LGB/baseline predictions:" )
print( pd.DataFrame(pred0).head() )

print( "\nCombining with XGB/LGB/NN/OLS/baseline predicitons: ..." )
pred0 =np.tile(pred0[:,None],[1,6]) + nn_weight0*nn_pred.values[:,1:]
pred0 = FUDGE_FACTOR * ( OLS_WEIGHT*ols_pred.values[:,1:] + (1-OLS_WEIGHT)*pred0 )
submission_dict=OrderedDict()
submission_dict['parcelid']=parcelid
for i,date in enumerate(prediction_columns):
    submission_dict[date]=pred0[:,i]
submission=pd.DataFrame(submission_dict)
for c in submission.columns[1:]:
    submission[c]=submission[c].apply(lambda x:float(format(x, '.4f')))

print( "\nCombined XGB/LGB/NN/baseline/OLS predictions:" )
print( submission.head() )


Combining XGBoost, LightGBM, and baseline predicitons ...

Combined XGB/LGB/baseline predictions:
          0
0  0.017922
1  0.003504
2 -0.023398
3  0.001861
4 -0.022502

Combining with XGB/LGB/NN/OLS/baseline predicitons: ...

Combined XGB/LGB/NN/baseline/OLS predictions:
   parcelid        201610        201611        201612        201710  \
0  10754147  2.340000e-02  2.380000e-02  2.430000e-02  2.410000e-02   
1  10759547  4.598649e+10  4.598649e+10  4.598649e+10  4.598649e+10   
2  10843547 -1.340000e-02 -1.270000e-02 -1.210000e-02 -1.180000e-02   
3  10859147  1.020000e-02  1.070000e-02  1.120000e-02  1.120000e-02   
4  10879947 -1.840000e-02 -1.780000e-02 -1.730000e-02 -1.690000e-02   

         201711        201712  
0  2.460000e-02  2.510000e-02  
1  4.598649e+10  4.598649e+10  
2 -1.120000e-02 -1.060000e-02  
3  1.180000e-02  1.230000e-02  
4 -1.650000e-02 -1.600000e-02  


### Write Results

In [82]:
from datetime import datetime

print( "\nWriting results to disk ..." )
submission.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)

print( "\nFinished ...")


Writing results to disk ...

Finished ...
