In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
# gc ==> garbage collection

In [6]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import os

os.listdir('data')

['zillow_data_dictionary.xlsx',
 'sample_submission.csv',
 'properties_2016.csv',
 'train_2016_v2.csv']

In [7]:
##### READ IN RAW DATA

print( "\nReading data from disk ...")
prop = pd.read_csv('data/properties_2016.csv')
train = pd.read_csv("data/train_2016_v2.csv")


Reading data from disk ...


In [8]:
##### PROCESS DATA FOR LIGHTGBM

print( "\nProcessing data for LightGBM ..." )
for c, dtype in zip(prop.columns, prop.dtypes):	
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)



Processing data for LightGBM ...


In [9]:
df_train = train.merge(prop, how='left', on='parcelid')
df_train.fillna(df_train.median(),inplace = True)

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 
                         'propertyzoningdesc', 'propertycountylandusecode', 
                         'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)


(90275, 53) (90275,)


In [14]:
x_train.dtypes[x_train.dtypes == object].index.values

array(['hashottuborspa', 'taxdelinquencyflag'], dtype=object)

In [19]:
x_train['hashottuborspa'].unique(), x_train['taxdelinquencyflag'].unique()

(array([1.0], dtype=object), array([nan, 'Y'], dtype=object))

In [20]:
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()


91

In [22]:
x_train['hashottuborspa'].unique(), x_train['taxdelinquencyflag'].unique()

(array([ True]), array([False]))

In [25]:
x_train['taxdelinquencyflag'].head(), x_train['hashottuborspa'].head()

(0    False
 1    False
 2    False
 3    False
 4    False
 Name: taxdelinquencyflag, dtype: bool, 0    True
 1    True
 2    True
 3    True
 4    True
 Name: hashottuborspa, dtype: bool)

In [21]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90275 entries, 0 to 90274
Data columns (total 53 columns):
airconditioningtypeid           90275 non-null float32
architecturalstyletypeid        90275 non-null float32
basementsqft                    90275 non-null float32
bathroomcnt                     90275 non-null float32
bedroomcnt                      90275 non-null float32
buildingclasstypeid             90275 non-null float32
buildingqualitytypeid           90275 non-null float32
calculatedbathnbr               90275 non-null float32
decktypeid                      90275 non-null float32
finishedfloor1squarefeet        90275 non-null float32
calculatedfinishedsquarefeet    90275 non-null float32
finishedsquarefeet12            90275 non-null float32
finishedsquarefeet13            90275 non-null float32
finishedsquarefeet15            90275 non-null float32
finishedsquarefeet50            90275 non-null float32
finishedsquarefeet6             90275 non-null float32
fips       

In [26]:
##### RUN LIGHTGBM

params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.5      # feature_fraction -- OK, back to .5, but maybe later increase this
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0

In [32]:
d_train = lgb.Dataset(x_train, y_train)
d_eval = lgb.Dataset(x_train, y_train)

In [37]:
print("fit lightgbm model")
clf = lgb.train(params, d_train, 100)

fit lightgbm model


In [38]:
pred = clf.predict(x_train)
np.testing.assert_almost_equal(pred,y_train)

AssertionError: 
Arrays are not almost equal to 7 decimals

(mismatch 100.0%)
 x: array([0.0108497, 0.0113083, 0.0082246, ..., 0.0140485, 0.0103898,
       0.0144195])
 y: array([ 0.0276, -0.1684, -0.004 , ..., -0.2679,  0.0602,  0.4207])

In [31]:
print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train, 430)

del d_train; gc.collect()
del x_train; gc.collect()



Fitting LightGBM model ...


TypeError: Training only accepts Dataset object

In [None]:

# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_train, y_train)

# params = {
#     'objective': 'regression',
#     'metric': 'auc',
#     'verbose': -1,
#     'boost_from_average': False,
#     'min_data': 1,
#     'num_leaves': 2,
#     'learning_rate': 1,
#     'min_data_in_bin': 1,
#     'min_data_per_group': 1,
#     'cat_smooth': 1,
#     'cat_l2': 0,
#     'max_cat_to_onehot': 1,
#     'zero_as_missing': True,
#     'categorical_column': 0
# }
# evals_result = {}
# gbm = lgb.train(params, lgb_train,
#                 num_boost_round=1,
#                 valid_sets=lgb_eval,
#                 verbose_eval=True,
#                 evals_result=evals_result)
# pred = gbm.predict(X_train)
# np.testing.assert_almost_equal(pred, y)

In [39]:
del d_train
x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)

##### RUN LIGHTGBM

params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.5      # feature_fraction -- OK, back to .5, but maybe later increase this
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0

print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train, 430)

del d_train; gc.collect()
del x_train; gc.collect()


Fitting LightGBM model ...


0

In [40]:
os.listdir('data')

['zillow_data_dictionary.xlsx',
 'sample_submission.csv',
 'properties_2016.csv',
 'train_2016_v2.csv']

In [42]:
print("\nPrepare for LightGBM prediction ...")
print("   Read sample file ...")
sample = pd.read_csv('data/sample_submission.csv')


Prepare for LightGBM prediction ...
   Read sample file ...


In [43]:
sample.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0


In [None]:
# We can force the Garbage Collector to release unreferenced
# memory with gc.collect()

In [44]:
train_columns

Index(['airconditioningtypeid', 'architecturalstyletypeid', 'basementsqft',
       'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid',
       'buildingqualitytypeid', 'calculatedbathnbr', 'decktypeid',
       'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet',
       'finishedsquarefeet12', 'finishedsquarefeet13', 'finishedsquarefeet15',
       'finishedsquarefeet50', 'finishedsquarefeet6', 'fips', 'fullbathcnt',
       'garagecarcnt', 'garagetotalsqft', 'hashottuborspa',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'poolcnt', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
       'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'storytypeid', 'threequarterbathnbr', 'typeconstructiontypeid',
       'unitcnt', 'yardbuildingsqft17', 'yardbuildingsqft26', 'yearbuilt',
       'numberofstories', 'structuretaxvaluedollarcnt', 'taxval

In [45]:
sample['parcelid'] = sample['ParcelId']
print("   Merge with property data ...")
df_test = sample.merge(prop, on='parcelid', how='left')
print("   ...")
del sample, prop; gc.collect()

print("   ...")
x_test = df_test[train_columns]
print("   ...")
del df_test; gc.collect()
print("   Preparing x_test...")

for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)
print("   ...")
x_test = x_test.values.astype(np.float32, copy=False)


   Merge with property data ...
   ...
   ...
   ...
   Preparing x_test...
   ...


In [46]:
print("\nStart LightGBM prediction ...")
# num_threads > 1 will predict very slow in kernal
clf.reset_parameter({"num_threads":1})
p_test = clf.predict(x_test)

del x_test; gc.collect()

print( "\nUnadjusted LightGBM predictions:" )
print( pd.DataFrame(p_test).head() )



Start LightGBM prediction ...

Unadjusted LightGBM predictions:
          0
0  0.035491
1  0.038709
2  0.009419
3  0.007204
4  0.008871


In [47]:
##### RE-READ PROPERTIES FILE
##### (I tried keeping a copy, but the program crashed.)

print( "\nRe-reading properties file ...")
properties = pd.read_csv('data/properties_2016.csv')



##### PROCESS DATA FOR XGBOOST

print( "\nProcessing data for XGBoost ...")
count  = 0
for c in properties.columns:
    properties[c]=properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        count += 1
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))



Re-reading properties file ...

Processing data for XGBoost ...


In [48]:
count

5

In [51]:
train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)
# shape        
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))


Shape train: (90275, 57)
Shape test: (2985217, 57)


In [52]:
# drop out ouliers
train_df=train_df[ train_df.logerror > -0.4 ]
train_df=train_df[ train_df.logerror < 0.418 ]
x_train=train_df.drop(['parcelid', 'logerror','transactiondate'], axis=1)
y_train = train_df["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)

print('After removing outliers:')     
print('Shape train: {}\nShape test: {}'.format(x_train.shape, x_test.shape))



After removing outliers:
Shape train: (88525, 57)
Shape test: (2985217, 57)


In [53]:



##### RUN XGBOOST

print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}
# Enough with the ridiculously overfit parameters.
# I'm going back to my version 20 instead of copying Jayaraman.
# I want a num_boost_rounds that's chosen by my CV,
# not one that's chosen by overfitting the public leaderboard.
# (There may be underlying differences between the train and test data
#  that will affect some parameters, but they shouldn't affect that.)

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

# cross-validation
#print( "Running XGBoost CV ..." )
#cv_result = xgb.cv(xgb_params, 
#                   dtrain, 
#                   nfold=5,
#                   num_boost_round=350,
#                   early_stopping_rounds=50,
#                   verbose_eval=10, 
#                   show_stdv=False
#                  )
#num_boost_rounds = len(cv_result)

# num_boost_rounds = 150
num_boost_rounds = 242
print("\nXGBoost tuned with CV in:")
print("   https://www.kaggle.com/aharless/xgboost-without-outliers-tweak ")
print("num_boost_rounds="+str(num_boost_rounds))

# train model
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)



Setting up data for XGBoost ...

XGBoost tuned with CV in:
   https://www.kaggle.com/aharless/xgboost-without-outliers-tweak 
num_boost_rounds=242

Training XGBoost ...


In [54]:
print( "\nPredicting with XGBoost ...")
xgb_pred = model.predict(dtest)

print( "\nXGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )



Predicting with XGBoost ...

XGBoost predictions:
          0
0 -0.040630
1 -0.034169
2  0.055068
3  0.084436
4 -0.000358


In [58]:
p_test.shape,xgb_pred.shape

((2985217,), (2985217,))

In [59]:
# Parameters
XGB_WEIGHT = 0.6500
BASELINE_WEIGHT = 0.0056

BASELINE_PRED = 0.0115

In [61]:
BASELINE_PRED*BASELINE_WEIGHT

6.44e-05

In [62]:
##### COMBINE PREDICTIONS

print( "\nCombining XGBoost, LightGBM, and baseline predicitons ..." )
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT
pred = XGB_WEIGHT*xgb_pred + BASELINE_WEIGHT*BASELINE_PRED + lgb_weight*p_test

print( "\nCombined predictions:" )
print( pd.DataFrame(pred).head() )


Combining XGBoost, LightGBM, and baseline predicitons ...

Combined predictions:
          0
0 -0.014122
1 -0.008814
2  0.039102
3  0.057429
4  0.002887


In [63]:
##### WRITE THE RESULTS

print( "\nPreparing results for write ..." )
y_pred=[]

for i,predict in enumerate(pred):
    y_pred.append(str(round(predict,4)))
y_pred=np.array(y_pred)


Preparing results for write ...


In [64]:
output = pd.DataFrame({'ParcelId': properties['parcelid'].astype(np.int32),
        '201610': y_pred, '201611': y_pred, '201612': y_pred,
        '201710': y_pred, '201711': y_pred, '201712': y_pred})
# set col 'ParceID' to first col


In [65]:
cols = output.columns.tolist()
cols = cols[-1:] + cols[:-1]
output = output[cols]

In [66]:
from datetime import datetime

print( "\nWriting results to disk ..." )
output.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)

print( "\nFinished ..." )


Writing results to disk ...

Finished ...
