# Predictive Modelling

In [1]:
# Restore variables previously saved with IPython's %store magic:
# `df_model` is the frame used for modelling below, and `features_x` / `features_y`
# are used further down to select the input and target columns from it.
# NOTE(review): presumably written by an upstream notebook — this cell fails on a
# machine where those variables were never stored.
%store -r  df_model features_x features_y

In [2]:
# Data handling

import pandas as pd
import numpy as np




from sklearn.model_selection import train_test_split
import xgboost as xgb

### XGBoost

In [3]:
# Zero sales become NaN, so the log target derived from them is NaN as well.
# A vectorised mask replaces the per-row Python lambda (same result, far less overhead).
df_model['sales'] = df_model['sales'].mask(df_model['sales'] == 0)

# Build the log-scale target for the training rows only (is_train == 1);
# other rows are not assigned here.
# log1p(x) equals log(1 + x) but keeps precision for small x, and a single
# .loc[rows, column] read avoids chained indexing.
is_train_row = df_model['is_train'] == 1
df_model.loc[is_train_row, 'saleslog'] = np.log1p(df_model.loc[is_train_row, 'sales'])

In [4]:
# Hold-out split: restrict to the rows flagged is_train == 1, then reserve 20%
# of them for evaluation. The fixed random_state keeps the split repeatable.
is_train_flagged = df_model['is_train'] == 1
data = df_model.loc[is_train_flagged]

x_train, x_test, y_train, y_test = train_test_split(
    data[features_x],
    data[features_y],
    test_size=0.2,
    random_state=3,
)

In [5]:
# Sanity check: shapes of train features/targets followed by test features/targets.
print(*(part.shape for part in (x_train, y_train, x_test, y_test)))

(675470, 16) (675470, 1) (168868, 16) (168868, 1)


In [7]:
# Booster hyper-parameters, kept both as a dict and as the list of
# (name, value) pairs that is passed to xgb.train.
param = dict(
    max_depth=9,
    eta=0.01,
    subsample=0.75,
    colsample_bytree=0.6,
    objective='reg:squarederror',
)
plst = [*param.items()]

# Number of boosting rounds requested from xgb.train.
num_round = 1000

# Wrap each partition (features + labels) in an XGBoost DMatrix.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Datasets evaluated during training, with the names used in the log output.
evallist = [(dtrain, 'train'), (dtest, 'test')]

In [8]:
def ToWeight(y):
    """Return per-element RMSPE weights ``1 / y**2``, with 0 where ``y`` is 0.

    Parameters
    ----------
    y : array-like
        Target values. Lists, integer arrays, pandas objects and
        column-shaped ``(n, 1)`` inputs are accepted.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``y``.
    """
    # Coerce once to a float ndarray: this gives list inputs a .shape, makes
    # boolean indexing behave the same for pandas inputs, and avoids silent
    # integer overflow when squaring narrow integer dtypes.
    y = np.asarray(y, dtype=float)
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    # Only non-zero targets receive a weight; zeros keep weight 0 so they
    # never trigger a division by zero.
    w[ind] = 1. / (y[ind] ** 2)
    return w

def rmspe(yhat, y):
    """Root mean squared percentage error of ``yhat`` against ``y``.

    Each squared residual is scaled by the weight ``ToWeight`` gives the
    matching target (``1 / y**2``, or 0 where ``y`` is 0). Zero targets thus
    add no error but are still counted in the mean.
    """
    weighted_sq_error = ToWeight(y) * (y - yhat) ** 2
    return np.sqrt(np.mean(weighted_sq_error))

def rmspe_xg(yhat, y):
    """Evaluation callback for ``xgb.train``: RMSPE after undoing the log transform.

    Parameters
    ----------
    yhat : numpy.ndarray
        Model predictions; treated as ``log(1 + x)`` values.
    y : object exposing ``get_label()``
        Evaluation data (an ``xgb.DMatrix`` in this notebook) whose labels are
        treated as ``log(1 + x)`` values.

    Returns
    -------
    tuple
        ``("rmspe", value)`` — the metric name and its value.
    """
    # expm1 is the accurate inverse of log(1 + x), also for values near zero.
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    # Reuse the shared metric instead of repeating its formula here.
    return "rmspe", rmspe(predicted, actual)

In [9]:
# Fit the booster. Progress is printed every 250 rounds and training stops
# early when the monitored evaluation metric has not improved for 100 rounds.
# Everything after `dtrain` is passed by keyword: recent XGBoost releases treat
# `evals` and the following arguments as keyword-only and warn on positional use.
# NOTE(review): newer XGBoost versions provide `custom_metric` as the
# replacement for `feval` — switch once the minimum installed version is confirmed.
model = xgb.train(
    plst,
    dtrain,
    num_boost_round=num_round,
    evals=evallist,
    feval=rmspe_xg,
    verbose_eval=250,
    early_stopping_rounds=100,
)

[0]	train-rmse:8.18573	train-rmspe:0.99986	test-rmse:8.18800	test-rmspe:0.99986


KeyboardInterrupt: 

In [None]:
# Build the submission file from the rows flagged is_train == 0
# (the unseen test data, test.csv).
submit = df_model.loc[df_model['is_train'] == 0]
dsubmit = xgb.DMatrix(submit[features_x])
predictions = model.predict(dsubmit)

# Keep the original row index next to `id`, then add the output columns.
df_predictions = submit['id'].reset_index()
df_predictions = df_predictions.assign(
    Id=df_predictions['id'].astype('int'),
    # Undo the log(1 + x) transform, then apply the fixed 0.985 scaling factor.
    # NOTE(review): the rationale for 0.985 is not shown in this notebook.
    Sales=(np.exp(predictions) - 1) * 0.985,
).sort_values('Id')

df_predictions[['Id', 'Sales']].to_csv('submit_xgboost_03.csv', index=False)

In [None]:
# Plot the booster's feature importance and save it as a PNG.
# `plt` was never imported in this notebook, so bring it in here next to the
# existing cell-local xgboost import.
import matplotlib.pyplot as plt
from xgboost import plot_importance

# Create the axes explicitly and hand it to plot_importance: without `ax` the
# helper opens its own figure, leaving the 18x8 one empty.
fig, ax = plt.subplots(figsize=(18, 8))
plot_importance(model, ax=ax)

# Save before show(): once the figure has been shown, a later plt.savefig
# would write out a blank canvas.
fig.savefig('xgboost_03_feature_importance.png')
plt.show()

In [None]:
# Write the working frames to disk as CSV, without the index column.
# NOTE(review): `df` is not created in the visible cells of this notebook —
# confirm it exists in the kernel, otherwise this cell raises NameError.
for frame, target in (
    (df, 'dataframe_raw'),
    (df_model, 'dataframe_raw_model3'),
    (data, 'dataframe_modeldata3'),
):
    frame.to_csv(target, index=False)

In [None]:
# Serialise the trained booster to disk with pickle.
import pickle

filename = 'model_xgboost_03.sav'
# The context manager flushes and closes the file even if dump() raises;
# the bare open(...) used before left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)


In [None]:
# Load a previously pickled model (the actual load call is currently commented out).
# NOTE(review): the disabled call below uses an absolute, machine-specific path and
# points at model_xgboost_01.sav, not the model_xgboost_03.sav written by the save
# cell — confirm which file is intended before re-enabling it.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
import pickle
#loaded_model = pickle.load(open("/Users/bur.oez/Desktop/Capstone-Project-Rossman-Sales Kopie/model_xgboost_01.sav", 'rb'))