# Predictive Modelling

In [1]:
%store -r  df_model features_x features_y

In [3]:
# Data handling

import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance


from sklearn.ensemble import RandomForestRegressor
#from fbprophet import Prophet

In [4]:
df_model.sales = df_model.sales.apply(lambda x: np.nan if x == 0 else x)
df_model.loc[df_model['is_train'] == 1, 'saleslog'] = np.log(1+df_model.loc[df_model['is_train'] == 1]['sales'])

In [5]:
# Train Test Split

data = df_model.loc[(df_model['is_train'] == 1)]
x_train, x_test, y_train, y_test = train_test_split(data[features_x], 
                                                    data[features_y], 
                                                    test_size=0.2, 
                                                    random_state=3)

In [6]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(675470, 21) (675470, 1) (168868, 21) (168868, 1)


## XGBoost

In [6]:
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test, y_test)

num_round = 1000
evallist = [(dtrain, 'train'), (dtest, 'test')]

param = {'max_depth': 9,
         'eta': 0.01,
         'subsample': 0.75,
         'colsample_bytree': 0.6, 
         'objective': 'reg:squarederror',}

plst = list(param.items())

In [13]:
def ToWeight(y):
    w = np.zeros(y.shape, dtype=float)
    ind = y != 0
    w[ind] = 1./(y[ind]**2)
    return w

def rmspe(yhat, y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean( w * (y - yhat)**2 ))
    return rmspe

def rmspe_xg(yhat, y):
    y = y.get_label()
    y = np.exp(y) - 1
    yhat = np.exp(yhat) - 1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w * (y - yhat)**2))
    return "rmspe", rmspe

In [20]:
model_xgb = xgb.train(plst, dtrain, num_round, evallist,
                  feval=rmspe_xg, verbose_eval=250, early_stopping_rounds=100)

[0]	train-rmse:8.18570	train-rmspe:0.99986	test-rmse:8.18796	test-rmspe:0.99986


KeyboardInterrupt: 

In [8]:
#make a submission dataframe to test RMSPE for unseen test-data (test.csv)
submit = df_model.loc[df_model['is_train'] == 0]
dsubmit = xgb.DMatrix(submit[features_x])
predictions = model_xgb.predict(dsubmit)

df_predictions = submit['id'].reset_index()
df_predictions['Id'] = df_predictions['id'].astype('int')
df_predictions['Sales'] = (np.exp(predictions) - 1) * 0.985 #Scale Back

df_predictions.sort_values('Id', inplace=True)
df_predictions[['Id', 'Sales']].to_csv('submit_xgboost_03.csv', index=False)

NameError: name 'model_xgb' is not defined

In [None]:
#Print Feature Importance
plt.figure(figsize=(18,8))

plot_importance(model)
plt.show()
plt.savefig('xgboost_03_feature_importance.png')

In [None]:
# save dataframes on disk
df.to_csv('dataframe_raw', index=False)
df_model.to_csv('dataframe_raw_model3', index=False)
data.to_csv('dataframe_modeldata3', index=False)

In [None]:
# save model using pickle
import pickle
filename = 'model_xgboost_03.sav'
pickle.dump(model, open(filename, 'wb'))


In [None]:
# load pickled model
import pickle
#loaded_model = pickle.load(open("/Users/bur.oez/Desktop/Capstone-Project-Rossman-Sales Kopie/model_xgboost_01.sav", 'rb'))

## Support Vector Machine

## Random Forest Regressor

In [7]:
rfr= RandomForestRegressor(n_estimators=100,
                          criterion="mse",
                          max_depth=5,
                          min_samples_split=2,
                          min_samples_leaf=1,
                          min_weight_fraction_leaf=0.0,
                          max_features="auto",
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          bootstrap=True,
                          oob_score=False,
                          random_state=70,
                
                          )

In [8]:
model_rfr= rfr.fit(x_train,y_train)

  model_rfr= rfr.fit(x_train,y_train)


In [9]:
x_test.isnull().sum()

holidaysthisweek           0
salespercustomersperday    0
promoyesterday             1
store                      0
stateholiday               0
holidayslastweek           0
competitiondistance        0
dateint                    0
0_prominterval             0
schoolholiday              0
promo                      0
dayofweek                  0
salesperday                0
competitionopen            0
customersperday            0
storetype                  0
promosince                 0
holidaysnextweek           0
open                       0
promotomorrow              0
assortment                 0
dtype: int64

In [10]:
x_test.fillna(-999, inplace=True)
y_hat_rfr=model_rfr.predict(x_test)

In [47]:
y_hat_rfr = y_hat_rfr.astype(np.float64)
y_test_values = y_test.values.astype(np.float64).reshape(-1,)

In [48]:
rmspe(y_hat_rfr,y_test_values)

0.025131548128583227

In [55]:
submit.fillna(-999, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [56]:
#make a submission dataframe to test RMSPE for unseen test-data (test.csv)
#submit = df_model.loc[df_model['is_train'] == 0]
predictions = model_rfr.predict(submit[features_x])



In [57]:
df_predictions = submit['id'].reset_index()
df_predictions['Id'] = df_predictions['id'].astype('int')
df_predictions['Sales'] = (np.exp(predictions) - 1) * 0.985 #Scale Back

df_predictions.sort_values('Id', inplace=True)
df_predictions[['Id', 'Sales']].to_csv('submit_rfr_03.csv', index=False)

## Prophet

In [38]:
df_prophet = df_model.drop(["saleslog","id","is_train","assortment", "competitiondistance","competitionopen","salespercustomersperday","stateholiday","storetype", \
                         "dayofyear","day","month","year","promosince", "0_prominterval","dateint","promotomorrow","promoyesterday", \
                         "schoolholiday","promo","open","customers","dayofweek","store","salesperday","customersperday","holidaysnextweek","holidaysthisweek","holidayslastweek"],axis=1)

In [39]:
df_prophet_log = df_model.drop(["sales","id","is_train","assortment", "competitiondistance","competitionopen","salespercustomersperday","stateholiday","storetype", \
                         "dayofyear","day","month","year","promosince", "0_prominterval","dateint","promotomorrow","promoyesterday", \
                         "schoolholiday","promo","open","customers","dayofweek","store","salesperday","customersperday","holidaysnextweek","holidaysthisweek","holidayslastweek"],axis=1)

In [50]:
df_prophet.columns = ["y","ds"]
df_prophet = df_prophet[["ds","y"]]
df_prophet = df_prophet.sort_index(ascending = False)

In [51]:
df_prophet_log.columns = ["y","ds"]
df_prophet_log = df_prophet_log[["ds","y"]]
df_prophet_log = df_prophet_log.sort_index(ascending = False)

In [52]:
df_prophet

Unnamed: 0,ds,y
885425,2015-08-01,
885424,2015-08-01,
885423,2015-08-01,
885422,2015-08-01,
885421,2015-08-01,
...,...,...
4,2015-07-31,4822.0
3,2015-07-31,13995.0
2,2015-07-31,8314.0
1,2015-07-31,6064.0


In [53]:
m =Prophet()

In [54]:
model_prophet= m.fit(df_prophet)

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


In [55]:
future_sales= m.make_future_dataframe(periods=48, freq="D",include_history =True)
forecast_sales= m.predict(future_sales)
forecast_sales.head()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2013-01-01,7170.719691,3708.827432,11291.456175,7170.719691,7170.719691,212.628801,212.628801,212.628801,-62.85083,-62.85083,-62.85083,275.479631,275.479631,275.479631,0.0,0.0,0.0,7383.348492
1,2013-01-02,7170.272284,3139.964336,10784.904364,7170.272284,7170.272284,-271.63817,-271.63817,-271.63817,-409.446365,-409.446365,-409.446365,137.808194,137.808194,137.808194,0.0,0.0,0.0,6898.634114
2,2013-01-03,7169.824876,3295.621102,10630.271758,7169.824876,7169.824876,-343.824145,-343.824145,-343.824145,-349.805028,-349.805028,-349.805028,5.980883,5.980883,5.980883,0.0,0.0,0.0,6826.000731
3,2013-01-04,7169.377469,3257.579025,10792.886813,7169.377469,7169.377469,-178.117668,-178.117668,-178.117668,-59.362344,-59.362344,-59.362344,-118.755325,-118.755325,-118.755325,0.0,0.0,0.0,6991.259801
4,2013-01-05,7168.930062,1782.272238,9434.350865,7168.930062,7168.930062,-1509.939051,-1509.939051,-1509.939051,-1274.6133,-1274.6133,-1274.6133,-235.325751,-235.325751,-235.325751,0.0,0.0,0.0,5658.991011


In [56]:
forecast_sales.shape

(1038, 19)