In [14]:
import shap
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import xgboost as xgb
from time import time
import os
# Directory holding the Rossmann CSVs (train.csv, test.csv, store.csv), relative to the notebook.
PATH = "data/rossmann"
# Sanity check: show which files are actually present before loading them.
print(os.listdir(PATH))
import seaborn as sns
# Global plot styling applied to every figure created after this cell.
sns.set_style("whitegrid")
sns.set_context("paper")
# 8-colour cubehelix palette; the trailing comment records the previously tried alternative.
sns.set_palette("cubehelix", 8)#"deep")
from mlxtend.evaluate import feature_importance_permutation
from copy import deepcopy

['store.csv', 'test.csv', 'train.csv']


In [2]:
# Load the raw tables. The date column is parsed by position
# (index 2 in train.csv, index 3 in test.csv).
train = pd.read_csv(os.path.join(PATH, "train.csv"), parse_dates=[2], low_memory=False)
# Every missing value in the test table is filled with 1.
test = pd.read_csv(os.path.join(PATH, "test.csv"), parse_dates=[3], low_memory=False).fillna(1)
# Every missing value in the store table is filled with 0; features_create
# later relies on this 0 placeholder (e.g. `PromoInterval == 0`).
store = pd.read_csv(os.path.join(PATH, "store.csv"), low_memory=False).fillna(0)

# Attach the per-store attributes to each daily row.
train = train.merge(store, on='Store')
test = test.merge(store, on='Store')
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0


In [3]:
def features_create(data):
    """Encode categoricals and derive calendar, competition and promo features.

    The frame is modified in place and the same object is returned, so both
    ``features_create(df)`` and ``df = features_create(df)`` work.

    Columns read: StoreType, Assortment, StateHoliday, Date (datetime64),
    CompetitionOpenSinceYear, CompetitionOpenSinceMonth, Promo2SinceYear,
    Promo2SinceWeek, PromoInterval.

    Columns written: StoreType, Assortment, StateHoliday (integer codes),
    Year, Month, Day, DayOfWeek, WeekOfYear, CompetitionOpenSinceYear,
    CompetitionOpen, PromoOpen, monthStr, PromoInterval, IsPromoMonth.

    Parameters
    ----------
    data : pandas.DataFrame
        Daily sales rows already merged with the store table.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the columns above added/overwritten.
    """
    # Letter codes -> small integers. The result is assigned back explicitly:
    # `data.Col.replace(..., inplace=True)` mutates a temporary Series and is
    # silently lost under pandas Copy-on-Write. infer_objects() guarantees a
    # numeric dtype even on pandas versions where replace() no longer downcasts.
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        data[column] = data[column].replace(mappings).infer_objects()

    # Calendar parts of the date.
    dates = data['Date'].dt
    data['Year'] = dates.year
    data['Month'] = dates.month
    data['Day'] = dates.day
    # Overwrites the DayOfWeek column loaded from the CSV with pandas'
    # numbering (Monday=0 ... Sunday=6).
    data['DayOfWeek'] = dates.dayofweek
    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is its
    # replacement. Cast back to a plain int64 so later arithmetic does not
    # switch to nullable dtypes.
    data['WeekOfYear'] = dates.isocalendar().week.astype('int64')

    # Months since the competitor opened. A since-year of 1900 is treated
    # like the 0 placeholder.
    data['CompetitionOpenSinceYear'] = data['CompetitionOpenSinceYear'].replace(1900.0, 0)
    competition_months = (
        12 * (data.Year - data.CompetitionOpenSinceYear)
        + (data.Month - data.CompetitionOpenSinceMonth)
    )
    # Values >= 2013*12 (presumably rows whose since-year is the 0
    # placeholder), non-positive values and NaN all collapse to 0.
    data['CompetitionOpen'] = competition_months.where(
        (competition_months > 0) & (competition_months < 2013 * 12), 0
    )

    # Months since Promo2 started (weeks converted with /4). Same clamping,
    # except the upper bound is inclusive, as in the original cut-off.
    promo_months = (
        12 * (data.Year - data.Promo2SinceYear)
        + (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    )
    data['PromoOpen'] = promo_months.where(
        (promo_months > 0) & (promo_months <= 2013 * 12), 0
    )

    # Flag rows whose month abbreviation appears in the store's PromoInterval.
    # 'Sept' (not 'Sep') is intentional: it has to match the interval strings.
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',
                 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    # The 0 placeholder for "no interval" becomes an empty string so the
    # loop below can skip it.
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

    return data

def rmspe(y, yhat):
    """Root mean squared percentage error of ``yhat`` relative to ``y``.

    Computes ``sqrt(mean((yhat / y - 1) ** 2))`` over the entries where
    ``y != 0``. Entries with a zero target are skipped, because a single
    zero would otherwise divide by zero and turn the whole metric into
    inf/nan. For inputs without zeros the result is unchanged.

    Parameters
    ----------
    y : array-like
        Reference values (the denominator).
    yhat : array-like
        Values compared against ``y``; must broadcast against ``y``.

    Returns
    -------
    float
        The RMSPE, or ``nan`` when no entry has a non-zero target.
    """
    y, yhat = np.broadcast_arrays(np.asarray(y, dtype=float), np.asarray(yhat, dtype=float))
    valid = y != 0
    if not valid.any():
        # Nothing to score (empty input or every target is zero).
        return np.nan
    return np.sqrt(np.mean((yhat[valid] / y[valid] - 1) ** 2))

def rmspe_xg(yhat, y):
    """Custom XGBoost evaluation metric: RMSPE on the original scale.

    Both the labels (read via ``y.get_label()``) and the predictions are
    passed through ``expm1`` before scoring, undoing the ``log1p``
    transform applied to the target when the model is trained.

    Returns a ``("rmspe", value)`` tuple.
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    return "rmspe", rmspe(actual, predicted)

In [4]:
# Feature engineering, then chronological ordering of both tables.
train = features_create(train).sort_values('Date')
test = features_create(test).sort_values('Date')

# Keep only rows where the store was open, and drop helper columns plus
# columns that are not used as model inputs.
train = train.loc[train['Open'] != 0].drop(
    columns=['Date', 'PromoInterval', 'monthStr', 'Customers', 'Open',
             'CompetitionOpenSinceMonth', 'Promo2SinceWeek']
)

# Hold-out size; presumably 6 weeks x 7 days x 1115 stores - TODO confirm.
split = 6*7*1115
# NOTE(review): rows are sorted by Date ascending, so `iloc[:split]` makes the
# EARLIEST rows the hold-out set and the model is fitted on later data.
train_test = train.iloc[:split]
train_train = train.iloc[split:]

# The model is trained on log1p(Sales).
xtrain = train_train.drop(columns='Sales')
ytrain = np.log1p(train_train['Sales'])

xtest = train_test.drop(columns='Sales')
ytest = np.log1p(train_test['Sales'])


## Baseline

In [6]:
# Baseline: gradient-boosted trees trained on every feature.
model = xgb.XGBRegressor(
    objective="reg:linear",
    booster="gbtree",
    eta=0.03,
    max_depth=10,
    subsample=0.9,
    colsample_bytree=0.7,
    silent=1,
    seed=10,
    n_jobs=100,
    n_estimators=500,
)
# fit() returns the estimator itself, so fitting in a separate statement
# leaves `model` bound to the same fitted object.
model.fit(xtrain, ytrain, eval_set=[(xtest, ytest)], eval_metric=rmspe_xg, verbose=False)

preds = model.predict(xtest)
# NOTE(review): rmspe is declared as rmspe(y, yhat) but receives the
# predictions first, and both arguments are still on the log1p scale.
# eval_estimator scores its models the same way, so the two stay comparable.
error = rmspe(preds, ytest)
error



0.020971784409782715

In [34]:
def eval_estimator(fi_series, fractions=[1,2,3,4], xtrain=deepcopy(xtrain), ytrain=deepcopy(ytrain), xtest=deepcopy(xtest), ytest=deepcopy(ytest)):
    """Retrain the model after removing the top-ranked features and score it.

    For every ``k`` in ``fractions`` the first ``k`` index labels of
    ``fi_series`` are dropped from the train and test features, a fresh
    XGBRegressor with the baseline hyper-parameters is fitted, and its
    hold-out error is recorded.

    Parameters
    ----------
    fi_series : pandas Series or DataFrame
        Feature importances indexed by feature name, already sorted so the
        most important feature comes first.
    fractions : iterable of int
        Numbers of leading features to remove (counts, despite the name).
    xtrain, ytrain, xtest, ytest
        Data to use. The defaults are deep copies of the notebook globals
        taken when this function is defined, so re-run this cell after
        changing those globals.

    Returns
    -------
    list of float
        One error per entry of ``fractions``, computed with
        ``rmspe(preds, ytest)`` exactly as for the baseline model.
    """
    # Same settings as the baseline model so the errors are comparable.
    xgb_kwargs = dict(
        objective="reg:linear",
        booster="gbtree",
        eta=0.03,
        max_depth=10,
        subsample=0.9,
        colsample_bytree=0.7,
        silent=1,
        seed=10,
        n_jobs=100,
        n_estimators=500,
    )
    errors = []
    for top_k in fractions:
        print(top_k)  # progress marker: one line per retraining
        removed = fi_series.iloc[:top_k].index
        reduced_train = xtrain.drop(removed, axis=1)
        reduced_test = xtest.drop(removed, axis=1)
        regressor = xgb.XGBRegressor(**xgb_kwargs)
        regressor.fit(reduced_train, ytrain, eval_set=[(reduced_test, ytest)], eval_metric=rmspe_xg, verbose=False)
        errors.append(rmspe(regressor.predict(reduced_test), ytest))
    return errors

### Permutation FI

In [11]:
# Feature names in the column order used for xtrain / the fitted model.
names = [
    'Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
    'StoreType', 'Assortment', 'CompetitionDistance',
    'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceYear',
    'Year', 'Month', 'Day', 'WeekOfYear',
    'CompetitionOpen', 'PromoOpen', 'IsPromoMonth',
]

def model_predict(x):
    """Wrap the raw array in a DataFrame with the feature names in
    ``names`` and return the baseline ``model``'s predictions for it."""
    frame = pd.DataFrame(x, columns=names)
    return model.predict(frame)

# Permutation importance on the training data, scored with r2.
# NOTE(review): no seed is passed here, so the shuffles (and therefore the
# ranking) may differ between runs - confirm against the mlxtend docs.
mean_fi, all_fi = feature_importance_permutation(xtrain.values, ytrain, model_predict, 'r2')
permutation_fi_df = pd.DataFrame(mean_fi, index=names).sort_values(0, ascending=False)
permutation_fi_df

Unnamed: 0,0
CompetitionDistance,0.401268
Store,0.267237
Promo,0.253062
DayOfWeek,0.197735
Assortment,0.142312
Promo2SinceYear,0.115501
StoreType,0.085871
CompetitionOpenSinceYear,0.084351
Day,0.083804
WeekOfYear,0.048069


In [33]:
# Retrain with the top 1..4 permutation-ranked features removed; one error per removal count.
permutation_fi_errors = eval_estimator(permutation_fi_df)
permutation_fi_errors



### SHAP FI

In [35]:
# SHAP values are read from a CSV that is not produced in this notebook;
# the file must already exist in the working directory.
shap_values = pd.read_csv('all_shap_values.csv').drop(columns='Unnamed: 0')
# Relabel the columns with the feature names; this assumes the file's columns
# are in the same order as `names` - TODO confirm.
shap_values.columns = names
# Global importance = mean absolute SHAP value per feature, largest first.
shap_values_fi = shap_values.abs().mean().sort_values(ascending=False)
shap_values_fi

Promo                       0.138947
CompetitionDistance         0.094030
Store                       0.074863
Assortment                  0.061622
DayOfWeek                   0.055424
Promo2SinceYear             0.045694
Day                         0.042881
CompetitionOpenSinceYear    0.029848
StoreType                   0.026163
WeekOfYear                  0.024113
Month                       0.019409
Year                        0.018485
CompetitionOpen             0.017252
Promo2                      0.015813
PromoOpen                   0.013383
SchoolHoliday               0.005090
IsPromoMonth                0.002239
StateHoliday                0.000396
dtype: float64

In [36]:
# Retrain with the top 1..4 SHAP-ranked features removed; one error per removal count.
shap_values_fi_errors = eval_estimator(shap_values_fi)
shap_values_fi_errors

1




2
3
4


### XGBoost FI

In [41]:
# Built-in importances of the baseline model, labelled with the feature
# names and sorted so the most important feature comes first.
# NOTE(review): later cells call this column "GAIN"; which importance type
# `feature_importances_` reports depends on the xgboost version - confirm.
xgboost_fi = pd.DataFrame(model.feature_importances_, index=names)
xgboost_fi = xgboost_fi.sort_values(by=0, ascending=False)
xgboost_fi

Unnamed: 0,0
Promo,0.20768
Promo2,0.117262
CompetitionDistance,0.113792
Promo2SinceYear,0.103658
Assortment,0.086414
StoreType,0.078195
Store,0.07694
CompetitionOpenSinceYear,0.075984
DayOfWeek,0.03206
StateHoliday,0.018906


In [42]:
# Retrain with the top 1..4 XGBoost-ranked features removed; one error per removal count.
xgboost_fi_errors = eval_estimator(xgboost_fi)
xgboost_fi_errors

1




2
3
4


## Evaluation

In [63]:
# Side-by-side importance values of the three methods, aligned on the
# feature name (the shared index of the three inputs).
fi_values_df = pd.concat(
    [permutation_fi_df[0], xgboost_fi[0], shap_values_fi],
    axis=1,
)
fi_values_df = fi_values_df.reset_index()
fi_values_df.columns = ['Feature', 'Permutation', 'GAIN', 'SHAP']
fi_values_df

Unnamed: 0,Permutation,GAIN,SHAP
Assortment,0.142312,0.086414,0.061622
CompetitionDistance,0.401268,0.113792,0.09403
CompetitionOpen,0.047056,0.018447,0.017252
CompetitionOpenSinceYear,0.084351,0.075984,0.029848
Day,0.083804,0.013676,0.042881
DayOfWeek,0.197735,0.03206,0.055424
IsPromoMonth,0.003483,0.004298,0.002239
Month,0.032218,0.013665,0.019409
Promo,0.253062,0.20768,0.138947
Promo2,0.015338,0.117262,0.015813


In [68]:
# One row per rank position (0 = most important), one column per method,
# each cell holding the feature name that method puts at that rank.
fi_ranking_df = pd.DataFrame([
    permutation_fi_df[0].index.tolist(),
    xgboost_fi[0].index.tolist(),
    shap_values_fi.index.tolist(),
]).T.reset_index()
fi_ranking_df.columns = ['Ranking', 'Permutation', 'GAIN', 'SHAP']
fi_ranking_df = fi_ranking_df.set_index('Ranking')
fi_ranking_df.to_csv('FI_Ranking_Roar.csv')

In [67]:
# 'Ranking' is already the index of fi_ranking_df (set when the frame is
# built), so calling set_index('Ranking') again raises KeyError on a fresh
# top-to-bottom run. Just display the frame.
fi_ranking_df

Unnamed: 0_level_0,Permutation,GAIN,SHAP
Ranking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,CompetitionDistance,Promo,Promo
1,Store,Promo2,CompetitionDistance
2,Promo,CompetitionDistance,Store
3,DayOfWeek,Promo2SinceYear,Assortment
4,Assortment,Assortment,DayOfWeek
5,Promo2SinceYear,StoreType,Promo2SinceYear
6,StoreType,Store,Day
7,CompetitionOpenSinceYear,CompetitionOpenSinceYear,CompetitionOpenSinceYear
8,Day,DayOfWeek,StoreType
9,WeekOfYear,StateHoliday,WeekOfYear


In [61]:
# Absolute degradation: error after removing the top-k features minus the
# baseline error. Rows = number of removed features, columns = ranking method.
# The removal count is kept in the index while subtracting, so the baseline
# is only subtracted from the error columns and never from the label itself.
error_df = pd.DataFrame(
    [permutation_fi_errors, shap_values_fi_errors, xgboost_fi_errors],
    index=['Permutation', 'SHAP', 'GAIN'],
    columns=[1, 2, 3, 4],
).T
error_df = error_df - error
# Append the per-method mean as a final row. Label-based enlargement is used
# because DataFrame.append was removed in pandas 2.0. The mean is computed
# before rounding.
error_df.loc['Mean'] = error_df.mean()
error_df = error_df.round(5)
error_df.index.name = 'Fraction'
error_df.to_csv('Roar_4.csv')

In [91]:
# Relative degradation in percent: (error - baseline) / baseline * 100.
# Rows = number of removed features, columns = ranking method.
error_df = pd.DataFrame(
    [permutation_fi_errors, shap_values_fi_errors, xgboost_fi_errors],
    index=['Permutation', 'SHAP', 'GAIN'],
    columns=[1, 2, 3, 4],
).T
error_df = (error_df - error) * 100 / error
# Append the per-method mean as a final row. Label-based enlargement is used
# because DataFrame.append was removed in pandas 2.0. The mean is computed
# before rounding.
error_df.loc['Mean'] = error_df.mean()
error_df = error_df.round(2)
error_df.index.name = 'Fraction'

In [92]:
# Show the relative (percent) error increase per removal count and ranking method.
error_df

Unnamed: 0_level_0,Permutation,SHAP,GAIN
Fraction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.83,4.67,4.67
2,87.41,10.84,5.58
3,91.51,91.51,10.71
4,104.55,100.26,16.81
Mean,72.08,51.82,9.44


In [94]:
# Persist the relative error table next to the notebook.
error_df.to_csv('Roar_4_relative.csv')