In [17]:
# 为这个项目导入需要的库
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # 允许为DataFrame使用display()
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import plot_importance
%matplotlib inline

In [18]:
#!conda install -y -c conda-forge xgboost

In [19]:
data_train = pd.read_csv("train.csv",parse_dates=True,index_col='Date')
data_store = pd.read_csv("store.csv")
data_test=pd.read_csv('test.csv',parse_dates=True,index_col='Date')

Train Data处理

In [20]:
data_store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [21]:
#提取特征
data_train['Year']=data_train.index.year
data_train['Month'] = data_train.index.month
data_train['Day'] =data_train.index.day
data_train['WeekOfYear'] = data_train.index.weekofyear

#边缘数据清除
data_train=data_train[data_train.Sales<32000]

#转换Categorical特征
label_encoder = LabelEncoder()
data_train['StateHoliday']=label_encoder.fit_transform(data_train['StateHoliday'].astype(str))

#Delete rows with Sales is 0
data_train=data_train[data_train.Sales!=0]

#Delete the feature as it is not exist in Test set.
data_train.drop(['Customers'],axis=1,inplace=True)

Store Data处理

In [22]:
#转换Categorical特征
data_store['StoreType']=label_encoder.fit_transform(data_store['StoreType'])
data_store['Assortment']=label_encoder.fit_transform(data_store['Assortment'])
#data_store['PromoInterval']=label_encoder.fit_transform((data_store['PromoInterval']).astype(str))

#填充missing value
data_store['CompetitionDistance']=data_store.CompetitionDistance.fillna(data_store.CompetitionDistance.median())
#data_store['CompetitionDistance']=data_store.CompetitionDistance.fillna(0)


Store Train 数据合并及相关新特征的生成

In [23]:
#数据合并
data_train_all=data_train.merge(right=data_store,on='Store')


#创建新特征
data_train_all['CompetitionPeriod'] = 12 * (data_train_all.Year - data_train_all.CompetitionOpenSinceYear) + \
        (data_train_all.Month - data_train_all.CompetitionOpenSinceMonth)
    

data_train_all['PromoPeriod'] = 12 * (data_train_all.Year - data_train_all.Promo2SinceYear) + \
        (data_train_all.WeekOfYear - data_train_all.Promo2SinceWeek) / 4.0

#填充missing value
data_train_all=data_train_all.fillna(0)

data_train_all.drop(['Open'],axis=1,inplace=True)

month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
data_train_all['monthStr'] = data_train_all.Month.map(month2str)
data_train_all.loc[data_train_all.PromoInterval == 0, 'PromoInterval'] = ''
data_train_all['IsPromoMonth'] = 0
for interval in data_train_all.PromoInterval.unique():
    if interval != '':
        for month in interval.split(','):
            data_train_all.loc[(data_train_all.monthStr == month) & (data_train_all.PromoInterval == interval), 'IsPromoMonth'] = 1

del data_train_all['PromoInterval']
del data_train_all['monthStr']

测试数据处理

In [24]:
data_test['Year']=data_test.index.year
data_test['Month'] = data_test.index.month
data_test['Day'] =data_test.index.day
data_test['WeekOfYear'] = data_test.index.weekofyear

data_test.drop(['Open'],axis=1,inplace=True)
data_test_all=data_test.merge(right=data_store,on='Store')
data_test_all=data_test_all.set_index('Id')



# competition open time (in months)
data_test_all['CompetitionPeriod'] = 12 * (data_test_all.Year - data_test_all.CompetitionOpenSinceYear) + \
        (data_test_all.Month - data_test_all.CompetitionOpenSinceMonth)
    
# Promo open time
data_test_all['PromoPeriod'] = 12 * (data_test_all.Year - data_test_all.Promo2SinceYear) + \
        (data_test_all.WeekOfYear - data_test_all.Promo2SinceWeek) / 4.0

data_test_all=data_test_all.fillna(0)

data_test_all['StateHoliday']=label_encoder.fit_transform(data_test_all['StateHoliday'])


month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',              7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
data_test_all['monthStr'] = data_test_all.Month.map(month2str)
data_test_all.loc[data_test_all.PromoInterval == 0, 'PromoInterval'] = ''
data_test_all['IsPromoMonth'] = 0
for interval in data_test_all.PromoInterval.unique():
    if interval != '':
        for month in interval.split(','):
            data_test_all.loc[(data_test_all.monthStr == month) & (data_test_all.PromoInterval == interval), 'IsPromoMonth'] = 1

del data_test_all['PromoInterval']
del data_test_all['monthStr']

In [25]:
data_train_all_x=data_train_all.drop('Sales',axis=1)

In [26]:
# define rmspe for xgb(code from https://www.kaggle.com/cast42/xgboost-in-python-with-rmspe-v2/code)
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)

In [27]:
data_train_all_y=data_train_all.Sales

X_train, X_valid, y_train, y_valid = train_test_split(data_train_all_x,data_train_all_y,test_size=0.1,random_state=1)


In [29]:
X_valid

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,CompetitionPeriod,PromoPeriod,IsPromoMonth
784130,1038,3,1,0,0,2013,4,10,15,3,0,17290.0,10.0,2013.0,0,0.0,0.0,-6.0,0.00,0
386549,512,4,0,0,0,2015,4,23,17,1,1,590.0,0.0,0.0,1,5.0,2013.0,0.0,27.00,0
683898,906,3,0,0,1,2013,1,2,1,0,0,90.0,7.0,2010.0,0,0.0,0.0,30.0,0.00,0
592028,784,1,1,0,0,2014,3,17,12,0,0,560.0,10.0,2014.0,1,10.0,2014.0,-7.0,0.50,0
70319,93,2,0,0,0,2013,6,25,26,0,0,16690.0,0.0,0.0,1,14.0,2011.0,0.0,27.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20772,28,4,1,0,0,2014,12,4,49,0,0,1200.0,10.0,2014.0,1,6.0,2015.0,2.0,-1.25,1
34639,46,1,1,0,0,2013,5,13,20,2,0,1200.0,9.0,2005.0,1,14.0,2011.0,92.0,25.50,0
777576,1030,2,0,0,1,2014,10,28,44,0,0,36410.0,4.0,2008.0,0,0.0,0.0,78.0,0.00,0
715122,948,4,0,0,0,2015,3,26,13,1,1,1430.0,0.0,0.0,0,0.0,0.0,0.0,0.00,0


In [28]:
X_train.columns

Index(['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Year',
       'Month', 'Day', 'WeekOfYear', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'CompetitionPeriod', 'PromoPeriod', 'IsPromoMonth'],
      dtype='object')

In [89]:

#基准模型
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators = 15)
clf.fit(X_train, np.log1p(y_train))
# validation
y_pred = clf.predict(X_valid)
error = rmspe(y_valid, np.expm1(y_pred))
print('RMSPE: {:.4f}'.format(error))

RMSPE: 0.1431


In [None]:
import xgboost

dtrain = xgboost.DMatrix(X_train, np.log(y_train+1))
dtest = xgboost.DMatrix(X_valid, np.log(y_valid+1))

num_round = 20000
evallist = [(dtrain, 'train'), (dtest, 'test')]

param = {'bst:max_depth':12,
         'bst:eta':0.01,
         'subsample':0.8,
         
         'colsample_bytree':0.7,
         'min_child_weight':6,
         'objective':'reg:linear',
         'nthread':4,
         'seed':1}

plst = param.items()

bst = xgboost.train(plst, dtrain, num_round, evallist, feval=rmspe_xg, verbose_eval=250, early_stopping_rounds=250)

[0]	train-rmse:5.79493	test-rmse:5.79401	train-rmspe:0.99681	test-rmspe:0.99680
Multiple eval metrics have been passed: 'test-rmspe' will be used for early stopping.

Will train until test-rmspe hasn't improved in 250 rounds.
[250]	train-rmse:0.12056	test-rmse:0.12347	train-rmspe:0.18971	test-rmspe:0.14689
[500]	train-rmse:0.10298	test-rmse:0.10744	train-rmspe:0.17041	test-rmspe:0.13172
[750]	train-rmse:0.09600	test-rmse:0.10207	train-rmspe:0.16112	test-rmspe:0.12719
[1000]	train-rmse:0.09183	test-rmse:0.09907	train-rmspe:0.15301	test-rmspe:0.12282
[1250]	train-rmse:0.08870	test-rmse:0.09700	train-rmspe:0.13552	test-rmspe:0.11925
[1500]	train-rmse:0.08635	test-rmse:0.09550	train-rmspe:0.12255	test-rmspe:0.11656
[1750]	train-rmse:0.08450	test-rmse:0.09466	train-rmspe:0.11951	test-rmspe:0.11593
[2000]	train-rmse:0.08299	test-rmse:0.09393	train-rmspe:0.11875	test-rmspe:0.11462
[2250]	train-rmse:0.08166	test-rmse:0.09334	train-rmspe:0.11575	test-rmspe:0.11305
[2500]	train-rmse:0.08049	test

In [16]:
dta=xgboost.DMatrix(data_test_all)
submission= bst.predict(dta)
#indices = submission < 0
#submission[indices] = 0
#引用此处对于用于错误修正的权重 https://www.kaggle.com/xwxw2929/rossmann-sales-top1/notebook
sub = pd.DataFrame({"Id": data_test_all.index, "Sales": np.exp(submission*0.995)-1})
sub.to_csv("18SageMaker3v9-2.csv", index=False)

In [3]:

plot_importance(bst)
plt.show()

NameError: name 'bst' is not defined