In [180]:
import math
import warnings

import matplotlib_inline
import matplotlib as plt
import seaborn as sns
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier

warnings.filterwarnings("ignore")

In [181]:
# Raw inputs. Only the columns needed for modelling are read from the training file.
df_trn = pd.read_csv(
    "../dataset/train.csv",
    usecols=['store_nbr', 'family', 'date', 'sales'],
)
# Auxiliary tables, joined onto the feature matrix in later cells.
df_store = pd.read_csv("../dataset/stores.csv")
df_oil = pd.read_csv("../dataset/oil.csv")
df_holiday = pd.read_csv("../dataset/holidays_events.csv")

In [182]:
# Store each day as a daily Period, then key the frame by (store_nbr, family, date).
daily_period = pd.to_datetime(df_trn["date"]).dt.to_period('D')
df_trn["date"] = daily_period
df_trn = (
    df_trn
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)
display(df_trn.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-01,0.0
1,AUTOMOTIVE,2013-01-02,2.0
1,AUTOMOTIVE,2013-01-03,3.0
1,AUTOMOTIVE,2013-01-04,3.0
1,AUTOMOTIVE,2013-01-05,5.0


In [183]:
# Wide target matrix: one row per date, one column per (store_nbr, family) series.
series_levels = ['store_nbr', 'family']
df_trnY = df_trn.unstack(level=series_levels)
display(df_trnY.head())

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,54,54,54,54,54,54,54,54,54,54
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-02,2.0,0.0,2.0,1091.0,0.0,470.652,0.0,1060.0,579.0,164.069,...,0.0,73.771,228.0,0.0,0.0,15.514,61.0,0.0,0.0,3.0
2013-01-03,3.0,0.0,0.0,919.0,0.0,310.655,0.0,836.0,453.0,151.582,...,0.0,50.257,156.0,0.0,0.0,4.313,1.0,0.0,0.0,2.0
2013-01-04,3.0,0.0,3.0,953.0,0.0,198.366,0.0,827.0,460.0,131.411,...,0.0,40.223,146.0,0.0,0.0,26.743,38.0,0.0,0.0,2.0
2013-01-05,5.0,0.0,3.0,1160.0,0.0,301.057,0.0,811.0,464.0,118.613,...,0.0,43.431,205.0,0.0,0.0,31.118,32.0,0.0,0.0,1.0


In [184]:
# Parse the date column of both auxiliary tables so they can be merged on Timestamp keys.
for aux_frame in (df_oil, df_holiday):
    aux_frame["date"] = pd.to_datetime(aux_frame["date"])

df_holidayを全国(National)の祝日に絞り、同一日付の重複を解除

In [185]:
# Keep national events only, then collapse to one row per date (first entry wins).
is_national = df_holiday["locale"] == "National"
df_holiday = (
    df_holiday.loc[is_national, ["date", "type"]]
    .groupby("date")
    .first()
    .reset_index()
)


df_trnXの作成

In [186]:
# Deterministic time features: a linear trend plus two weekly Fourier pairs,
# generated on the same date index as the target matrix.
fourier = CalendarFourier(freq='W', order=2)
dp = DeterministicProcess(index=df_trnY.index,
                          constant=False,
                          order=1,
                          seasonal=False,
                          additional_terms=[fourier],
                          drop=True)
df_trnX = dp.in_sample().reset_index("date")

# df_oil / df_holiday are keyed by Timestamp, so convert the Period dates before merging.
df_trnX["date"] = df_trnX["date"].dt.to_timestamp()

# `validate` makes a duplicated date on the right-hand side fail loudly instead of
# silently duplicating training rows.
df_trnX = df_trnX.merge(df_oil, on="date", how="left", validate="many_to_one")
df_trnX = df_trnX.merge(df_holiday[["date", "type"]], on="date", how="left", validate="many_to_one")
assert len(df_trnX) == len(df_trnY), "feature rows must stay aligned with target rows"

# Day of week (Monday=0 ... Sunday=6)
df_trnX["DoW"] = df_trnX["date"].dt.dayofweek

# Fill gaps in the oil price with the next available quote. Assign the result back:
# an in-place fillna on a column selection is not guaranteed to modify the frame.
# The trailing ffill only matters if the series ends with missing values.
df_trnX["dcoilwtico"] = df_trnX["dcoilwtico"].bfill().ffill()

df_trnX = df_trnX.set_index("date")
display(df_trnX.head())

休みを表すisHoliday列を追加

In [191]:
# Day-off flag: weekends are off by default, a "Work Day" entry forces 0 and a
# "Holiday" entry forces 1. Other event types leave the weekend default untouched.
# NOTE(review): "Additional" / "Bridge" / "Transfer" are not treated as days off here - confirm intended.
event_type = df_trnX["type"]
day_off = (df_trnX["DoW"] > 4).astype("int64")
day_off = day_off.mask(event_type == "Work Day", 0)
day_off = day_off.mask(event_type == "Holiday", 1)
df_trnX["isHoliday"] = day_off
display(df_trnX.head())

Unnamed: 0_level_0,dcoilwtico,type,DoW,isHoliday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-01,93.14,Holiday,1,1
2013-01-02,93.14,,2,0
2013-01-03,92.97,,3,0
2013-01-04,93.12,,4,0
2013-01-05,93.2,Work Day,5,0


カテゴリ変数 => ダミー変数

In [192]:
# One-hot encode the categorical columns. Day of week drops its first level;
# the holiday type keeps every level (missing type -> all zeros).
df_trnX = (
    df_trnX
    .pipe(pd.get_dummies, columns=["DoW"], drop_first=True)
    .pipe(pd.get_dummies, columns=["type"], drop_first=False)
)
display(df_trnX.head())

Unnamed: 0_level_0,dcoilwtico,isHoliday,DoW_1,DoW_2,DoW_3,DoW_4,DoW_5,DoW_6,type_Additional,type_Bridge,type_Event,type_Holiday,type_Transfer,type_Work Day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-01,93.14,1,1,0,0,0,0,0,0,0,0,1,0,0
2013-01-02,93.14,0,0,1,0,0,0,0,0,0,0,0,0,0
2013-01-03,92.97,0,0,0,1,0,0,0,0,0,0,0,0,0
2013-01-04,93.12,0,0,0,0,1,0,0,0,0,0,0,0,0
2013-01-05,93.2,0,0,0,0,0,1,0,0,0,0,0,0,1


### モデル作成

In [None]:
# Ridge baseline fitted jointly on all (store_nbr, family) target columns.
# The `normalize=True` argument no longer exists in current scikit-learn, so the
# scaling is done explicitly. Per the scikit-learn deprecation guidance, the old
# behaviour corresponds to StandardScaler(with_mean=False) with alpha multiplied
# by the number of training samples.
ridge_alpha = 0.5
model = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(fit_intercept=True, solver="auto", alpha=ridge_alpha * len(df_trnX)),
)
model.fit(df_trnX, df_trnY)

# In-sample predictions, laid out like the target matrix.
y_pred = pd.DataFrame(model.predict(df_trnX), index=df_trnX.index, columns=df_trnY.columns)

In [None]:
# Long-format predictions and targets. The wide `y_pred` is left untouched so this
# cell can be re-run without refitting the model.
series_levels = ['store_nbr', 'family']
y_pred_long = y_pred.stack(series_levels).reset_index()
y_target = df_trnY.stack(series_levels).reset_index()

# Predictions are attached by row position, so check that both frames list the
# series in the same order.
assert y_pred_long[series_levels].equals(y_target[series_levels]), "prediction/target rows are misaligned"

# MSLE requires non-negative values, so clip negative predictions to zero.
y_target['sales_pred'] = y_pred_long['sales'].clip(lower=0.)

# In-sample mean squared log error per product family.
y_target.groupby('family').apply(lambda grp: mean_squared_log_error(grp['sales'], grp['sales_pred']))

family
AUTOMOTIVE                     0.376546
BABY CARE                      0.050620
BEAUTY                         0.312066
BEVERAGES                      2.503103
BOOKS                          0.036731
BREAD/BAKERY                   1.401841
CELEBRATION                    1.334512
CLEANING                       1.887497
DAIRY                          1.611794
DELI                           1.109775
EGGS                           0.898139
FROZEN FOODS                   0.987379
GROCERY I                      2.835772
GROCERY II                     0.541499
HARDWARE                       0.295215
HOME AND KITCHEN I             1.875071
HOME AND KITCHEN II            1.273589
HOME APPLIANCES                0.158751
HOME CARE                      7.499204
LADIESWEAR                     1.147582
LAWN AND GARDEN                0.482171
LINGERIE                       0.547123
LIQUOR,WINE,BEER               2.177946
MAGAZINES                      0.459100
MEATS                          1.

In [None]:
class SalesRegressor():
    """Multi-target regressor that fits one RandomForestRegressor per target column.

    Parameters
    ----------
    n_jobs : int, default -1
        Passed to joblib.Parallel for fitting and predicting across target columns.
    verbose : int, default 0
        Passed to joblib.Parallel.

    Attributes
    ----------
    estimators_ : list or None
        One fitted forest per target column, in column order; None until `fit` is called.
    """

    def __init__(self, n_jobs=-1, verbose=0):
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.estimators_ = None
        
    def _estimator_(self, X, y):
        """Fit and return a forest for a single target column `y`."""
        # NOTE(review): the forest also requests n_jobs=-1 while being fitted inside an
        # outer Parallel call; check for CPU oversubscription on large runs.
        model = RandomForestRegressor(n_estimators = 100, n_jobs=-1, random_state=1)
        model.fit(X, y)
        return model

    def fit(self, X, y):
        """Fit one forest per column of `y`.

        `y` is indexed with `.iloc[:, i]`, so it must be a DataFrame-like object
        with one column per target. Returns `self` so calls can be chained.
        """
        self.estimators_ = Parallel(n_jobs=self.n_jobs, 
                              verbose=self.verbose,
                              )(delayed(self._estimator_)(X, y.iloc[:, i]) for i in range(y.shape[1]))
        return self
    
    def predict(self, X):
        """Return an array whose axis 1 follows the target columns seen in `fit`.

        Raises
        ------
        RuntimeError
            If called before `fit`.
        """
        if self.estimators_ is None:
            raise RuntimeError("SalesRegressor is not fitted yet; call fit(X, y) before predict(X).")
        y_pred = Parallel(n_jobs=self.n_jobs, 
                          verbose=self.verbose)(delayed(e.predict)(X) for e in self.estimators_)
        return np.stack(y_pred, axis=1)

In [None]:
# Per-series random forests, then in-sample predictions laid out like the target matrix.
model = SalesRegressor()
model.fit(df_trnX, df_trnY)
train_pred = model.predict(df_trnX)
y_pred = pd.DataFrame(
    train_pred,
    index=df_trnX.index,
    columns=df_trnY.columns,
)

In [None]:
# Long-format predictions and targets. The wide `y_pred` is left untouched so this
# cell can be re-run without refitting the model.
series_levels = ['store_nbr', 'family']
y_pred_long = y_pred.stack(series_levels).reset_index()
y_target = df_trnY.stack(series_levels).reset_index()

# Predictions are attached by row position, so check that both frames list the
# series in the same order.
assert y_pred_long[series_levels].equals(y_target[series_levels]), "prediction/target rows are misaligned"

# MSLE requires non-negative values, so clip negative predictions to zero.
y_target['sales_pred'] = y_pred_long['sales'].clip(lower=0.)

# In-sample mean squared log error per product family.
y_target.groupby('family').apply(lambda grp: mean_squared_log_error(grp['sales'], grp['sales_pred']))

family
AUTOMOTIVE                    0.069244
BABY CARE                     0.007092
BEAUTY                        0.051129
BEVERAGES                     0.151319
BOOKS                         0.002850
BREAD/BAKERY                  0.084005
CELEBRATION                   0.062123
CLEANING                      0.126641
DAIRY                         0.097433
DELI                          0.068914
EGGS                          0.067802
FROZEN FOODS                  0.090327
GROCERY I                     0.176110
GROCERY II                    0.083919
HARDWARE                      0.055705
HOME AND KITCHEN I            0.097664
HOME AND KITCHEN II           0.049204
HOME APPLIANCES               0.023370
HOME CARE                     0.207017
LADIESWEAR                    0.048918
LAWN AND GARDEN               0.064276
LINGERIE                      0.095562
LIQUOR,WINE,BEER              0.295076
MAGAZINES                     0.036056
MEATS                         0.079860
PERSONAL CARE     

### テストデータの作成

In [None]:
# Test keys only (no sales column), indexed like the training frame but with Timestamp dates.
df_test = (
    pd.read_csv("../dataset/test.csv", usecols=['store_nbr', 'family', 'date'])
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)
display(df_test.head())

store_nbr,family,date
1,AUTOMOTIVE,2017-08-16
1,AUTOMOTIVE,2017-08-17
1,AUTOMOTIVE,2017-08-18
1,AUTOMOTIVE,2017-08-19
1,AUTOMOTIVE,2017-08-20


****

In [None]:
# Test features are built with the same recipe as df_trnX and then aligned to the
# training columns, so the model always sees the same columns in the same order.

# Forecast horizon = number of distinct dates in the test set.
test_dates = df_test.index.get_level_values("date").unique().sort_values()
df_testX = dp.out_of_sample(steps=len(test_dates))

# Timestamp version of the forecast dates, used as the merge key below.
df_testX["date"] = df_testX.index.to_timestamp()
assert (df_testX["date"].to_numpy() == test_dates.to_numpy()).all(), \
    "forecast dates do not match the dates in df_test"
df_testX.info()

df_testX = df_testX.merge(df_oil, on="date", how="left")
# Same national-holiday lookup as in training; dates without an entry get NaN.
df_testX = df_testX.merge(df_holiday[["date", "type"]], on="date", how="left")

# Day of Week
df_testX["DoW"] = df_testX["date"].dt.dayofweek
# Assign the filled series back (an in-place fillna on a column selection is not
# guaranteed to modify the frame); the trailing ffill covers a missing last quote.
df_testX["dcoilwtico"] = df_testX["dcoilwtico"].bfill().ffill()

# Day-off flag, same rules as for the training features.
df_testX["isHoliday"] = 0
df_testX.loc[df_testX["DoW"] > 4, "isHoliday"] = 1
df_testX.loc[df_testX["type"] == "Work Day", "isHoliday"] = 0
df_testX.loc[df_testX["type"] == "Holiday", "isHoliday"] = 1

# Encode every level present, then let the reindex drop or add columns so the result
# matches the training matrix. Levels absent from the test window become all-zero
# columns, and the dropped reference level is decided by training, not by the test dates.
df_testX = pd.get_dummies(df_testX, columns=["DoW", "type"])
df_testX = df_testX.set_index("date").reindex(columns=df_trnX.columns, fill_value=0)

display(df_testX.head())

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 16 entries, 2017-08-16 to 2017-08-31
Freq: D
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   trend              16 non-null     float64       
 1   sin(1,freq=W-SUN)  16 non-null     float64       
 2   cos(1,freq=W-SUN)  16 non-null     float64       
 3   sin(2,freq=W-SUN)  16 non-null     float64       
 4   cos(2,freq=W-SUN)  16 non-null     float64       
 5   date               16 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(5)
memory usage: 896.0 bytes
None


Unnamed: 0_level_0,trend,"sin(1,freq=W-SUN)","cos(1,freq=W-SUN)","sin(2,freq=W-SUN)","cos(2,freq=W-SUN)",dcoilwtico,isHoliday,DoW_1,DoW_2,DoW_3,DoW_4,DoW_5,DoW_6,type_Additional,type_Bridge,type_Event,type_Holiday,type_Transfer,type_Work Day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-08-16,1685.0,0.974928,-0.222521,-0.433884,-0.900969,46.8,0,0,1,0,0,0,0,0,0,0,0,0,0
2017-08-17,1686.0,0.433884,-0.900969,-0.781831,0.62349,47.07,0,0,0,1,0,0,0,0,0,0,0,0,0
2017-08-18,1687.0,-0.433884,-0.900969,0.781831,0.62349,48.59,0,0,0,0,1,0,0,0,0,0,0,0,0
2017-08-19,1688.0,-0.974928,-0.222521,0.433884,-0.900969,47.39,1,0,0,0,0,1,0,0,0,0,0,0,0
2017-08-20,1689.0,-0.781831,0.62349,-0.974928,-0.222521,47.39,1,0,0,0,0,0,1,0,0,0,0,0,0


In [None]:
# Wide predictions for the test horizon, one column per (store_nbr, family).
test_pred = model.predict(df_testX)
sales_pred = pd.DataFrame(test_pred, index=df_testX.index, columns=df_trnY.columns)
display(sales_pred.head())

# Long format: one row per (date, store_nbr, family).
sales_pred = sales_pred.stack(['store_nbr', 'family'])
display(sales_pred.head())

# Sales cannot be negative; floor the predictions at zero.
sales_pred = sales_pred.clip(lower=0.)
display(sales_pred.head())

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,54,54,54,54,54,54,54,54,54,54
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-08-16,4.65,0.0,4.35,2035.82,0.03,393.504316,14.29,751.47,815.09,135.71572,...,1.56,48.39148,146.6,0.05,2.11,48.109291,93.63,564.76955,0.0,2.25
2017-08-17,4.23,0.0,5.32,2043.23,0.03,307.304314,15.5,649.75,588.19,120.56346,...,1.47,56.83619,130.69,0.04,2.3,53.78242,75.81,511.42945,0.0,1.95
2017-08-18,4.67,0.0,4.63,2126.37,0.06,327.621961,24.53,777.12,683.67,176.59122,...,0.88,46.6785,149.11,0.07,2.88,68.16715,67.09,564.57807,0.0,0.83
2017-08-19,5.08,0.0,4.08,1916.35,0.0,279.212131,9.49,441.39,599.49,112.71271,...,2.28,54.727801,176.37,0.0,3.36,82.14627,85.06,701.44318,0.0,1.96
2017-08-20,1.34,0.0,1.65,884.64,0.0,137.64486,1.77,197.88,311.46,52.55268,...,2.06,68.79772,264.93,0.0,2.32,85.682548,106.5,869.67369,0.0,2.24


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
date,store_nbr,family,Unnamed: 3_level_1
2017-08-16,1,AUTOMOTIVE,4.65
2017-08-16,1,BABY CARE,0.0
2017-08-16,1,BEAUTY,4.35
2017-08-16,1,BEVERAGES,2035.82
2017-08-16,1,BOOKS,0.03


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
date,store_nbr,family,Unnamed: 3_level_1
2017-08-16,1,AUTOMOTIVE,4.65
2017-08-16,1,BABY CARE,0.0
2017-08-16,1,BEAUTY,4.35
2017-08-16,1,BEVERAGES,2035.82
2017-08-16,1,BOOKS,0.03


In [None]:
# Build the submission by joining predictions to test ids on (date, store_nbr, family).
# Writing `sales_pred.values` positionally would only be correct if the submission rows
# were sorted exactly like the stacked predictions, which is not guaranteed.
df_output = pd.read_csv('../dataset/sample_submission.csv', index_col='id')

series_keys = ['date', 'store_nbr', 'family']
test_keys = pd.read_csv("../dataset/test.csv", usecols=['id'] + series_keys, parse_dates=['date'])
pred_by_id = (
    test_keys
    .merge(sales_pred.reset_index(), on=series_keys, how='left', validate='one_to_one')
    .set_index('id')['sales']
)

# Look up each submission id; a missing prediction is an error, not a silent NaN.
df_output["sales"] = pred_by_id.reindex(df_output.index)
assert df_output["sales"].notna().all(), "some submission ids have no prediction"

df_output.to_csv('./output.csv')