In [1]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
import copy
from sklearn.metrics import mean_squared_error

from tqdm import tqdm_notebook
import torch
import torch.nn as nn
import torch.utils.data as Data

In [2]:
df_train = pd.read_csv(
    '../input/train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

df_test = pd.read_csv(
    "../input/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

items = pd.read_csv(
    "../input/items.csv",
).set_index("item_nbr")

df_2017 = df_train.loc[df_train.date>=pd.datetime(2017,1,1)]
del df_train

In [37]:
df_2017.head()

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,1.386294,0.693147,0.693147,0.693147,1.098612,0.0,0.0,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.693147,1.098612,0.0,1.098612,1.386294,0.693147,0.0,0.693147,0.693147,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,0.0,0.0,0.0,1.386294,1.098612,1.098612,0.693147,1.098612,0.0,2.079442,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,0.0,0.0,1.791759,2.564949,2.302585,1.94591,1.609438,1.098612,1.386294,2.302585,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


In [5]:
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

In [6]:
df_2017.head()

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,1.386294,0.693147,0.693147,0.693147,1.098612,0.0,0.0,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.693147,1.098612,0.0,1.098612,1.386294,0.693147,0.0,0.693147,0.693147,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,0.0,0.0,0.0,1.386294,1.098612,1.098612,0.693147,1.098612,0.0,2.079442,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,0.0,0.0,1.791759,2.564949,2.302585,1.94591,1.609438,1.098612,1.386294,2.302585,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


In [7]:
items = items.reindex(df_2017.index.get_level_values(1))

In [None]:
# 准备数据

In [8]:
def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "mean_30_2017": get_timespan(df_2017, t2017, 30, 30).mean(axis=1).values,
        "mean_60_2017": get_timespan(df_2017, t2017, 60, 60).mean(axis=1).values,
        "mean_140_2017": get_timespan(df_2017, t2017, 140, 140).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_2017, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_2017, t2017, 140, 140).sum(axis=1).values
    })
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [9]:
print("Preparing dataset...")
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [10]:
stores_items = pd.DataFrame(index=df_2017.index)

In [11]:
test_ids = df_test[['id']]

In [12]:
items = items.reindex( stores_items.index.get_level_values(1) )

In [13]:
X_train.shape

(1005090, 40)

In [14]:
X_train = X_train.as_matrix()
X_test = X_test.as_matrix()
X_val = X_val.as_matrix()
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)

(1005090, 1, 40)
(167515, 1, 40)
(167515, 1, 40)


# pytorch构建lstm

In [29]:
EPOCH = 5               # train the training data n times, to save time, we just train 1 epoch
BATCH_SIZE = 512
TIME_STEP = 1         # rnn time step / image height
INPUT_SIZE = 40         # rnn input size / image width
LR = 0.001               # learning rate

num_workers = 4

val_pred = []
test_pred = []

## 构建模型

In [30]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()

        self.rnn = nn.LSTM(         # if use nn.RNN(), it hardly learns
            input_size=INPUT_SIZE,
            hidden_size=32,         # rnn hidden unit
            num_layers=1,           # number of rnn layer
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )
        
        self.out = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(32, 32),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size)
        # h_c shape (n_layers, batch, hidden_size)
        r_out, (h_n, h_c) = self.rnn(x, None)   # None represents zero initial hidden state
        
        # choose r_out at the last time step
        out = self.out(r_out[:, -1, :])
        return out

In [31]:
rnn = RNN()
print(rnn)

RNN(
  (rnn): LSTM(40, 32, batch_first=True)
  (out): Sequential(
    (0): Dropout(p=0.1)
    (1): Linear(in_features=32, out_features=32, bias=True)
    (2): Dropout(p=0.2)
    (3): Linear(in_features=32, out_features=1, bias=True)
  )
)


## 构建自定义损失函数

In [32]:
# pytorch自定义损失函数 Normalized Weighted Root Mean Squared Logarithmic Error(NWRMSLE)
# 这里y真实值需要提前进行log1p的操作
from torch.functional import F

class my_rmseloss(nn.Module):
    
    def __init__(self):
        super(my_rmseloss, self).__init__()
        return 
    
    def forward(self, input, target, sample_weights=None):
        self._assert_no_grad(target)
        f_revis = lambda a, b, w: ((a - b) ** 2) * w # 重写
        return self._pointwise_loss(f_revis, torch._C._nn.mse_loss,
                           input, target, sample_weights)
    
    # 重写_pointwise_loss
    def _pointwise_loss(self, lambd, lambd_optimized, input, target, sample_weights):
        if target.requires_grad:
            d = lambd(input, target, sample_weights)
#             return torch.sqrt(torch.div(torch.sum(d), torch.sum(sample_weights)))
            return torch.div(torch.sum(d), torch.sum(sample_weights))
        else:
            if sample_weights is not None:
                unrooted_res = torch.div(torch.sum(torch.mul(lambd_optimized(input, target),sample_weights)),torch.sum(sample_weights))
#                 return torch.sqrt(unrooted_res)
                return unrooted_res
            return lambd_optimized(input, target, 1)
    
    def _assert_no_grad(self, tensor):
        assert not tensor.requires_grad, \
            "nn criterions don't compute the gradient w.r.t. targets - please " \
            "mark these tensors as not requiring gradients"

In [33]:
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all cnn parameters
# loss_func = nn.MSELoss()                       # the target label is not one-hotted
loss_func = my_rmseloss() # self define loss

## 数据处理成tensor，并跑lstm模型

In [None]:
# numpy数据处理成tensor

trainX_tensor = torch.from_numpy(X_train).type(torch.FloatTensor)
valX_tensor = torch.from_numpy(X_val).type(torch.FloatTensor)
testX_tesnsor = torch.from_numpy(X_test).type(torch.FloatTensor)

# sample weights
sample_weights=np.array(pd.concat([items["perishable"]] * 6) * 0.25 + 1)
sample_weights_train_tensor = torch.from_numpy(sample_weights).type(torch.FloatTensor)
sample_weights_val_tensor = torch.from_numpy(np.array(items["perishable"] * 0.25 + 1)).type(torch.FloatTensor)

# 总共要预测16列
for i in tqdm_notebook(range(16)):
    
    best_model = None
    val_loss_prev = 9999

    # 预测16个日期的销量，每个i是一个日期
    trainY_tensor = torch.from_numpy(y_train[:,i]).type(torch.FloatTensor)
    valY_tensor = torch.from_numpy(y_val[:,i]).type(torch.FloatTensor)

    # 组装成dataset，到时候放入dataloader(放入dataloader是为了进行批训练)
    torch_dataset = Data.TensorDataset(trainX_tensor, trainY_tensor, sample_weights_train_tensor)
    train_loader = Data.DataLoader(
        dataset=torch_dataset,      # torch TensorDataset format
        batch_size=BATCH_SIZE,      # mini batch size
        shuffle=True,               # random shuffle for training
        num_workers=num_workers,              # subprocesses for loading data
    )
    
    for epoch in range(EPOCH):
        for step, (b_x, b_y, sample_w) in enumerate(train_loader):        # gives batch data

            output = rnn(b_x).squeeze()                     # rnn output
            loss = loss_func(output, b_y, sample_weights=sample_w)         # calc NWRMSE loss
            optimizer.zero_grad()                           # clear gradients for this training step
            loss.backward()                                 # backpropagation, compute gradients
            optimizer.step()                                # apply gradients

        val_output = rnn(valX_tensor).squeeze()                   # (samples, time_step, input_size)
        # 然后比较一下验证集的输出和真实值算mse
        val_loss = loss_func(val_output, valY_tensor, sample_weights=sample_weights_val_tensor)
        val_loss = float(val_loss.detach().data.numpy())
        if  val_loss < val_loss_prev:
            best_model = copy.deepcopy(rnn)
            val_loss_prev = val_loss
        print('Epoch: ', epoch, '| train loss: %.4f' % loss.data.numpy(), '| val loss: %.4f' % val_loss)
    val_pred.append(best_model(valX_tensor).squeeze().data.numpy())
    test_pred.append(best_model(testX_tesnsor).squeeze().data.numpy())

Epoch:  0 | train loss: 0.2754 | val loss: 0.3264
Epoch:  1 | train loss: 0.3201 | val loss: 0.3200
Epoch:  2 | train loss: 0.3005 | val loss: 0.3105


In [None]:
y_test = np.array(test_pred).squeeze().transpose()
df_preds = pd.DataFrame(
    y_test, index=stores_items.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")

df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
submission = test_ids.join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('../output/lstm_v5.csv', float_format='%.4f', index=None)

# 数据封装 (不用管)

In [None]:
trainX_tensor = torch.from_numpy(X_train).type(torch.FloatTensor)
trainY_tensor = torch.from_numpy(y_train[:,0]).type(torch.FloatTensor)
valX_tensor = torch.from_numpy(X_val).type(torch.FloatTensor)
valY_tensor = torch.from_numpy(y_val[:,0]).type(torch.FloatTensor)
testX_tesnsor = torch.from_numpy(X_test).type(torch.FloatTensor)

sample_weights=np.array(pd.concat([items["perishable"]] * 6) * 0.25 + 1)
sample_weights_train_tensor = torch.from_numpy(sample_weights).type(torch.FloatTensor)
sample_weights_val_tensor = torch.from_numpy(np.array(items["perishable"] * 0.25 + 1)).type(torch.FloatTensor)

In [None]:
# 组装成dataset，到时候放入dataloader(放入dataloader是为了进行批训练)
torch_dataset = Data.TensorDataset(trainX_tensor, trainY_tensor, sample_weights_train_tensor)
train_loader = Data.DataLoader(
    dataset=torch_dataset,      # torch TensorDataset format
    batch_size=BATCH_SIZE,      # mini batch size
    shuffle=True,               # random shuffle for training
    num_workers=num_workers,              # subprocesses for loading data
)