In [1]:
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import restore_demand_beta as dem

# Чтение таблиц

In [67]:
df = pd.read_csv("MERGE_TABLE_STORE_4600.csv", sep=";")

In [68]:
# Товары

In [69]:
id_list = df.product_id.value_counts().nlargest(40).index.tolist()

In [70]:
print(id_list)

[555800, 616400, 564900, 582700, 404500, 589400, 582800, 1518900, 835000, 587400, 617400, 819800, 1843100, 631500, 7562300, 11637400, 3539700, 3540400, 12906800, 4095600, 886100, 4212800, 706600, 9339400, 589700, 625700, 560100, 559800, 490400, 1617800, 744200, 720500, 4285500, 615200, 1453400, 4043300, 571300, 808700, 101300, 6783400]


# Перевод даты

In [71]:
def get_date_df(df, id_list):
    df_model = df[df.product_id.isin(id_list)]
    df_model.loc[:, "curr_date"] = pd.to_datetime(df_model.curr_date)
    df_model = df_model.set_index("curr_date")
    return df_model

# Создаём датасет для трейна

In [72]:
def df_preprocessing(df_train):
    # add data as a feature
    df_train["data"] = df_train.index

    # add dates
    df_train["day"] = pd.to_datetime(df_train["data"]).dt.day
    df_train["weekday"] = pd.to_datetime(df_train["data"]).dt.weekday
    df_train["month"] = pd.to_datetime(df_train["data"]).dt.month
    df_train["year"] = pd.to_datetime(df_train["data"]).dt.year

    # add Deficit days
    df_train["deficit"] = np.where(
        (df_train["stock"] <= 0)
        | (df_train["stock"] <= df_train["s_qty"])
        | (df_train["stock"].isna())
        | (df_train["s_qty"].isna()),
        1,
        0,
    )

    return df_train

In [73]:
def df_preprocess_windows(df_train):
    # Без сдвига
    df_train["s_qty win15"] = (
        df_train["s_qty"]
        .rolling(15, min_periods=1, center=True)
        .apply(np.nanmean)
    )
    df_train["s_qty win15"].fillna(method="ffill", inplace=True)
    df_train["s_qty win15"].fillna(0, inplace=True)

    # Сдвиг на 7 дней назад
    df_train["s_qty win15 m7"] = (
        df_train["s_qty"]
        .rolling(15, min_periods=1, center=True)
        .apply(np.nanmean)
        .shift(-7)
    )
    df_train["s_qty win15 m7"].fillna(method="ffill", inplace=True)
    df_train["s_qty win15 m7"].fillna(0, inplace=True)

    # Сдвиг на 7 дней вперёд
    df_train["s_qty win15 p7"] = (
        df_train["s_qty"]
        .rolling(15, min_periods=1, center=True)
        .apply(np.nanmean)
        .shift(7)
    )
    df_train["s_qty win15 p7"].fillna(method="ffill", inplace=True)
    df_train["s_qty win15 p7"].fillna(0, inplace=True)

    # Без сдвига
    df_train["s_qty win7"] = (
        df_train["s_qty"]
        .rolling(7, min_periods=1, center=True)
        .apply(np.nanmean)
    )
    df_train["s_qty win7"].fillna(method="ffill", inplace=True)
    df_train["s_qty win7"].fillna(0, inplace=True)

    # Сдвиг на 7 дней назад
    df_train["s_qty win7 m7"] = (
        df_train["s_qty"]
        .rolling(7, min_periods=1, center=True)
        .apply(np.nanmean)
        .shift(-7)
    )
    df_train["s_qty win7 m7"].fillna(method="ffill", inplace=True)
    df_train["s_qty win7 m7"].fillna(0, inplace=True)

    # Сдвиг на 7 дней вперёд
    df_train["s_qty win7 p7"] = (
        df_train["s_qty"]
        .rolling(7, min_periods=1, center=True)
        .apply(np.nanmean)
        .shift(7)
    )
    df_train["s_qty win7 p7"].fillna(method="ffill", inplace=True)
    df_train["s_qty win7 p7"].fillna(0, inplace=True)

    return df_train

# Выберем магазины для обучения

In [9]:
shops_list = [16500]

In [74]:
for shop in shops_list:
    df = pd.read_csv("MERGE_TABLE_STORE_{}.csv".format(shop), sep=";")
    df_model = get_date_df(df, id_list)
    df_model = df_preprocessing(df_model)
    df_model = df_preprocess_windows(df_model)
    df_train = df_train.append(df_model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [75]:
df_train

Unnamed: 0_level_0,product_id,store_id,s_qty,flg_spromo,stock,data,day,weekday,month,year,deficit,s_qty win15,s_qty win15 m7,s_qty win15 p7,s_qty win7,s_qty win7 m7,s_qty win7 p7,real demand
curr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-04-15,147600,16500,0.0,0,0.0,2016-04-15,15,4,4,2016,1,0.500000,0.500000,0.0,0.0,1.0,0.0,
2016-04-16,147600,16500,,0,3.0,2016-04-16,16,5,4,2016,1,0.666667,0.666667,0.0,0.5,1.0,0.0,
2016-04-17,147600,16500,,0,3.0,2016-04-17,17,6,4,2016,1,0.666667,0.666667,0.0,0.5,1.0,0.0,
2016-04-18,147600,16500,,0,3.0,2016-04-18,18,0,4,2016,1,0.666667,0.666667,0.0,0.5,1.0,0.0,
2016-04-19,147600,16500,1.0,0,3.0,2016-04-19,19,1,4,2016,0,0.666667,0.666667,0.0,1.0,0.5,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-10-30,12906800,32700,,0,2.0,2019-10-30,30,2,10,2019,1,0.000000,0.000000,0.0,0.0,0.0,0.0,
2019-10-31,12906800,32700,,0,2.0,2019-10-31,31,3,10,2019,1,0.000000,0.000000,0.0,0.0,0.0,0.0,
2019-11-01,12906800,32700,,0,2.0,2019-11-01,1,4,11,2019,1,0.000000,0.000000,0.0,0.0,0.0,0.0,
2019-11-02,12906800,32700,,0,2.0,2019-11-02,2,5,11,2019,1,0.000000,0.000000,0.0,0.0,0.0,0.0,


# Отделим часть, которая подходит под обучение

In [76]:
df_train["real demand"] = np.where(df_train["s_qty"] < df_train["stock"], df_train["s_qty"], np.nan)

In [77]:
df_train_demand = df_train[df_train["real demand"].notna()]
df_train_demand

Unnamed: 0_level_0,product_id,store_id,s_qty,flg_spromo,stock,data,day,weekday,month,year,deficit,s_qty win15,s_qty win15 m7,s_qty win15 p7,s_qty win7,s_qty win7 m7,s_qty win7 p7,real demand
curr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-04-19,147600,16500,1.0,0,3.0,2016-04-19,19,1,4,2016,0,0.666667,0.666667,0.000000,1.0,0.5,0.0,1.0
2016-04-23,147600,16500,1.0,0,2.0,2016-04-23,23,5,4,2016,0,0.666667,0.500000,0.666667,1.0,0.0,0.5,1.0
2016-04-29,147600,16500,0.0,0,1.0,2016-04-29,29,4,4,2016,0,0.500000,0.500000,0.500000,0.0,0.0,1.0,0.0
2016-05-13,147600,16500,1.0,0,4.0,2016-05-13,13,4,5,2016,0,1.000000,1.000000,0.500000,1.0,1.0,0.0,1.0
2016-05-26,147600,16500,1.0,0,3.0,2016-05-26,26,3,5,2016,0,0.666667,0.666667,1.000000,1.0,0.5,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-04-24,12906800,32700,1.0,0,5.0,2019-04-24,24,2,4,2019,0,1.000000,1.000000,1.500000,1.0,1.0,2.0,1.0
2019-05-19,12906800,32700,1.0,0,4.0,2019-05-19,19,6,5,2019,0,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.0
2019-07-13,12906800,32700,1.0,0,3.0,2019-07-13,13,5,7,2019,0,1.000000,0.666667,1.000000,1.0,1.0,1.0,1.0
2019-07-25,12906800,32700,1.0,0,2.0,2019-07-25,25,3,7,2019,0,0.500000,0.500000,1.000000,0.5,0.0,1.0,1.0


# Создание модели

In [78]:
# Создание метрики
def my_smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

smape = make_scorer(my_smape, greater_is_better=False)

In [79]:
y = df_train_demand["real demand"]
X = df_train_demand.drop(columns=["real demand", "s_qty", "stock", "data"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=808)

In [80]:
X

Unnamed: 0_level_0,product_id,store_id,flg_spromo,day,weekday,month,year,deficit,s_qty win15,s_qty win15 m7,s_qty win15 p7,s_qty win7,s_qty win7 m7,s_qty win7 p7
curr_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2016-04-19,147600,16500,0,19,1,4,2016,0,0.666667,0.666667,0.000000,1.0,0.5,0.0
2016-04-23,147600,16500,0,23,5,4,2016,0,0.666667,0.500000,0.666667,1.0,0.0,0.5
2016-04-29,147600,16500,0,29,4,4,2016,0,0.500000,0.500000,0.500000,0.0,0.0,1.0
2016-05-13,147600,16500,0,13,4,5,2016,0,1.000000,1.000000,0.500000,1.0,1.0,0.0
2016-05-26,147600,16500,0,26,3,5,2016,0,0.666667,0.666667,1.000000,1.0,0.5,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-04-24,12906800,32700,0,24,2,4,2019,0,1.000000,1.000000,1.500000,1.0,1.0,2.0
2019-05-19,12906800,32700,0,19,6,5,2019,0,1.000000,1.000000,1.000000,1.0,1.0,1.0
2019-07-13,12906800,32700,0,13,5,7,2019,0,1.000000,0.666667,1.000000,1.0,1.0,1.0
2019-07-25,12906800,32700,0,25,3,7,2019,0,0.500000,0.500000,1.000000,0.5,0.0,1.0


In [81]:
numeric = [
    "s_qty win15",
    "s_qty win15 m7",
    "s_qty win15 p7",
    "s_qty win7",
    "s_qty win7 m7",
    "s_qty win7 p7",
]

categorical = [
    "product_id",
    "store_id",
    "day",
    "weekday",
    "month",
    "year",
    "deficit",
    "flg_spromo",
]

In [82]:
column_transformer = ColumnTransformer(
    [
        ("ohe", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("scaling", StandardScaler(), numeric),
    ]
)

In [83]:
def objective(trial):
    y = df_train_demand["real demand"]
    X = df_train_demand.drop(columns=["real demand", "s_qty", "stock", "data"])
    
    pipeline = Pipeline(
        steps=[
            ("ohe_and_scaling", column_transformer),
        ]
    )
    X = pipeline.fit_transform(X, y)
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=808
    )

    param = {
        "lambda": trial.suggest_loguniform("lambda", 1e-3, 10.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-3, 10.0),
        "colsample_bytree": trial.suggest_categorical(
            "colsample_bytree", [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        ),
        "subsample": trial.suggest_categorical(
            "subsample", [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
        ),
        "learning_rate": trial.suggest_categorical(
            "learning_rate", [0.17, 0.2, 0.22, 0.26, 0.28, 0.31]
        ),
        "n_estimators": 4000,
        "max_depth": trial.suggest_categorical("max_depth", [3, 4, 5, 6]),
        "random_state": trial.suggest_categorical(
            "random_state", [24, 808, 2020]
        ),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
    }
    model = xgb.XGBRegressor(**param)
    
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        early_stopping_rounds=100,
        verbose=False,
    )
    preds = model.predict(test_x)
    smape_error = my_smape(preds, test_y)

    return smape_error

In [84]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-06-05 03:04:25,127][0m A new study created in memory with name: no-name-f37a7112-0cf2-423a-ad16-15a865256540[0m
[32m[I 2021-06-05 03:04:27,480][0m Trial 0 finished with value: 48.612584882059075 and parameters: {'lambda': 0.03048758982817561, 'alpha': 0.0010217996528778704, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.22, 'max_depth': 4, 'random_state': 2020, 'min_child_weight': 290}. Best is trial 0 with value: 48.612584882059075.[0m
[32m[I 2021-06-05 03:04:28,613][0m Trial 1 finished with value: 48.35567101504945 and parameters: {'lambda': 5.073150512659224, 'alpha': 0.06422657753751622, 'colsample_bytree': 1.0, 'subsample': 1.0, 'learning_rate': 0.17, 'max_depth': 3, 'random_state': 2020, 'min_child_weight': 8}. Best is trial 1 with value: 48.35567101504945.[0m
[32m[I 2021-06-05 03:04:34,193][0m Trial 2 finished with value: 49.1498476706082 and parameters: {'lambda': 0.010695016270567466, 'alpha': 0.001244637315977206, 'colsample_bytree': 0.7,

[32m[I 2021-06-05 03:05:07,812][0m Trial 24 finished with value: 47.57483281011512 and parameters: {'lambda': 8.06579268300187, 'alpha': 0.17226613971730526, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.17, 'max_depth': 6, 'random_state': 808, 'min_child_weight': 103}. Best is trial 19 with value: 47.38970551998962.[0m
[32m[I 2021-06-05 03:05:09,304][0m Trial 25 finished with value: 48.07800530126391 and parameters: {'lambda': 0.2686917545505392, 'alpha': 3.722872329605207, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.17, 'max_depth': 6, 'random_state': 808, 'min_child_weight': 136}. Best is trial 19 with value: 47.38970551998962.[0m
[32m[I 2021-06-05 03:05:10,576][0m Trial 26 finished with value: 47.6700605224386 and parameters: {'lambda': 0.06825377102303963, 'alpha': 0.8133432722705461, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.2, 'max_depth': 6, 'random_state': 808, 'min_child_weight': 67}. Best is trial 19 with value: 4

[32m[I 2021-06-05 03:05:42,044][0m Trial 49 finished with value: 49.62106333429482 and parameters: {'lambda': 1.663908861056338, 'alpha': 0.01331668321690509, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.28, 'max_depth': 6, 'random_state': 24, 'min_child_weight': 53}. Best is trial 43 with value: 47.21906507974068.[0m


Number of finished trials: 50
Best trial: {'lambda': 3.874667405571208, 'alpha': 0.02702827007724924, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.2, 'max_depth': 6, 'random_state': 24, 'min_child_weight': 19}


In [85]:
study.best_trial.params

{'lambda': 3.874667405571208,
 'alpha': 0.02702827007724924,
 'colsample_bytree': 0.9,
 'subsample': 0.7,
 'learning_rate': 0.2,
 'max_depth': 6,
 'random_state': 24,
 'min_child_weight': 19}

In [86]:
xgb_cool = xgb.XGBRegressor(**study.best_trial.params)

In [87]:
pipeline = Pipeline(
        steps=[
            ("ohe_and_scaling", column_transformer),
            ("xgb", xgb_cool)
        ]
    )

In [88]:
model = pipeline
print((-cross_val_score(model, X, y, cv=5, scoring=smape)).sum() / 5)

48.23109396774855


In [89]:
linreg = Ridge()

In [90]:
pipeline = Pipeline(
        steps=[
            ("ohe_and_scaling", column_transformer),
            ("linreg", linreg)
        ]
    )

In [91]:
model = pipeline
print((-cross_val_score(model, X, y, cv=5, scoring=smape)).sum() / 5)

52.68972159548291


# Сравним результаты с window и promo

In [49]:
shop_number = 4600

In [50]:
df_model = pd.read_csv("MERGE_TABLE_STORE_{}.csv".format(shop_number), sep=";")
df_model = df_model[df_model.product_id.isin(id_list)]

In [51]:
df_model.loc[:, "curr_date"] = pd.to_datetime(df_model.curr_date)

In [52]:
df_model["lambda"] = np.nan

In [53]:
df_new = pd.DataFrame(columns = list(df_model.columns))

In [54]:
for product in np.unique(X.product_id):
    df = df_model.loc[(df_model["product_id"] == product) & (df_model["store_id"] == shop_number)]
    df = dem.restore_demand(df, product, store_id=shop_number, method="promo")
    df_new = df_new.append(df)

In [55]:
df_model = df_new

In [56]:
df_model["real demand"] = np.where(df_model["s_qty"] < df_model["stock"], df_model["s_qty"], np.nan)
df_model_demand = df_model[df_model["real demand"].notna()]

In [57]:
df_model_demand

Unnamed: 0,product_id,store_id,curr_date,s_qty,flg_spromo,stock,lambda,demand,real demand
2016-05-02,147600,4600,,1.0,0,3.0,0.928571,1.0,1.0
2016-06-05,147600,4600,,1.0,0,2.0,0.928571,1.0,1.0
2016-06-07,147600,4600,,0.0,0,1.0,0.928571,2.0,0.0
2016-07-27,147600,4600,,1.0,0,4.0,0.928571,2.0,1.0
2016-09-26,147600,4600,,2.0,0,3.0,0.928571,2.0,2.0
...,...,...,...,...,...,...,...,...,...
2019-10-09,15886000,4600,,1.0,0,16.0,1.530000,2.0,1.0
2019-10-10,15886000,4600,,1.0,0,15.0,1.530000,1.0,1.0
2019-10-13,15886000,4600,,1.0,0,14.0,1.530000,1.0,1.0
2019-10-14,15886000,4600,,1.0,0,13.0,1.530000,1.0,1.0


In [58]:
my_smape(df_model_demand["lambda"], df_model_demand["real demand"])

52.9211990867907

Среднее по промо и непромо дням:

In [59]:
shop_number = 4600

In [60]:
df_model = pd.read_csv("MERGE_TABLE_STORE_{}.csv".format(shop_number), sep=";")
df_model = df_model[df_model.product_id.isin(id_list)]

In [61]:
df_model.loc[:, "curr_date"] = pd.to_datetime(df_model.curr_date)

In [62]:
df_model["lambda"] = np.nan
df_new = pd.DataFrame(columns = list(df_model.columns))

In [63]:
for product in np.unique(X.product_id):
    df = df_model.loc[(df_model["product_id"] == product) & (df_model["store_id"] == shop_number)]
    df = dem.restore_demand(df, product, store_id=shop_number, method="window")
    df_new = df_new.append(df)

In [64]:
df_model = df_new
df_model["real demand"] = np.where(df_model["s_qty"] < df_model["stock"], df_model["s_qty"], np.nan)
df_model_demand = df_model[df_model["real demand"].notna()]

In [65]:
df_model_demand

Unnamed: 0,product_id,store_id,curr_date,s_qty,flg_spromo,stock,lambda,real demand
2016-05-02,147600,4600,,1.0,0,3.0,0.0,1.0
2016-06-05,147600,4600,,1.0,0,2.0,0.0,1.0
2016-06-07,147600,4600,,0.0,0,1.0,0.0,0.0
2016-07-27,147600,4600,,1.0,0,4.0,0.0,1.0
2016-09-26,147600,4600,,2.0,0,3.0,0.0,2.0
...,...,...,...,...,...,...,...,...
2019-10-09,15886000,4600,,1.0,0,16.0,1.0,1.0
2019-10-10,15886000,4600,,1.0,0,15.0,1.0,1.0
2019-10-13,15886000,4600,,1.0,0,14.0,1.0,1.0
2019-10-14,15886000,4600,,1.0,0,13.0,1.0,1.0


In [66]:
my_smape(df_model_demand["lambda"], df_model_demand["real demand"])

53.28514953848695