In [33]:
import scipy.stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import poisson
from scipy.stats import ttest_ind

In [34]:
df = pd.read_csv("MERGE_TABLE_STORE_4600.csv", sep=";")

### Пропуски в flag_promo

In [35]:
df.flg_spromo.isnull().sum()

0

In [36]:
df.flg_spromo.value_counts()

0    13594213
1       23101
Name: flg_spromo, dtype: int64

### Получим список товаров, по которым больше всего данных:

In [37]:
id_list = df.product_id.value_counts().nlargest(40).index.tolist()

In [38]:
id_list

[555800,
 616400,
 564900,
 582700,
 404500,
 589400,
 582800,
 1518900,
 835000,
 587400,
 617400,
 819800,
 1843100,
 631500,
 7562300,
 11637400,
 3539700,
 3540400,
 12906800,
 4095600,
 886100,
 4212800,
 706600,
 9339400,
 589700,
 625700,
 560100,
 559800,
 490400,
 1617800,
 744200,
 720500,
 4285500,
 615200,
 1453400,
 4043300,
 571300,
 808700,
 101300,
 6783400]

### Выберем только нужные товары и приводим даты к верному формату:

In [39]:
df_model = df[df.product_id.isin(id_list)]

Смотрим на тот же флаг промо

In [40]:
df_model.flg_spromo.value_counts()

0    54421
1     1891
Name: flg_spromo, dtype: int64

In [41]:
df_model.loc[:, "curr_date"] = pd.to_datetime(df_model.curr_date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


### Заполним пропущенные даты в датасете для товара/магазина:

In [42]:
def add_missing_dates(model_df, product_id, store_id):
    # Data as index and fill missing stock data
    new_df = model_df[
        (model_df["product_id"] == product_id) & (model_df["store_id"] == store_id)
    ].set_index("curr_date")[["stock"]]
    new_df = new_df.reindex(pd.date_range(np.min(new_df.index), np.max(new_df.index))).fillna(
        method="ffill"
    )

    # Add sales data to df with stock data
    _df_day_sales = model_df[(model_df["product_id"] == product_id)].set_index("curr_date")
    new_df = _df_day_sales[["product_id", "store_id", "flg_spromo", "s_qty"]].merge(
        new_df, how="right", left_index=True, right_index=True
    )

    return new_df

### Посчитаем спрос 

In [43]:
def calculate_demand(df, sku, store=4600):
    max_sales = df[(df["product_id"] == sku) & (df["store_id"] == store)].s_qty.max()
    sales_oracle_day = df[(df["product_id"] == sku) & (df["store_id"] == store)]["s_qty"]
    lambda_value = df[((df["product_id"] == sku) & (df["store_id"] == store))]["lambda"]
    df.loc[((df["product_id"] == sku) & (df["store_id"] == store)), "demand"] = np.fmin(
        np.full((1, len(lambda_value)), max_sales),
        np.fmax(np.random.poisson(lambda_value, size=len(lambda_value)), sales_oracle_day),
    ).tolist()[0]
    return df


def ttest_promo(df_promo, df_nopromo):
    # Check data for correctness
    if len(df_promo) > 0 and len(df_nopromo) > 0:
        _, p_value = ttest_ind(np.array(df_nopromo["s_qty"]), np.array(df_promo["s_qty"]))
        if p_value < 0.05:
            flag = True
            decision = "H1: different averages"
        else:
            flag = False
            decision = "H0: same averages"

    # Report if test is not available
    else:
        flag = False
        decision = "Not enough data"

    return flag, decision


def calculate_lambda_promo(df, product_id, teta=1, enable_test=True):
    # Choose data without promo for a given product
    df_nopromo = df.loc[(df["product_id"] == product_id) & (df["flg_spromo"] == 0)]
    # Leave only correct data
    df_nopromo = df_nopromo.loc[
        (df_nopromo["stock"] > 0) & (df_nopromo["s_qty"] <= df_nopromo["stock"])
    ]
    # Count days, where sell all or part of product amount
    sales_part = len(df_nopromo.loc[df_nopromo["s_qty"] < df_nopromo["stock"]])
    sales_all = len(df_nopromo.loc[df_nopromo["s_qty"] == df_nopromo["stock"]])

    # Count lambda for poisson distribution for days without promo
    lambda_nopromo = df_nopromo["s_qty"].sum() / (sales_part + sales_all * teta)

    # Choose data with promo for a given product
    df_promo = df.loc[(df["product_id"] == product_id) & (df["flg_spromo"] == 1)]
    # Leave only correct data
    df_promo = df_promo.loc[(df_promo["stock"] > 0) & (df_promo["s_qty"] <= df_promo["stock"])]
    # Count days, where sell all or part of product amount
    sales_part = len(df_promo.loc[df_promo["s_qty"] < df_promo["stock"]])
    sales_all = len(df_promo.loc[df_promo["s_qty"] == df_promo["stock"]])

    # If required, conduct t_test and make a decision
    recount, decision = False, None
    if enable_test == True:
        recount, decision = ttest_promo(df_promo, df_nopromo)

    if enable_test == False or recount == True:
        # Count lambda for poisson distribution for days with promo
        if (sales_part + sales_all * teta) == 0:
            lambda_promo = 0
        else:
            lambda_promo = df_promo["s_qty"].sum() / (sales_part + sales_all * teta)
    else:
        lambda_promo = lambda_nopromo

    return lambda_nopromo, lambda_promo, decision

In [44]:
def add_lambda_window(df, product_id, store_id=4600, window=30, min_periods=7):
    model_df = df
    df = df.loc[
        (df["product_id"] == product_id) & (df["store_id"] == store_id)
    ]
    df.loc[:, "lambda"] = (
        df["s_qty"]
        .rolling(center=True, window=window, min_periods=min_periods)
        .apply(np.nanmean)
    )
    df["lambda"].fillna(method="ffill", inplace=True)
    df["lambda"].fillna(method="bfill", inplace=True)
    model_df.loc[
        (
            (model_df["product_id"] == product_id)
            & (model_df["store_id"] == store_id)
        ),
        "lambda",
    ] = df
    return model_df

In [45]:
def add_lambda_ordinary(df, product_id, store_id, lambda_nopromo):
    df.loc[
        (df["product_id"] == product_id) & (df["store_id"] == store_id), ["lambda"]
    ] = lambda_nopromo
    return df


def add_lambda_promo(df, product_id, store_id, lambda_promo):
    df.loc[
        (
            (df["product_id"] == product_id)
            & (df["store_id"] == store_id)
            & (df["flg_spromo"] == 1)
        ),
        ["lambda"],
    ] = lambda_promo
    return df

### Tests:

In [46]:
import ipywidgets as widgets
from IPython.display import display, clear_output

In [47]:
_sku_list = df_model.product_id.unique().tolist()
_store_list = df.store_id.unique().tolist()

dropdown_store = widgets.Dropdown(
    options=_store_list,
    value=_store_list[0],
    description='Store ID:',
    disabled=False,
)

dropdown_sku = widgets.Dropdown(
    options=_sku_list,
    value=582700,
    description='SKU ID:',
    disabled=False,
)

_dates = pd.to_datetime(df_model.curr_date.unique())
_options = [(d.strftime('%d.%m.%Y'), d) for d in _dates if d.weekday() == 0]
range_date = widgets.SelectionRangeSlider(
    options=_options,
    index=(0,len(_options)-1),
    description='Date',
    disabled=False,
    layout=widgets.Layout(width='auto')
)

button_plot = widgets.Button(
    description='Plot',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click me',
    icon='check'
)

def plot(b):
    temp = add_missing_dates(df_model, dropdown_sku.value, dropdown_store.value)
    lambda_nopromo, lambda_promo, _ = calculate_lambda_promo(temp, dropdown_sku.value, teta=1)

    # Добавим обычную лямбду для всех дней и посчитаем спрос
    temp = add_lambda_ordinary(temp, dropdown_sku.value, dropdown_store.value, lambda_nopromo)
    temp_demand = calculate_demand(temp, dropdown_sku.value)

    # Добавим лямбду для промо дней, посчитаем спрос
    temp = add_lambda_promo(temp, dropdown_sku.value, dropdown_store.value, lambda_promo)
    temp2 = calculate_demand(temp, dropdown_sku.value)
    
    # Рассчет
    
    # Спрос
    y1 = temp2[(range_date.value[0] <= temp2.index) & (temp2.index < range_date.value[1])].demand.values
    x1 = temp2[(range_date.value[0] <= temp2.index) & (temp2.index < range_date.value[1])].index.values

    # Реальные продажи
    y2_ = temp2[(range_date.value[0] <= temp2.index) & (temp2.index < range_date.value[1])].s_qty.values
    x2_ = temp2[(range_date.value[0] <= temp2.index) & (temp2.index < range_date.value[1])].index.values
    
    temp2.s_qty.fillna(method='ffill', inplace=True)
    
    # Реальные продажи
    y2 = temp2[(range_date.value[0] <= temp2.index) & (temp2.index < range_date.value[1])].s_qty.values
    x2 = temp2[(range_date.value[0] <= temp2.index) & (temp2.index < range_date.value[1])].index.values
    
    y3 = temp2[
        (range_date.value[0] <= temp2.index) & (temp2.index < range_date.value[1])
        & ((temp2.s_qty == temp2.stock) | (0 >= temp2.stock))
    ].s_qty.values
    x3 = temp2[
        (range_date.value[0] <= temp2.index) & (temp2.index < range_date.value[1])
        & ((temp2.s_qty == temp2.stock) | (0 >= temp2.stock))
    ].index.values

    x6 = temp_demand[
        (temp_demand.index >= range_date.value[0])
        & (temp_demand.index < range_date.value[1])
        & (temp_demand["flg_spromo"] == 1)
    ].index.values
    y6 = [0] * len(x6)
    
    # Вывод
    clear_output()
    display(dropdown_store, dropdown_sku, range_date, button_plot)
    
    plt.figure(figsize=(15, 7))

    plt.plot(x1, y1, label="Восстановленный спрос", c="black", alpha=0.25)
    
    plt.plot(x2, y2, label="Реальные продажи", c="blue", alpha=0.5)
    plt.scatter(x2_, y2_, marker='x', c="blue")
    
    for i in range( len(x3) ):
        if i and (x3[i] - x3[i-1]) / np.timedelta64(1, 'D') == 1:
            plt.axvspan(x3[i-1], x3[i], facecolor='r', alpha=0.25)
        elif i == len(x3) - 1 and not (x3[i] - x3[i-1]) / np.timedelta64(1, 'D') == 1:
            plt.axvspan(x3[i], x3[i]+np.timedelta64(1, 'D'), facecolor='r', alpha=0.25)
    
    for i in range( len(x6) ):
        if i and (x6[i] - x6[i-1]) / np.timedelta64(1, 'D') == 1:
            plt.axvspan(x6[i-1], x6[i], facecolor='g', alpha=0.25)
        elif i == len(x6) - 1 and not (x6[i] - x6[i-1]) / np.timedelta64(1, 'D') == 1:
            plt.axvspan(x6[i], x6[i]+np.timedelta64(1, 'D'), facecolor='g', alpha=0.25)

    plt.ylabel("Amount of desired products", fontsize="12")
    plt.xlabel("Dates", fontsize="12")
    plt.title(f"Demand and real sales [{range_date.value[0].strftime('%d.%m.%Y')}-{range_date.value[1].strftime('%d.%m.%Y')}]", fontsize="16")
    plt.legend()
    plt.show()

button_plot.on_click(plot)

In [48]:
display(dropdown_store, dropdown_sku, range_date, button_plot)

Dropdown(description='Store ID:', options=(4600,), value=4600)

Dropdown(description='SKU ID:', index=8, options=(101300, 404500, 490400, 555800, 559800, 560100, 564900, 5713…

SelectionRangeSlider(description='Date', index=(0, 208), layout=Layout(width='auto'), options=(('04.01.2016', …

Button(description='Plot', icon='check', style=ButtonStyle(), tooltip='Click me')