# Импорты библиотек

In [1]:
import numpy as np
import pandas as pd
import gradio as gr
import plotly.graph_objects as go
from scipy import stats

# Чтение данных

In [2]:
# Location of the raw marketplace export, relative to the notebook.
df_path = "./marketplace.csv"

# NOTE(review): the rendered preview shows an "Unnamed: 0" column, which looks
# like a previously saved index — consider whether it should be read as the index.
df = pd.read_csv(filepath_or_buffer=df_path)

# Preview the first five rows.
df.head(5)

Unnamed: 0,user_id,platform_num,first_login,reg_dt,browser,first_buy,target,total_buy,total_return
0,user_000000,12,2025-03-22,2024-08-17,browser_00,2025-03-22,0.0,85.329559,0.0
1,user_000001,1,2025-05-05,2025-05-15,browser_01,2025-05-20,0.0,21.819124,0.0
2,user_000002,5,2025-01-19,2025-01-23,browser_02,2025-02-20,0.3,194.61298,485.0
3,user_000003,3,2025-03-15,2023-09-05,browser_02,,0.0,138.780814,0.0
4,user_000004,1,2025-05-05,2025-05-14,browser_00,2025-05-11,0.0,16.334507,0.0


# Чистка и подготовка данных

### Подготовка данных для гипотезы 1

In [3]:
# Data for hypothesis #1: work on a copy so the raw frame stays untouched.
thesis_1 = df.copy()

# Rows where the returned amount exceeds the purchased amount are removed.
buys_lt_return = thesis_1["total_buy"] < thesis_1["total_return"]
thesis_1 = thesis_1.drop(thesis_1[buys_lt_return].index)

# target is expected to follow 0.3 - (total_return / total_buy). With returns
# bounded by purchases it can only lie in [-0.7, 0.3]; anything outside that
# range (NaN included) is dropped.
lower_target = -0.7
upper_target = 0.3

incorrect_target_values = thesis_1[
    ~thesis_1["target"].between(lower_target, upper_target)
]
thesis_1 = thesis_1.drop(incorrect_target_values.index)

# Target recomputed from the current totals; 0 is used as a placeholder when
# there are no purchases and the ratio is undefined.
thesis_1["current_target"] = np.where(
    thesis_1["total_buy"] != 0,
    0.3 - (thesis_1["total_return"] / thesis_1["total_buy"]),
    0,
)

# Whether the user actually returned anything. This is derived from the raw
# amounts rather than from `current_target < 0.3`: the placeholder 0 assigned
# to users without purchases would otherwise mark them as "returners".
thesis_1["was_return"] = (thesis_1["total_return"] > 0) & (thesis_1["total_buy"] > 0)

# Keep only users with returns. The explicit copy makes the later column
# assignment operate on an independent frame (no SettingWithCopyWarning).
thesis_1 = thesis_1.loc[
    thesis_1["was_return"],
    ["current_target", "target", "was_return"],
].copy()

# The split point is the 75th percentile of current_target.
threshold = thesis_1["current_target"].quantile(0.75)

# The higher target / current_target is, the smaller the share of returns.
thesis_1["curr_return_group"] = np.where(
    thesis_1["current_target"] > threshold,
    "high_curr_target_return",
    "low_curr_target_return",
)

high_curr_target_return = thesis_1.loc[
    thesis_1["curr_return_group"] == "high_curr_target_return",
    "target",
]

low_curr_target_return = thesis_1.loc[
    thesis_1["curr_return_group"] == "low_curr_target_return",
    "target",
]

# One-sided Welch t-test: is the mean target of the "low" group smaller than
# the mean target of the "high" group?
t_stat_1, p_value_1 = stats.ttest_ind(
    low_curr_target_return,
    high_curr_target_return,
    equal_var=False,
    alternative="less",
)


In [4]:
def prepare_thesis1_data(data=None, round_at=4, z_score=1.96):
    """Build per-group summary statistics of ``target`` with a confidence interval.

    Args:
        data: DataFrame with ``curr_return_group`` and ``target`` columns.
            When omitted, the notebook-level ``thesis_1`` frame is used.
        round_at: Number of decimals kept in the returned numeric columns.
        z_score: Normal quantile used for the interval half-width
            (the default 1.96 corresponds to a 95% interval).

    Returns:
        DataFrame with one row per group and the columns
        ``curr_return_group``, ``mean``, ``std``, ``count``,
        ``ci_lower``, ``ci_upper``.
    """
    if data is None:
        data = thesis_1

    group_stats = (
        data
        .groupby("curr_return_group")["target"]
        .agg(["mean", "std", "count"])
        .reset_index()
    )

    # The interval is computed from the unrounded mean/std; rounding happens
    # once at the very end so no rounding error leaks into the bounds.
    margin = z_score * group_stats["std"] / np.sqrt(group_stats["count"])
    group_stats["ci_lower"] = group_stats["mean"] - margin
    group_stats["ci_upper"] = group_stats["mean"] + margin

    return group_stats.round(round_at)

In [5]:
def build_thesis_1():
    """Render the hypothesis 1 view: summary text, bar chart and statistics table.

    Creates three Gradio components (Markdown, BarPlot, DataFrame) and returns
    nothing. Reads the notebook-level ``t_stat_1`` and ``p_value_1`` and the
    group statistics from ``prepare_thesis1_data()``.
    """
    alpha = 0.05  # significance level used for the verdict text
    plot_data = prepare_thesis1_data()

    # The verdict is derived from the p-value instead of being hard-coded, so
    # the text cannot contradict the numbers when the data changes.
    if p_value_1 < alpha:
        verdict = "Различие статистически значимо"
    else:
        verdict = "Различие статистически не значимо"

    gr.Markdown(f"""
    ## Средние значения target по группам в зависимости от текущего отношения возвратов к покупкам.

    ### Результат t-статистики: {t_stat_1:.2f}
    ### {verdict} (p = {p_value_1:.2e})
    """)

    gr.BarPlot(
        plot_data,
        x="curr_return_group",
        y="mean",
        y_title="Среднее значение target",
        x_title="Группа",
        tooltip=["curr_return_group", "mean", "ci_lower", "ci_upper"],
        color="curr_return_group",
    )

    # Human-readable headers for the table. A renamed copy is used so the
    # frame already handed to the bar plot is not mutated afterwards.
    column_titles = {
        "curr_return_group": "Группа",
        "mean": "Среднее",
        "std": "Станд. отклонение",
        "count": "Количество",
        "ci_lower": "CI нижний",
        "ci_upper": "CI верхний",
    }
    table_data = plot_data.rename(columns=column_titles)

    gr.DataFrame(value=table_data[list(column_titles.values())])

### Подготовка данных для гипотезы 2

In [6]:
# Data for hypothesis #2: work on a copy so the raw frame stays untouched.
thesis_2 = df.copy()

thesis_2["first_login"] = pd.to_datetime(thesis_2["first_login"])
thesis_2["reg_dt"] = pd.to_datetime(thesis_2["reg_dt"])
thesis_2["first_buy"] = pd.to_datetime(thesis_2["first_buy"])

# Every row starts as valid and is flagged below if its dates are inconsistent.
thesis_2["date_correct?"] = "ок"

# Inconsistency checks: purchases without a purchase date, and any violation
# of the expected order registration -> first login -> first purchase.
purchase_no_date = (thesis_2["total_buy"] > 0) & thesis_2["first_buy"].isna()
reg_after_login = thesis_2["reg_dt"] > thesis_2["first_login"]
login_after_buy = thesis_2["first_login"] > thesis_2["first_buy"]
reg_after_buy = thesis_2["reg_dt"] > thesis_2["first_buy"]

invalid_condition = purchase_no_date | reg_after_login | login_after_buy | reg_after_buy

thesis_2.loc[invalid_condition, "date_correct?"] = "исключаем"

# The explicit copy makes the column assignment below operate on an
# independent frame (no SettingWithCopyWarning).
thesis_2 = thesis_2[thesis_2["date_correct?"] == "ок"].copy()
thesis_2["days_to_first_buy"] = (thesis_2["first_buy"] - thesis_2["reg_dt"]).dt.days

# Rows without a first purchase have NaN days and fall into neither group.
group_first_week = thesis_2[thesis_2["days_to_first_buy"] <= 7]
group_after_first_week = thesis_2[thesis_2["days_to_first_buy"] > 7]

# Missing targets are removed up front: ttest_ind propagates NaN by default,
# which would turn both the statistic and the p-value into NaN.
first_week_target = group_first_week["target"].dropna()
after_first_week_target = group_after_first_week["target"].dropna()

# One-sided Welch t-test: is the mean target of late buyers smaller than the
# mean target of users who bought within the first week?
t_stat_2, p_value_2 = stats.ttest_ind(
    after_first_week_target,
    first_week_target,
    equal_var=False,
    alternative="less",
)

In [7]:
def build_thesis_2(seed=42):
    """Build the Plotly figure comparing ``target`` between the two timing groups.

    Each observation from the notebook-level ``first_week_target`` and
    ``after_first_week_target`` series is drawn as a marker, horizontally
    jittered around x=0 and x=1 respectively; the group means are overlaid
    as diamond markers.

    Args:
        seed: Seed for the jitter generator. A fixed seed keeps the figure
            identical between runs; pass ``None`` for fresh random jitter.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.
    """
    rng = np.random.default_rng(seed)
    jitter_sd = 0.05  # horizontal spread of the markers around the group position

    # (x position, legend label, values, marker fill, marker outline)
    groups = [
        (0, "Первая неделя", first_week_target, "lightblue", "navy"),
        (1, "После недели", after_first_week_target, "lightgreen", "darkgreen"),
    ]

    fig = go.Figure()

    for position, label, values, fill_color, edge_color in groups:
        fig.add_trace(go.Scatter(
            x=rng.normal(position, jitter_sd, len(values)),
            y=values,
            mode="markers",
            name=label,
            marker=dict(color=fill_color, size=10, line=dict(color=edge_color, width=1)),
            hovertemplate="Target: %{y:.2f}<extra></extra>",
            opacity=0.7
        ))

    # Group means, placed exactly on the group positions.
    fig.add_trace(go.Scatter(
        x=[0, 1],
        y=[first_week_target.mean(), after_first_week_target.mean()],
        mode="markers",
        name="Среднее",
        marker=dict(color=["red", "darkred"], size=15, symbol="diamond")
    ))

    fig.update_layout(
        title="Распределение target по периодам",
        xaxis=dict(
            tickvals=[0, 1],
            ticktext=["Первая неделя", "После недели"],
            title="Группа"
        ),
        yaxis_title="Значение target",
        width=None,
        height=None,
        autosize=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5
        )
    )

    return fig

In [8]:
def thesis_2_markdown(t_stat=None, p_value=None, alpha=0.05):
    """Return the Markdown caption for the hypothesis 2 chart.

    Args:
        t_stat: t-statistic to report; defaults to the notebook-level ``t_stat_2``.
        p_value: p-value to report; defaults to the notebook-level ``p_value_2``.
        alpha: Significance level used to choose the verdict text.

    Returns:
        Markdown text with the heading, the t-statistic and a verdict that
        follows from comparing ``p_value`` with ``alpha``.
    """
    if t_stat is None:
        t_stat = t_stat_2
    if p_value is None:
        p_value = p_value_2

    # The verdict is derived from the p-value instead of being hard-coded, so
    # the text cannot contradict the numbers when the data changes.
    if p_value < alpha:
        verdict = "Различие статистически значимо"
    else:
        verdict = "Различие статистически не значительно"

    # Very small p-values would print as "0.00" in fixed notation.
    p_text = f"{p_value:.2e}" if p_value < 0.01 else f"{p_value:.2f}"

    return f"""
    ## Средние значения target по группам в зависимости от интервала между датой первой покупки и регистрации.

    ### Результат t-статистики: {t_stat:.2f}
    ### {verdict} (p = {p_text})
    """

### Подготовка данных для гипотезы 3

In [9]:
# Data for hypothesis #3: work on a copy so the raw frame stays untouched.
thesis_3 = df.copy()

# Outlier cut-off: only rows strictly below the 99th percentile of
# platform_num are kept.
# NOTE(review): with a discrete column the strict "<" also removes every row
# equal to the percentile value — confirm this is intended.
quant_platform_num = thesis_3["platform_num"].quantile(0.99)

# The explicit copy makes the column assignment below operate on an
# independent frame (no SettingWithCopyWarning).
thesis_3 = thesis_3[thesis_3["platform_num"] < quant_platform_num].copy()

# Split into groups by platform_num: [1, 8) -> "low", [8, inf) -> "high".
# Values below 1 fall outside the bins and get no group.
platform_bins = [1, 8, np.inf]

platform_labels = ["low", "high"]

thesis_3["platform_num_group"] = pd.cut(
    thesis_3["platform_num"],
    bins=platform_bins,
    labels=platform_labels,
    right=False,
)

# Missing targets are removed up front: kruskal propagates NaN by default,
# which would turn both the statistic and the p-value into NaN.
low_platform_group = thesis_3.loc[
    thesis_3["platform_num_group"] == "low",
    "target",
].dropna()

high_platform_group = thesis_3.loc[
    thesis_3["platform_num_group"] == "high",
    "target",
].dropna()

# Kruskal-Wallis H-test comparing the target distributions of the two groups.
h_statistic_t3_kruskal, p_value_t3_kruskal = stats.kruskal(low_platform_group, high_platform_group)


In [10]:
def build_thesis_3():
    """Render the hypothesis 3 view: summary text and a bar chart of mean target.

    Creates two Gradio components (Markdown, BarPlot) and returns nothing.
    Reads the notebook-level ``h_statistic_t3_kruskal``, ``p_value_t3_kruskal``
    and ``thesis_3``; the chart aggregates ``target`` by ``platform_num_group``
    using the mean.
    """
    alpha = 0.05  # significance level used for the verdict text

    # The verdict is derived from the p-value instead of being hard-coded, so
    # the text cannot contradict the numbers when the data changes.
    if p_value_t3_kruskal < alpha:
        verdict = "Различие статистически значимо"
    else:
        verdict = "Различие статистически не значимо"

    gr.Markdown(f"""
    ## Среднее значение target по группам в зависимости от количества платформ.

    ### Результат h-статистики: {h_statistic_t3_kruskal:.2f}
    ### {verdict} (p = {p_value_t3_kruskal:.2e})
    """)

    gr.BarPlot(
        thesis_3,
        x="platform_num_group",
        y="target",
        y_aggregate="mean",
        x_title="Количество платформ входа",
        y_title="Среднее значение target",
        color="platform_num_group",
    )

# Построение дашборда

In [11]:
def _render_thesis_2_tab():
    """Place the hypothesis 2 caption and its plot into the current tab."""
    gr.Markdown(thesis_2_markdown())
    # The builder itself is passed (not its result), exactly as before.
    gr.Plot(build_thesis_2)


# Tab title -> callable that creates the tab's components, in display order.
_TAB_RENDERERS = (
    ("Гипотеза 1", build_thesis_1),
    ("Гипотеза 2", _render_thesis_2_tab),
    ("Гипотеза 3", build_thesis_3),
)

with gr.Blocks() as demo:
    gr.Markdown("# Анализ target по группам")

    for tab_title, render_tab in _TAB_RENDERERS:
        with gr.Tab(tab_title):
            render_tab()


# Start the local Gradio server for the assembled dashboard.
demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


