### Задача 1. Оценка эксперимента с CUPED
  
Оцените эксперимент «Sending email (correct link)» с использованием CUPED. В качестве ковариаты используйте 
выручку пользователей за 4 недели до эксперимента.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import datetime, timedelta

In [2]:
# Load the raw sales log and the user-to-group assignment, in that order.
sales_df, users_group = (
    pd.read_csv(csv_path)
    for csv_path in ("2022-05-03T12_df_sales.csv", "experiment_users.csv")
)

In [3]:
# Preview the raw sales table (one row per sale).
sales_df

Unnamed: 0,sale_id,date,count_pizza,count_drink,price,user_id
0,1000001,2022-02-04 10:00:24,1,0,720,1c1543
1,1000002,2022-02-04 10:02:28,1,1,930,a9a6e8
2,1000003,2022-02-04 10:02:35,3,1,1980,23420a
3,1000004,2022-02-04 10:03:06,1,1,750,3e8ed5
4,1000005,2022-02-04 10:03:23,1,1,870,cbc468
...,...,...,...,...,...,...
337783,1337784,2022-05-03 11:59:32,1,0,600,1f79c8
337784,1337785,2022-05-03 11:59:39,2,0,1500,cbff74
337785,1337786,2022-05-03 11:59:46,2,1,1500,c1e77e
337786,1337787,2022-05-03 11:59:50,2,0,1500,ddef7a


In [4]:
# Parse the textual timestamps so the date-window comparisons below work.
sales_df = sales_df.assign(date=lambda frame: pd.to_datetime(frame["date"]))

In [5]:
# Print dtypes and non-null counts to check the result of the date conversion.
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337788 entries, 0 to 337787
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   sale_id      337788 non-null  int64         
 1   date         337788 non-null  datetime64[ns]
 2   count_pizza  337788 non-null  int64         
 3   count_drink  337788 non-null  int64         
 4   price        337788 non-null  int64         
 5   user_id      337788 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 15.5+ MB


In [6]:
# Experiment window; the filters below treat it as half-open:
# exp_begin_date <= date < exp_end_date.
exp_begin_date = datetime(year=2022, month=4, day=25)
exp_end_date = datetime(year=2022, month=5, day=2)
# The covariate window starts four weeks before the experiment begins.
covariate_date = exp_begin_date - timedelta(days=4 * 7)

In [7]:
# Per-user revenue during the experiment window
# (exp_begin_date inclusive, exp_end_date exclusive).
exp_sales = (
    sales_df.loc[
        (exp_begin_date <= sales_df["date"])
        & (sales_df["date"] < exp_end_date)
    ]
    .groupby("user_id")[["price"]]
    .agg("sum")
)

In [8]:
# Per-user revenue over the pre-experiment window
# (covariate_date inclusive, exp_begin_date exclusive); used as the CUPED
# covariate.
covariate_sales = (
    sales_df[
        (sales_df["date"] >= covariate_date)
        & (sales_df["date"] < exp_begin_date)
    ]
    .groupby("user_id")[["price"]]
    .sum()
    # Plain literal: the column name has no placeholders, so an f-string
    # prefix is unnecessary.
    .rename(columns={"price": "cov_4_weeks"})
)

In [9]:
# Index the assignment table by user so it can be joined on the index.
users_group = users_group.set_index("user_id")

In [10]:
# Show the first five rows of the user-to-group assignment.
users_group.head(n=5)

Unnamed: 0_level_0,pilot
user_id,Unnamed: 1_level_1
a9a6e8,0
23420a,0
cbc468,0
583c90,0
19ce47,0


In [11]:
# One row per experiment user: group flag, experiment revenue ("metric") and
# pre-experiment revenue. Users with no sales in a window get 0.
df_experiment = (
    users_group.join(exp_sales.rename(columns={"price": "metric"}), how="left")
    .join(covariate_sales, how="left")
    .fillna(0)
)

In [12]:
# Preview the assembled per-user experiment table.
df_experiment

Unnamed: 0_level_0,pilot,metric,cov_4_weeks
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a9a6e8,0,930.0,900.0
23420a,0,0.0,0.0
cbc468,0,0.0,0.0
583c90,0,2490.0,7350.0
19ce47,0,0.0,0.0
...,...,...,...
95b780,1,0.0,2220.0
e8287a,1,720.0,690.0
7f272d,1,0.0,840.0
3cd81f,1,840.0,600.0


In [14]:
# Pearson correlation between the metric and the covariate.
df_experiment[["metric", "cov_4_weeks"]].corr(method="pearson")

Unnamed: 0,metric,cov_4_weeks
metric,1.0,0.17654
cov_4_weeks,0.17654,1.0


In [15]:
def calculate_theta(y_control, y_pilot, x_control, x_pilot):
    """Estimate the CUPED coefficient theta on the pooled data of both groups.

    theta = cov(x, y) / var(x), computed over control and pilot together.

    y_control - metric values during the experiment, control group
    y_pilot - metric values during the experiment, pilot group
    x_control - covariate values, control group
    x_pilot - covariate values, pilot group

    return - theta; 0.0 when the pooled covariate is constant, because
        there is then nothing to adjust for.
    """
    y = np.hstack([y_control, y_pilot])
    x = np.hstack([x_control, x_pilot])
    # Take both moments from the same covariance matrix so they share one
    # normalisation. np.cov divides by N - 1 while ndarray.var() divides by N
    # by default; mixing the two inflates theta by N / (N - 1).
    cov_matrix = np.cov(x, y)
    variance = cov_matrix[0, 0]
    if variance == 0:
        # Constant covariate: avoid 0 / 0 = NaN and leave the metric as is.
        return 0.0
    return cov_matrix[0, 1] / variance


def check_cuped_test(df_control, df_pilot, covariate_column):
    """Test equality of group means on the CUPED-adjusted metric.

    Theta is obtained from calculate_theta using both groups, each group's
    "metric" column is reduced by theta times its covariate, and the adjusted
    samples are compared with scipy.stats.ttest_ind (default settings).

    df_control - control-group frame with "metric" and the covariate column
    df_pilot - pilot-group frame with "metric" and the covariate column
    covariate_column - name of the covariate column

    return - pvalue.
    """
    groups = (df_control, df_pilot)
    theta = calculate_theta(
        *(group["metric"] for group in groups),
        *(group[covariate_column] for group in groups),
    )
    adjusted_control, adjusted_pilot = (
        group["metric"] - theta * group[covariate_column] for group in groups
    )
    return stats.ttest_ind(adjusted_control, adjusted_pilot).pvalue

In [16]:
# Split users by the "pilot" flag: 0 is control, 1 is pilot.
df_control, df_pilot = (
    df_experiment[df_experiment["pilot"] == flag] for flag in (0, 1)
)

In [17]:
# Run the CUPED test with the 4-week pre-experiment revenue as the covariate.
pvalue = check_cuped_test(
    df_control=df_control,
    df_pilot=df_pilot,
    covariate_column="cov_4_weeks",
)
print(f"pvalue с CUPED {pvalue:0.4f}")

pvalue с CUPED 0.0539
