## Генерация искусственных данных

In [1]:
import pandas as pd
import numpy as np

### Генерация простого нормального распределения
Такие данные подходят только для обычного бутстрапа: в них нет предэкспериментальных значений метрики, которые нужны для CUPED.


In [2]:
def generate_simple_ab_data(num_samples=1000, control_mean=100, control_std=10, variant_mean=110, variant_std=10, seed=None):
    """
    Generates artificial A/B test data using two independent normal distributions.

    The control group receives ``num_samples // 2`` rows and the test group
    receives the remainder, so the test group is one row larger when
    ``num_samples`` is odd. Control rows come first in the result.

    Parameters:
    - num_samples (int): The total number of samples to generate.
    - control_mean (float): Mean of the control group's normal distribution.
    - control_std (float): Standard deviation of the control group's normal distribution.
    - variant_mean (float): Mean of the variant group's normal distribution.
    - variant_std (float): Standard deviation of the variant group's normal distribution.
    - seed (int | None): When given, values are drawn from a private generator
      seeded with this value, so the output is reproducible and NumPy's global
      random state is left untouched. When None (default), values are drawn
      from NumPy's global generator, exactly as before.

    Returns:
    - df (pd.DataFrame): A DataFrame with columns ["ab_variant", "user_id", "metric_value"].
      "ab_variant" is 'control' or 'test'; "user_id" runs from 1 to num_samples.
    """
    # Without a seed keep using the global generator so that callers who
    # already call np.random.seed(...) themselves get the same numbers as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Split samples into two groups (integer division avoids a float round-trip)
    control_samples = num_samples // 2
    variant_samples = num_samples - control_samples

    # Draw control first, then variant: the draw order determines the values
    # produced for a given random state.
    control_data = rng.normal(control_mean, control_std, control_samples)
    variant_data = rng.normal(variant_mean, variant_std, variant_samples)

    # Assemble the frame: labels, 1-based user ids and the concatenated metric
    df = pd.DataFrame({
        'ab_variant': ['control'] * control_samples + ['test'] * variant_samples,
        'user_id': list(range(1, num_samples + 1)),
        'metric_value': np.concatenate([control_data, variant_data]),
    })
    return df



In [3]:
# Example usage: 10,000 rows split between control and test, with a 3-unit
# difference in means (100 vs 103) and a standard deviation of 50 in both groups.
df = generate_simple_ab_data(num_samples=10000, control_mean=100, variant_mean=103, control_std=50, variant_std=50)

In [4]:
# Preview the first rows of the generated frame.
df.head()

Unnamed: 0,ab_variant,user_id,metric_value
0,control,1,73.494663
1,control,2,143.216464
2,control,3,41.753544
3,control,4,75.653054
4,control,5,94.726049


In [5]:
# Count rows per variant to check how the samples were split between groups.
df.ab_variant.value_counts()

control    5000
test       5000
Name: ab_variant, dtype: int64

In [6]:
# Write the frame to CSV without the index column.
# NOTE: the later sections of this notebook write to the same
# 'sample_data.csv' path, so running them overwrites this file.
df.to_csv('sample_data.csv', index = False)

### Генерация данных для CUPED
На этих данных можно сравнить обычный бутстрап и CUPED.

In [7]:
def generate_ab_data_for_cuped(num_users=10000, control_mean=100, control_std=50, effect_size=3, seed=42):
    """
    Generates artificial A/B test data together with matching pre-experiment data.

    Each user gets a pre-experiment metric drawn from a normal distribution.
    The in-experiment metric is that same value plus independent noise with
    standard deviation ``control_std / 5``; the test rows additionally get
    ``effect_size`` added. Every user_id appears twice in the experiment
    frame: once as 'control' and once as 'test', both derived from the same
    pre-experiment value.

    Parameters:
    - num_users (int): Number of users; the experiment frame has 2 * num_users rows.
    - control_mean (float): Mean of the pre-experiment metric.
    - control_std (float): Standard deviation of the pre-experiment metric.
    - effect_size (float): Constant added to the metric of the test rows.
    - seed (int | None): Seed of the private random generator. The default (42)
      reproduces the values this function has always produced; pass None for
      a different dataset on every call.

    Returns:
    - df_experiment (pd.DataFrame): Columns ["ab_variant", "user_id", "metric_value"],
      control rows first, then test rows.
    - df_pre_experiment (pd.DataFrame): Columns ["user_id", "metric_value"], one row per user.
    """
    # A private generator yields the same stream as np.random.seed(seed) would,
    # but does not reset the caller's global NumPy random state.
    rng = np.random.RandomState(seed)

    # Generate user_ids
    user_ids = np.arange(num_users)

    # Pre-experiment metric: one value per user
    pre_experiment_data = rng.normal(control_mean, control_std, num_users)

    df_pre_experiment = pd.DataFrame({
        'user_id': user_ids,
        'metric_value': pre_experiment_data
    })

    # In-experiment noise is much smaller than the between-user spread, so the
    # pre-experiment metric is strongly correlated with the experiment metric.
    # The test noise is drawn before the control noise; the order fixes the values.
    noise_std = control_std / 5
    noise_test = rng.normal(0, noise_std, num_users)
    noise_control = rng.normal(0, noise_std, num_users)

    # Control group: pre-experiment value plus noise only
    control_metric_values = pre_experiment_data + noise_control

    # Test group: pre-experiment value plus the treatment effect plus noise
    treatment_metric_values = pre_experiment_data + effect_size + noise_test

    # Stack control rows on top of test rows
    df_experiment = pd.DataFrame({
        'ab_variant': ['control'] * num_users + ['test'] * num_users,
        'user_id': list(user_ids) * 2,
        'metric_value': np.concatenate([control_metric_values, treatment_metric_values])
    })

    return df_experiment, df_pre_experiment


In [8]:
# Build the CUPED example: experiment frame plus the matching pre-experiment
# frame, with num_users left at the function's default.
df_experiment, df_pre_experiment = generate_ab_data_for_cuped(control_mean=100, 
                                                              control_std = 50, 
                                                              effect_size=3)

In [9]:
# Preview the first rows of the experiment frame.
df_experiment.head()

Unnamed: 0,ab_variant,user_id,metric_value
0,control,0,128.31857
1,control,1,95.920021
2,control,2,123.019228
3,control,3,181.947335
4,control,4,73.391505


In [10]:
# Preview the first rows of the pre-experiment frame.
df_pre_experiment.head()

Unnamed: 0,user_id,metric_value
0,0,124.835708
1,1,93.086785
2,2,132.384427
3,3,176.151493
4,4,88.292331


In [11]:
# Standard deviation of the metric in the experiment frame vs. the pre-experiment frame.
df_experiment.metric_value.std(), df_pre_experiment.metric_value.std()

(51.265110946493316, 50.173119025170166)

In [12]:
# Mean metric per variant; the gap between the groups should be close to effect_size.
df_experiment.groupby('ab_variant').metric_value.mean()

ab_variant
control     99.768573
test       103.028541
Name: metric_value, dtype: float64

In [13]:
# Write both frames to CSV without the index column.
# NOTE: the other sections of this notebook write to these same paths, so the
# files hold the output of whichever section was run last.
df_experiment.to_csv('sample_data.csv', index = False)
df_pre_experiment.to_csv('pre_exp_sample_data.csv', index = False)

### Генерация данных для всех методов
Генерирует данные, подходящие для всех методов.
Моделирует линейную зависимость метрики от признака `feature_1`.

In [14]:
def generate_ab_data_for_cupac(num_users=10000, control_mean=100, control_std=50, effect_size=3, seed=42):
    """
    Generates artificial A/B test data in which the metric depends linearly on a feature.

    A per-user base value is drawn from a normal distribution, together with a
    random integer slope ``coef_a`` and intercept ``coef_b`` (each from 0 to 9).
    In the pre-experiment frame ``feature_1`` is the base value and the metric
    is ``coef_a * feature_1 + coef_b`` plus noise. In the experiment frame
    ``feature_1`` is the base value plus fresh noise and the metric is exactly
    ``coef_a * feature_1 + coef_b``, with ``effect_size`` added for test rows.
    Every user_id appears twice in the experiment frame: once as 'control'
    and once as 'test'.

    Note that the slope may be drawn as 0, in which case the metric does not
    depend on the feature at all.

    Parameters:
    - num_users (int): Number of users; the experiment frame has 2 * num_users rows.
    - control_mean (float): Mean of the per-user base value.
    - control_std (float): Standard deviation of the base value and of every noise term.
    - effect_size (float): Constant added to the metric of the test rows.
    - seed (int | None): Seed of the private random generator. The default (42)
      reproduces the values this function has always produced; pass None for
      a different dataset on every call.

    Returns:
    - df_experiment (pd.DataFrame): Columns ["feature_1", "ab_variant", "user_id", "metric_value"],
      control rows first, then test rows.
    - df_pre_experiment (pd.DataFrame): Columns ["feature_1", "user_id", "metric_value"], one row per user.
    """
    # A private generator yields the same stream as np.random.seed(seed) would,
    # but does not reset the caller's global NumPy random state.
    rng = np.random.RandomState(seed)

    # Generate user_ids
    user_ids = np.arange(num_users)

    # Per-user base value of the feature
    pre_experiment_data = rng.normal(control_mean, control_std, num_users)

    # Linear model parameters and pre-experiment noise.
    # The draw order (slope, intercept, noise) fixes the values for a given seed.
    coef_a = rng.randint(10)
    coef_b = rng.randint(10)
    noise = rng.normal(0, control_std, num_users)

    df_pre_experiment = pd.DataFrame({
        'feature_1': pre_experiment_data,
        'user_id': user_ids,
        'metric_value': coef_a * pre_experiment_data + coef_b + noise
    })

    # In-experiment noise is applied to the feature itself; test is drawn before control.
    noise_test = rng.normal(0, control_std, num_users)
    noise_control = rng.normal(0, control_std, num_users)

    control_features = pre_experiment_data + noise_control
    treatment_features = pre_experiment_data + noise_test

    # Control group: the linear model applied to the noisy feature
    control_metric_values = coef_a * control_features + coef_b

    # Test group: the same linear model plus the treatment effect
    treatment_metric_values = coef_a * treatment_features + coef_b + effect_size

    # Stack control rows on top of test rows
    df_experiment = pd.DataFrame({
        'feature_1': np.concatenate([control_features, treatment_features]),
        'ab_variant': ['control'] * num_users + ['test'] * num_users,
        'user_id': list(user_ids) * 2,
        'metric_value': np.concatenate([control_metric_values, treatment_metric_values])
    })

    return df_experiment, df_pre_experiment


In [15]:
# Build the example with a linear feature/metric relationship; control_std is
# set to 10 here, and num_users is left at the function's default.
df_experiment, df_pre_experiment = generate_ab_data_for_cupac(control_mean=100, 
                                                              control_std = 10, 
                                                              effect_size=3)

In [16]:
# Preview the first rows of the experiment frame (now including feature_1).
df_experiment.head()

Unnamed: 0,feature_1,ab_variant,user_id,metric_value
0,104.508999,control,0,947.580992
1,104.972314,control,1,951.750822
2,107.466149,control,2,974.195339
3,107.203766,control,3,971.833897
4,93.071886,control,4,844.646976


In [17]:
# Preview the first rows of the pre-experiment frame (now including feature_1).
df_pre_experiment.head()

Unnamed: 0,feature_1,user_id,metric_value
0,104.967142,0,953.441966
1,98.617357,1,886.048564
2,106.476885,2,963.810879
3,115.230299,3,1040.000719
4,97.658466,4,877.685605


In [18]:
# Standard deviation of the metric in the experiment frame vs. the pre-experiment frame.
df_experiment.metric_value.std(), df_pre_experiment.metric_value.std()

(127.1541285121924, 90.68630265079938)

In [19]:
# Mean metric per variant, to compare the observed gap with the injected effect_size.
df_experiment.groupby('ab_variant').metric_value.mean()

ab_variant
control    906.642692
test       910.224257
Name: metric_value, dtype: float64

In [20]:
# Write both frames to CSV without the index column.
# NOTE: these are the same paths the earlier sections of this notebook write
# to, so this replaces any files they produced.
df_experiment.to_csv('sample_data.csv', index = False)
df_pre_experiment.to_csv('pre_exp_sample_data.csv', index = False)