In [20]:
import numpy as np
import pandas as pd
from scipy import stats
import datetime 
import random
import tqdm

## задача про зависимость между долей удаляемых выбросов и мощностью

In [26]:
# Load the raw web-log export; the cells below use its 'date', 'user_id' and 'load_time' columns.
logs = pd.read_csv(filepath_or_buffer='2022-04-01T12_df_web_logs.csv')

In [27]:
# Parse the whole 'date' column with a single vectorized call instead of
# invoking pd.to_datetime once per row through DataFrame.apply(axis=1).
logs['date'] = pd.to_datetime(logs['date'])

In [28]:
# Keep only the rows dated within [2022-03-01, 2022-03-08): lower bound inclusive, upper bound exclusive.
period_start = datetime.datetime(2022, 3, 1)
period_end = datetime.datetime(2022, 3, 8)
in_period = (logs['date'] >= period_start) & (logs['date'] < period_end)
logs = logs[in_period]


In [29]:
# Distinct user ids present in the filtered logs; the simulation shuffles this array in place.
users = logs['user_id'].unique()

In [30]:
# test_data.load_time.mean() / ctrl_data.load_time.mean()

In [31]:
# Total share of observations trimmed as outliers (half from each tail of the distribution).
outliers = [0.0002, 0.002, 0.02, 0.10, 0.20]
# Significance level the t-test p-value is compared against.
alpha = 0.05

# Seed the `random` module so the random user splits of the simulation
# are reproducible after Restart Kernel -> Run All.
SEED = 42
random.seed(SEED)

In [32]:
# 1. Type II error: failing to detect an effect that is actually present.
# 2. Power = 1 - P(type II error).
# 3. P(type II error) is estimated as (number of runs with a type II error) / (total number of simulation runs).

In [33]:
# Monte Carlo estimate of how often a t-test misses a +1% multiplicative effect
# on load_time, depending on the share of extreme values trimmed beforehand.
# results_dict maps every trimmed share to a list with one flag per run:
# 1 = effect missed (pvalue > alpha, a type II error), 0 = effect detected.
results_dict = {outlier: [] for outlier in outliers}

# Cast load_time to float once, with a vectorized call, instead of row by row
# inside every iteration; `logs` itself is left untouched.
logs_numeric = logs.assign(load_time=logs['load_time'].astype(float))

for _ in tqdm.tqdm(range(1000)):
    for outlier in outliers:

        # Fresh random split: the first 1000 shuffled users form the test group,
        # the next 1000 the control group.
        random.shuffle(users)
        test_users = pd.DataFrame(list(users[:1000]), columns=['user_id'])
        ctrl_users = pd.DataFrame(list(users[1000:2000]), columns=['user_id'])

        test_data = logs_numeric.merge(test_users, how='inner', on='user_id')
        ctrl_data = logs_numeric.merge(ctrl_users, how='inner', on='user_id')

        # Synthetic effect: every test observation becomes 1% larger.
        test_data['load_time'] = test_data['load_time'] * 1.01

        # The trimmed share is split evenly between the two tails.
        left_q = outlier / 2
        right_q = 1 - outlier / 2

        # Trim the tails of the test group using its own quantiles (bounds inclusive).
        test_data_left_boarder, test_data_rght_boarder = np.quantile(
            test_data['load_time'], [left_q, right_q]
        )
        test_data_tmp = test_data.loc[
            test_data['load_time'].between(test_data_left_boarder, test_data_rght_boarder),
            'load_time',
        ]

        # Trim the tails of the control group using its own quantiles (bounds inclusive).
        ctrl_data_left_boarder, ctrl_data_rght_boarder = np.quantile(
            ctrl_data['load_time'], [left_q, right_q]
        )
        ctrl_data_tmp = ctrl_data.loc[
            ctrl_data['load_time'].between(ctrl_data_left_boarder, ctrl_data_rght_boarder),
            'load_time',
        ]

        _, pvalue = stats.ttest_ind(ctrl_data_tmp, test_data_tmp)
        results_dict[outlier].append(int(pvalue > alpha))
    

100%|██████████| 1000/1000 [18:12<00:00,  1.09s/it]


In [34]:
# Share of runs per trimming level in which the effect was missed (flag == 1),
# i.e. the empirical type II error rate; the power is 1 minus this share.
# The rates go into a separate dict, so results_dict keeps the raw 0/1 flags
# and the cell can be re-run without a TypeError from summing a float.
type2_error_rate = {
    boarder: sum(flags) / len(flags)
    for boarder, flags in results_dict.items()
}

print(type2_error_rate)

{0.0002: 0.905, 0.002: 0.633, 0.02: 0.073, 0.1: 0.04, 0.2: 0.034}


## меняю способ добавления эффекта

In [35]:
# Reload the raw logs for the second experiment and keep the rows dated within
# [2022-03-01, 2022-03-08).
logs = pd.read_csv('2022-04-01T12_df_web_logs.csv')
# Vectorized parsing instead of calling pd.to_datetime once per row.
logs['date'] = pd.to_datetime(logs['date'])
logs = logs[
    (logs['date'] >= datetime.datetime(2022, 3, 1))
    &
    (logs['date'] < datetime.datetime(2022, 3, 8))
]


In [36]:
# Distinct user ids present in the filtered logs; the simulation shuffles this array in place.
users = logs['user_id'].unique()

In [37]:
# Total share of observations trimmed as outliers (half from each tail of the distribution).
outliers = [0.0002, 0.002, 0.02, 0.10, 0.20]
# Significance level the t-test p-value is compared against.
alpha = 0.05

# Seed the `random` module so the random splits and the random choice of
# uplifted rows in the simulation are reproducible after Restart Kernel -> Run All.
SEED = 42
random.seed(SEED)

In [41]:
# Monte Carlo estimate of how often a t-test misses an effect that is injected
# into only 1% of the test observations, depending on the share of extreme
# values trimmed beforehand.
# results_dict maps every trimmed share to a list with one flag per run:
# 1 = effect missed (pvalue > alpha, a type II error), 0 = effect detected.
results_dict = {outlier: [] for outlier in outliers}

# Cast load_time to float once, with a vectorized call, instead of row by row
# inside every iteration; `logs` itself is left untouched.
logs_numeric = logs.assign(load_time=logs['load_time'].astype(float))

for _ in tqdm.tqdm(range(1000)):
    for outlier in outliers:

        # Fresh random split: the first 1000 shuffled users form the test group,
        # the next 1000 the control group.
        random.shuffle(users)
        test_users = pd.DataFrame(list(users[:1000]), columns=['user_id'])
        ctrl_users = pd.DataFrame(list(users[1000:2000]), columns=['user_id'])

        test_data = logs_numeric.merge(test_users, how='inner', on='user_id')
        ctrl_data = logs_numeric.merge(ctrl_users, how='inner', on='user_id')

        # Pick a random 1% of the test rows to carry the whole effect.
        uplifted_indices = list(test_data.index)
        random.shuffle(uplifted_indices)
        uplift_indices_size = int(len(test_data.index) * 0.01)
        uplifted_indices = uplifted_indices[:uplift_indices_size]

        # Add the same constant to each chosen row so that the total load_time of
        # the test group grows by 1%. Skipped when the 1% sample is empty, which
        # would otherwise divide by zero.
        if uplifted_indices:
            const_value = test_data['load_time'].sum() * 0.01 / len(uplifted_indices)
            test_data.loc[uplifted_indices, 'load_time'] += const_value

        # The trimmed share is split evenly between the two tails.
        left_q = outlier / 2
        right_q = 1 - outlier / 2

        # Trim the tails of the test group using its own quantiles (bounds inclusive).
        test_data_left_boarder, test_data_rght_boarder = np.quantile(
            test_data['load_time'], [left_q, right_q]
        )
        test_data_tmp = test_data.loc[
            test_data['load_time'].between(test_data_left_boarder, test_data_rght_boarder),
            'load_time',
        ]

        # Trim the tails of the control group using its own quantiles (bounds inclusive).
        ctrl_data_left_boarder, ctrl_data_rght_boarder = np.quantile(
            ctrl_data['load_time'], [left_q, right_q]
        )
        ctrl_data_tmp = ctrl_data.loc[
            ctrl_data['load_time'].between(ctrl_data_left_boarder, ctrl_data_rght_boarder),
            'load_time',
        ]

        _, pvalue = stats.ttest_ind(ctrl_data_tmp, test_data_tmp)
        results_dict[outlier].append(int(pvalue > alpha))
    

100%|██████████| 1000/1000 [15:20<00:00,  1.09it/s]


In [47]:
# Share of runs per trimming level in which the effect was missed (flag == 1),
# i.e. the empirical type II error rate; the power is 1 minus this share.
# The rates go into a separate dict, so results_dict keeps the raw 0/1 flags
# and the cell can be re-run without a TypeError from summing a float.
type2_error_rate = {
    boarder: sum(flags) / len(flags)
    for boarder, flags in results_dict.items()
}

print(type2_error_rate)

{0.0002: 0.906, 0.002: 0.663, 0.02: 0.559, 0.1: 0.709, 0.2: 0.687}


## Удаление выбросов

In [70]:
import pandas as pd


def process_outliers(metrics, bounds, outlier_process_type):
    """Return a new DataFrame with the outliers of the metric handled.

    The input frame is never modified: all work is done on a copy. Outliers
    are selected with boolean masks, so frames with a non-unique index are
    processed row by row rather than label by label.

    :param metrics: DataFrame with columns ['user_id', 'metric'].
    :param bounds: (lower_bound, upper_bound) of the metric; values strictly
        below the lower or strictly above the upper bound are outliers.
    :param outlier_process_type: how to handle outliers:
        'drop' - remove the measurement,
        'clip' - replace the outlier with the nearest bound.
        Any other value leaves the measurements as they are (the 'metric'
        column is still cast to float).

    :return: DataFrame with columns ['user_id', 'metric'].
    """
    lower_bound, upper_bound = bounds[0], bounds[1]

    # Work on a copy so the caller's frame keeps its values and dtype.
    result = metrics.copy()
    result['metric'] = result['metric'].astype(float)

    # Both masks are computed before any change, so clipping one side
    # cannot affect which rows are selected on the other side.
    below = result['metric'] < lower_bound
    above = result['metric'] > upper_bound

    if outlier_process_type == 'drop':
        result = result[~(below | above)].copy()
    elif outlier_process_type == 'clip':
        result.loc[below, 'metric'] = lower_bound
        result.loc[above, 'metric'] = upper_bound

    return result
        

In [71]:
metrics = pd.DataFrame({'user_id': [1, 2, 3], 'metric': [1., 2, 3]})
bounds = (0.1, 2.2,)

# Expected results, in call order:
#   'drop' -> pd.DataFrame({'user_id': [1, 2], 'metric': [1.0, 2.0]})
#   'clip' -> pd.DataFrame({'user_id': [1, 2, 3], 'metric': [1.0, 2.0, 2.2]})
for outlier_process_type in ('drop', 'clip'):
    result = process_outliers(metrics, bounds, outlier_process_type)
    print(result)

   user_id  metric
0        1     1.0
1        2     2.0
   user_id  metric
0        1     1.0
1        2     2.0
2        3     2.2


In [42]:
import pandas as pd

# build a small demo DataFrame
df = pd.DataFrame(
    {'column_name': [1, 5, 12, 6, 18, 3, 9, 15, 2, 7]}
)

# keep only the rows whose "column_name" value is not greater than 10
not_above_ten = df['column_name'] <= 10
df = df[not_above_ten]

# equivalent alternative using a query expression
# df = df.query('column_name <= 10')

print(df)

   column_name
0            1
1            5
3            6
5            3
6            9
8            2
9            7
