In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from typing import *

# Notebook-wide pandas options: widen the printed output and opt in to the
# future "no silent downcasting" behaviour.
display_options = {
    'display.max_columns': 100,
    'display.max_rows': 100,
    'display.width': 150,
    'display.max_colwidth': None,
    'future.no_silent_downcasting': True,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()

# Load the dataset from the working directory.
df = pd.read_csv('data_part_2.csv')

# Sanity check: (rows, columns), then the first five rows as the cell output.
print(df.shape)
df.head()

(1009, 18)


Unnamed: 0,user_id,region,device,channel,session_start,session_end,sessiondurationsec,session_date,month,day,hour_of_day,order_dt,revenue,payment_type,promo_code,final_price,time_of_day,sold
0,529697267522,United States,iPhone,социальные сети,2019-05-01 00:06:40,2019-05-01 00:07:06,26.0,2019-05-01,5,3,0,2019-05-01 00:06:40,9999.0,Mobile payments,0.0,9999.0,night,1
1,601292388085,United States,PC,organic,2019-05-01 06:56:16,2019-05-01 07:09:18,782.0,2019-05-01,5,3,7,,,,,,morning,0
2,852898876338,United States,Mac,социальные сети,2019-05-01 04:30:45,2019-05-01 04:34:56,251.0,2019-05-01,5,3,4,,,,,,night,0
3,998513020664,United States,iPhone,социальные сети,2019-05-01 18:53:42,2019-05-01 18:57:35,233.0,2019-05-01,5,3,18,,,,,,evening,0
4,240702200943,United States,Mac,социальные сети,2019-05-02 14:04:32,2019-05-02 14:09:51,319.0,2019-05-02,5,4,14,,,,,,day,0


# Set Proper Data Types

In [2]:
# Target dtype for every column that gets an explicit cast. Columns that are
# not listed here keep whatever dtype read_csv inferred for them.
dtype_mapping = dict(
    user_id='int64',
    region='object',
    device='object',
    channel='object',
    session_start='datetime64[ns]',
    session_end='datetime64[ns]',
    sessiondurationsec='float64',
    session_date='datetime64[ns]',
    month='int64',
    day='int64',
    hour_of_day='int64',
    order_dt='datetime64[ns]',
    revenue='float64',
    payment_type='object',
    promo_code='float64',
)

# astype returns a new frame; rebinding `df` keeps the rest of the notebook
# working against the typed version.
df = df.astype(dtype_mapping)

# Custom Functions

In [12]:
def aggregate_daily_data(
    data: pd.DataFrame,
    group_col: str,  # 'device' or 'channel'
    date_col: str = 'session_date',
    target_col: str = 'sold',
    region_col: str = 'region'
) -> pd.DataFrame:
    """
    Sum ``target_col`` for every (date, region, group) combination.

    Parameters
    ----------
    data : DataFrame
        Row-level data containing ``date_col``, ``region_col``, ``group_col``
        and ``target_col``.
    group_col : str
        Column whose values define the groups (e.g., 'device', 'channel').
    date_col : str
        Column containing dates.
    target_col : str
        Column whose values are summed.
    region_col : str
        Column containing region information. Defaults to 'region', which is
        the column this function previously hard-coded.

    Returns
    -------
    DataFrame
        One row per combination that occurs in ``data``, with the columns
        ``date_col``, ``region_col``, ``group_col`` and ``target_col`` (the
        latter holding the sum). Combinations that never occur are absent
        rather than zero-filled, and rows with a missing key value are left
        out by pandas' groupby defaults.
    """
    group_keys = [date_col, region_col, group_col]
    return data.groupby(group_keys)[target_col].sum().reset_index()

def test_group_effect(
    data: pd.DataFrame,
    group_col: str,  # 'device' or 'channel'
    combine_groups: dict = None,  # e.g., {'iPhone': 'mobile', 'Android': 'mobile'}
    region_col: str = 'region',
    target_col: str = 'sold',
    date_col: str = 'session_date',
    alpha: float = 0.05
) -> pd.DataFrame:
    """
    Tests effect of specified group on daily purchases for each region.
    
    Parameters
    ----------
    data : DataFrame
        Raw data
    group_col : str
        Column containing groups to compare (e.g., 'device', 'channel')
    combine_groups : dict, optional
        Mapping to combine groups (e.g., {'iPhone': 'mobile', 'Android': 'mobile'})
    region_col : str
        Column containing region information
    target_col : str
        Column containing values to compare
    date_col : str
        Column containing dates
    alpha : float
        Significance level
        
    Returns
    -------
    DataFrame
        Test results for each region
    """
    # First combine groups if specified
    data = data.copy()
    if combine_groups:
        data[f'{group_col}_combined'] = data[group_col].map(combine_groups)
        group_col = f'{group_col}_combined'
    
    # Then aggregate to daily level
    daily_data = (data
                 .groupby([date_col, region_col, group_col])[target_col]
                 .sum()
                 .reset_index())
    
    results = []
    for region in daily_data[region_col].unique():
        region_data = daily_data[daily_data[region_col] == region]
        
        # Get groups for testing
        groups = [group[target_col].values 
                 for name, group in region_data.groupby(group_col)]
        
        # Choose appropriate test based on number of groups
        if len(groups) == 2:
            stat, pval = stats.mannwhitneyu(groups[0], groups[1], alternative='two-sided')
            test_name = 'Mann-Whitney U'
        else:
            stat, pval = stats.kruskal(*groups)
            test_name = 'Kruskal-Wallis'
        
        # Calculate summary statistics
        group_stats = region_data.groupby(group_col)[target_col].agg(['size', 'mean'])
        
        # Create result dictionary
        result = {
            'region': region,
            'statistic': stat,
            'p_value': pval,
            'test_used': test_name,
            'significant': pval < alpha
        }
        
        # Add group statistics to results
        for group in group_stats.index:
            result[f'{group}_n'] = group_stats.loc[group, 'size']
            result[f'{group}_mean'] = group_stats.loc[group, 'mean']
        
        results.append(result)
    
    return pd.DataFrame(results)

# Devices: collapse the four device labels into two families, then test each
# region for a difference between them.
device_families = {
    'iPhone': 'mobile',
    'Android': 'mobile',
    'Mac': 'desktop',
    'PC': 'desktop'
}
device_results = test_group_effect(
    df,
    group_col='device',
    combine_groups=device_families
)
print("\nDevice Results:", device_results, sep="\n")

# Channels: list the labels present in the data, then compare them as-is
# (no combining).
print("\nUnique channels:", df['channel'].unique())
channel_results = test_group_effect(df, group_col='channel')
print("\nChannel Results:", channel_results, sep="\n")


Device Results:
          region  statistic   p_value       test_used  significant  desktop_n  desktop_mean  mobile_n  mobile_mean
0  United States     9693.5  0.029154  Mann-Whitney U         True        133      0.526316       168     0.755952
1         France     1147.5  0.492671  Mann-Whitney U        False         48      0.333333        45     0.222222
2        Germany      896.5  0.331919  Mann-Whitney U        False         43      0.232558        46     0.326087
3             UK     1085.0  0.720463  Mann-Whitney U        False         50      0.340000        45     0.377778

Unique channels: ['социальные сети' 'organic' 'реклама у блогеров' 'контекстная реклама'
 'email-рассылки']

Channel Results:
          region  statistic   p_value       test_used  significant  organic_n  organic_mean  контекстная реклама_n  контекстная реклама_mean  \
0  United States   9.302521  0.025528  Kruskal-Wallis         True        132      0.515152                     18                  0.333