In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from typing import *

# Widen pandas' display limits so wide frames and long cell values are shown in full.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 150)
pd.set_option('display.max_colwidth', None)
# Opt in to the future pandas behaviour in which results are not silently downcast.
pd.set_option('future.no_silent_downcasting', True)
# set_theme() is the preferred spelling; sns.set() is only an alias for it.
sns.set_theme()

# Load the session-level data set used by every later cell.
df = pd.read_csv('data_part_2.csv')

print(df.shape)
df.head()

(1009, 18)


Unnamed: 0,user_id,region,device,channel,session_start,session_end,sessiondurationsec,session_date,month,day,hour_of_day,order_dt,revenue,payment_type,promo_code,final_price,time_of_day,sold
0,529697267522,United States,iPhone,социальные сети,2019-05-01 00:06:40,2019-05-01 00:07:06,26.0,2019-05-01,5,3,0,2019-05-01 00:06:40,9999.0,Mobile payments,0.0,9999.0,night,1
1,601292388085,United States,PC,organic,2019-05-01 06:56:16,2019-05-01 07:09:18,782.0,2019-05-01,5,3,7,,,,,,morning,0
2,852898876338,United States,Mac,социальные сети,2019-05-01 04:30:45,2019-05-01 04:34:56,251.0,2019-05-01,5,3,4,,,,,,night,0
3,998513020664,United States,iPhone,социальные сети,2019-05-01 18:53:42,2019-05-01 18:57:35,233.0,2019-05-01,5,3,18,,,,,,evening,0
4,240702200943,United States,Mac,социальные сети,2019-05-02 14:04:32,2019-05-02 14:09:51,319.0,2019-05-02,5,4,14,,,,,,day,0


# Set Proper Data Types

In [2]:
# Target dtype for every column that needs an explicit cast.
dtype_mapping = dict(
    user_id='int64',
    region='object',
    device='object',
    channel='object',
    session_start='datetime64[ns]',
    session_end='datetime64[ns]',
    sessiondurationsec='float64',
    session_date='datetime64[ns]',
    month='int64',
    day='int64',
    hour_of_day='int64',
    order_dt='datetime64[ns]',
    revenue='float64',
    payment_type='object',
    promo_code='float64',
)

# Cast all listed columns in one pass; columns absent from the mapping keep their dtype.
df = df.astype(dtype_mapping)

# 1. Data Preparation:

In [5]:
def prepare_daily_channel_purchases(data):
    """
    Sum the 'sold' column per (session_date, region, channel) combination.

    Returns a flat DataFrame with the three grouping columns followed by
    'sold'; only combinations that occur in `data` produce a row.
    """
    group_keys = ['session_date', 'region', 'channel']
    sold_per_group = data.groupby(group_keys)['sold'].sum()
    return sold_per_group.reset_index()

# Build the per-day aggregate and preview it
daily_data = prepare_daily_channel_purchases(df)
print("Sample of daily data:", daily_data.head(), sep="\n")
print(f"\nUnique channels: {daily_data['channel'].unique()}")

Sample of daily data:
  session_date         region             channel  sold
0   2019-05-01  United States             organic     0
1   2019-05-01  United States     социальные сети     1
2   2019-05-02         France  реклама у блогеров     0
3   2019-05-02  United States     социальные сети     0
4   2019-05-03         France             organic     0

Unique channels: ['organic' 'социальные сети' 'реклама у блогеров' 'контекстная реклама'
 'email-рассылки']


# 2. Check for normality:

In [7]:
def test_normality_channel_purchases(daily_data):
    results = []
    for region in daily_data['region'].unique():
        for channel in daily_data['channel'].unique():
            mask = (daily_data['region'] == region) & (daily_data['channel'] == channel)
            purchases = daily_data[mask]['sold']
            
            if len(purchases) > 2:  # Need at least 3 samples for Shapiro-Wilk
                stat, pval = stats.shapiro(purchases)
                results.append({
                    'region': region,
                    'channel': channel,
                    'n_samples': len(purchases),
                    'shapiro_p': pval
                })
    
    return pd.DataFrame(results)

# Run the normality check on the daily aggregate and show the table
normality_results = test_normality_channel_purchases(daily_data)
print("\nNormality test results:", normality_results, sep="\n")


Normality test results:
           region              channel  n_samples     shapiro_p
0   United States              organic        132  3.478933e-15
1   United States      социальные сети        158  3.347964e-15
2   United States   реклама у блогеров         35  8.357733e-09
3   United States  контекстная реклама         18  9.265257e-06
4          France              organic         29  3.103441e-10
5          France   реклама у блогеров         18  1.186212e-05
6          France  контекстная реклама         38  4.180788e-09
7          France       email-рассылки         11  5.145814e-05
8         Germany              organic         29  4.327982e-08
9         Germany   реклама у блогеров         20  1.857047e-06
10        Germany  контекстная реклама         39  1.016136e-09
11        Germany       email-рассылки          5  1.309782e-04
12             UK              organic         31  3.376156e-08
13             UK   реклама у блогеров         22  5.144927e-07
14             

# All Shapiro-Wilk p-values are < 0.05, so normality is rejected for every region-channel group. Therefore, we'll use the non-parametric Kruskal-Wallis test within each region.

In [8]:
def test_channel_effect_by_region(daily_data):
    """
    Tests effect of channel on daily purchases for each region using Kruskal-Wallis.
    """
    results = []
    
    for region in daily_data['region'].unique():
        region_data = daily_data[daily_data['region'] == region]
        
        # Get channel groups
        channel_groups = [group['sold'].values 
                         for _, group in region_data.groupby('channel')]
        
        # Kruskal-Wallis test
        stat, pval = stats.kruskal(*channel_groups)
        
        # Get summary statistics
        channel_stats = region_data.groupby('channel')['sold'].agg(['size', 'mean'])
        
        results.append({
            'region': region,
            'statistic': stat,
            'p_value': pval,
            'significant': pval < 0.05,
            **{f'{ch}_n': channel_stats.loc[ch, 'size'] 
               for ch in channel_stats.index},
            **{f'{ch}_mean': channel_stats.loc[ch, 'mean'] 
               for ch in channel_stats.index}
        })
    
    return pd.DataFrame(results)

# Kruskal-Wallis per region on the daily aggregate, then show the table
results = test_channel_effect_by_region(daily_data)
print("\nTest results:", results, sep="\n")


Test results:
          region  statistic   p_value  significant  organic_n  контекстная реклама_n  реклама у блогеров_n  социальные сети_n  organic_mean  \
0  United States   9.302521  0.025528         True        132                     18                    35              158.0      0.515152   
1         France   5.713365  0.126420        False         29                     38                    18                NaN      0.103448   
2        Germany   0.344477  0.951457        False         29                     39                    20                NaN      0.275862   
3             UK   1.108835  0.774942        False         31                     44                    22                NaN      0.290323   

   контекстная реклама_mean  реклама у блогеров_mean  социальные сети_mean  email-рассылки_n  email-рассылки_mean  
0                  0.333333                 0.285714               0.71519               NaN                  NaN  
1                  0.315789          

# Since we found a significant difference in the USA (p = 0.026 < 0.05), we need to conduct post-hoc tests to determine which specific channels differ from each other.

In [11]:
def perform_posthoc_tests(daily_data, region='United States', alpha=0.05):
    """
    Pairwise Mann-Whitney U comparisons between channels within one region.

    Parameters
    ----------
    daily_data : pandas.DataFrame
        Must contain the columns 'region', 'channel' and 'sold'.
    region : str, default 'United States'
        Region whose channels are compared.
    alpha : float, default 0.05
        Family-wise significance level; each pair is judged against
        alpha / number_of_pairs (Bonferroni correction).

    Returns
    -------
    pandas.DataFrame
        One row per unordered channel pair with 'channel1', 'channel2',
        'statistic', 'p_value' (unadjusted), 'significant',
        'channel1_mean' and 'channel2_mean'. Empty (with the same columns)
        when the region has fewer than two channels.
    """
    columns = ['channel1', 'channel2', 'statistic', 'p_value',
               'significant', 'channel1_mean', 'channel2_mean']

    region_data = daily_data[daily_data['region'] == region]
    channels = region_data['channel'].unique()
    n_comparisons = len(channels) * (len(channels) - 1) // 2

    # With zero or one channel there is nothing to compare, and the
    # Bonferroni division below would raise ZeroDivisionError.
    if n_comparisons == 0:
        return pd.DataFrame(columns=columns)

    adjusted_alpha = alpha / n_comparisons

    # Slice each channel's sample once instead of once per pair.
    samples = {ch: region_data.loc[region_data['channel'] == ch, 'sold']
               for ch in channels}

    results = []
    for i, ch1 in enumerate(channels):
        for ch2 in channels[i + 1:]:
            ch1_data = samples[ch1]
            ch2_data = samples[ch2]

            stat, pval = stats.mannwhitneyu(ch1_data, ch2_data, alternative='two-sided')

            results.append({
                'channel1': ch1,
                'channel2': ch2,
                'statistic': stat,
                'p_value': pval,
                'significant': pval < adjusted_alpha,
                'channel1_mean': ch1_data.mean(),
                'channel2_mean': ch2_data.mean()
            })

    return pd.DataFrame(results, columns=columns)

# Pairwise channel comparisons for the default region (United States)
posthoc_results = perform_posthoc_tests(daily_data)
print("\nPost-hoc test results for USA:", posthoc_results, sep="\n")


Post-hoc test results for USA:
             channel1             channel2  statistic   p_value  significant  channel1_mean  channel2_mean
0             organic      социальные сети     9414.5  0.110287        False       0.515152       0.715190
1             organic   реклама у блогеров     2676.0  0.091945        False       0.515152       0.285714
2             organic  контекстная реклама     1341.5  0.304774        False       0.515152       0.333333
3     социальные сети   реклама у блогеров     3446.0  0.010569        False       0.715190       0.285714
4     социальные сети  контекстная реклама     1732.0  0.092901        False       0.715190       0.333333
5  реклама у блогеров  контекстная реклама      306.5  0.844931        False       0.285714       0.333333


Initial analysis showed a channel effect only in the USA (p = 0.026 < 0.05)


However, post-hoc analysis with Bonferroni correction (α = 0.05/6 = 0.0083) revealed:

No significant pairwise differences between channels


Closest to significance: социальные сети vs реклама у блогеров (p = 0.011)

# While there is some evidence of a channel effect in the USA market, the differences between specific channels are not strong enough to be statistically significant after controlling for multiple comparisons. No channel effects were detected in the other regions.