In [2]:
import pandas as pd
import polars as pl
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide display settings: wider text output and effectively no
# truncation of columns/rows when frames are printed.
pd.set_option('display.width', 150)
# Opt in to pandas' future behaviour of not silently downcasting dtypes.
pd.set_option('future.no_silent_downcasting', True)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
# Apply seaborn's default plotting theme.
sns.set()

In [3]:
# Columns that read_csv should parse as datetimes.
dates = ['session_start', 'session_end', 'session_date', 'order_dt']
df = pd.read_csv('ecom_go_2.csv', parse_dates=dates)
# Polars copy of the same data.
# NOTE(review): df_pl is not referenced in the cells visible here - confirm it is still needed.
df_pl = pl.from_pandas(df)

# Quick sanity check of what was loaded: dimensions and column names.
print(df.shape)
print(df.columns.tolist())

(1009, 19)
['user_id', 'region', 'device', 'channel', 'session_start', 'session_end', 'sessiondurationsec', 'session_date', 'month', 'day', 'hour_of_day', 'order_dt', 'revenue', 'payment_type', 'promo_code', 'final_price', 'time_of_day', 'payer', 'week']


In [4]:
# Suspicious revenue values mapped to the value each one is replaced with.
# NOTE(review): the choice of replacement prices (9999 / 4999) is taken as-is
# from the original cell - confirm it against the price list.
REVENUE_OUTLIER_FIXES = {100000: 9999, 1: 4999}

# Variant 1: the data exactly as loaded.
df_unchanged = df.copy()
print(df_unchanged.shape, df_unchanged['payer'].sum())

# Variant 2: same rows, suspicious revenue values swapped for their replacements.
df_replaced = df.copy()
df_replaced['revenue'] = df_replaced['revenue'].replace(REVENUE_OUTLIER_FIXES)
print(df_replaced.shape, df_replaced['payer'].sum())

# Variant 3: rows with suspicious revenue values dropped entirely.
# drop() returns a new frame, so `df` itself is left untouched.
to_remove = df[df['revenue'].isin(list(REVENUE_OUTLIER_FIXES))].index
df_removed = df.drop(index=to_remove)
print(df_removed.shape, df_removed['payer'].sum())


# (frame, label) pairs iterated by the analysis cells below.
dfs = [(df_unchanged, "ORIGINAL DATAFRAME:"),
       (df_replaced, "REPLACED DATAFRAME:"),
       (df_removed, "REMOVED DATAFRAME:")]

(1009, 19) 282
(1009, 19) 282
(1003, 19) 276


### `Рассчитать средний чек`

In [5]:
# List the distinct revenue values present in the loaded data.
df['revenue'].unique()

array([  9999,      0,   4999,   5999,      1, 100000])

In [7]:
# Total revenue in the variant where suspicious revenue values were replaced.
df_replaced.revenue.sum()

np.int64(1585718)

In [6]:
def calculate_average_revenue(df):
    """Return average order value (AOV) and average revenue per user (ARPU) as text.

    AOV is total revenue divided by the sum of the ``payer`` column (used as
    the order count); ARPU is total revenue divided by the number of unique
    ``user_id`` values. Both are rounded to whole units.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``revenue``, ``payer`` and ``user_id`` columns.

    Returns
    -------
    str
        Text of the form ``'AOV=6,864, ARPU=1,940'``. A metric whose
        denominator is zero is reported as ``nan`` instead of a meaningless
        number produced by casting infinity to an integer.
    """
    total_revenue = df['revenue'].sum()

    total_orders = df['payer'].sum()
    total_users = df['user_id'].nunique()

    def _rounded_ratio(numerator, denominator):
        # Guard the division: without it a frame with no orders/users yields
        # inf, and casting inf to int prints a garbage value.
        if denominator == 0:
            return float('nan')
        return int(round(float(numerator) / float(denominator)))

    # The local names are part of the output: the f-string `=` specifier
    # prints them verbatim.
    AOV = _rounded_ratio(total_revenue, total_orders)
    ARPU = _rounded_ratio(total_revenue, total_users)

    return f'{AOV=:,.0f}, {ARPU=:,.0f}'

# Report AOV / ARPU for every (frame, label) pair in dfs.
for df_go, df_name in dfs:
    print(df_name, calculate_average_revenue(df_go))

ORIGINAL DATAFRAME: AOV=6,864, ARPU=1,940
REPLACED DATAFRAME: AOV=5,623, ARPU=1,589
REMOVED DATAFRAME: AOV=5,564, ARPU=1,548


### `Сколько покупок в среднем совершает 1 пользователь?`

In [7]:
def calculate_purchases_per_user(df):
    """Return purchases per unique user, counting payers and non-payers alike.

    The sum of the ``payer`` column is used as the purchase count and is
    divided by the number of distinct ``user_id`` values; the ratio is
    rounded to two decimals.
    """
    purchase_count = df['payer'].sum()
    user_count = df['user_id'].nunique()
    return round(purchase_count / user_count, 2)

# Report purchases per unique user (all users) for every (frame, label) pair in dfs.
for df_go, df_name in dfs:
    print(df_name, calculate_purchases_per_user(df_go))

ORIGINAL DATAFRAME: 0.28
REPLACED DATAFRAME: 0.28
REMOVED DATAFRAME: 0.28


In [8]:
def calculate_purchases_per_user(df):
    """Return purchases per paying user, rounded to two decimals.

    Only users with at least one ``payer == 1`` row are counted in the
    denominator; the numerator is the sum of the ``payer`` column.

    NOTE(review): this reuses the name of the function defined in the
    previous cell and therefore shadows it once this cell has run.
    """
    is_purchase = df['payer'] == 1
    paying_user_count = df.loc[is_purchase, 'user_id'].nunique()
    purchase_count = df['payer'].sum()
    return round(purchase_count / paying_user_count, 2)

# Report purchases per paying user for every (frame, label) pair in dfs.
# This uses the definition from this cell, which replaces the earlier one of the same name.
for df_go, df_name in dfs:
    print(df_name, calculate_purchases_per_user(df_go))

ORIGINAL DATAFRAME: 1.02
REPLACED DATAFRAME: 1.02
REMOVED DATAFRAME: 1.02


### `Рассчитать среднюю продолжительность сессии по рекламным каналам` 

In [9]:
def calculate_avg_duration_by_channel(df):
    """Return the mean ``sessiondurationsec`` per channel, floor-divided by 60.

    The result is a one-column DataFrame indexed by ``channel``.
    """
    mean_duration = df.groupby('channel')[['sessiondurationsec']].mean()
    whole_units = mean_duration // 60
    return whole_units

for df_go, df_name in dfs:
    # Put the label on its own line: printed on the same line, it pushes the
    # frame's header row out of alignment with its columns.
    print(df_name, calculate_avg_duration_by_channel(df_go), sep='\n')
    print()

ORIGINAL DATAFRAME:                      sessiondurationsec
channel                                
email-рассылки                     33.0
organic                            31.0
контекстная реклама                32.0
реклама у блогеров                 31.0
социальные сети                    28.0

REPLACED DATAFRAME:                      sessiondurationsec
channel                                
email-рассылки                     33.0
organic                            31.0
контекстная реклама                32.0
реклама у блогеров                 31.0
социальные сети                    28.0

REMOVED DATAFRAME:                      sessiondurationsec
channel                                
email-рассылки                     33.0
organic                            31.0
контекстная реклама                32.0
реклама у блогеров                 31.0
социальные сети                    28.0



### `Рассчитать среднюю продолжительность сессии по типу устройства` 

In [10]:
def calculate_avg_duration_by_device(df):
    """Return the mean ``sessiondurationsec`` per device, floor-divided by 60.

    The result is a one-column DataFrame indexed by ``device``.
    """
    mean_duration = df.groupby('device')[['sessiondurationsec']].mean()
    whole_units = mean_duration // 60
    return whole_units

for df_go, df_name in dfs:
    # Put the label on its own line: printed on the same line, it pushes the
    # frame's header row out of alignment with its columns.
    print(df_name, calculate_avg_duration_by_device(df_go), sep='\n')
    print()

ORIGINAL DATAFRAME:          sessiondurationsec
device                     
Android                31.0
Mac                    29.0
PC                     32.0
iPhone                 28.0

REPLACED DATAFRAME:          sessiondurationsec
device                     
Android                31.0
Mac                    29.0
PC                     32.0
iPhone                 28.0

REMOVED DATAFRAME:          sessiondurationsec
device                     
Android                31.0
Mac                    29.0
PC                     32.0
iPhone                 28.0



### `Определить топ-3 рекламных канала по среднему чеку`

In [29]:
def top_channels(df):
    """Rank channels by the mean revenue of rows with positive revenue.

    Rows with ``revenue <= 0`` are ignored. The per-channel mean is rounded
    to an integer and the Series is sorted from highest to lowest. Every
    channel is returned, not only the first three.
    """
    paid_rows = df[df['revenue'] > 0]
    mean_check = paid_rows.groupby('channel')['revenue'].mean()
    rounded_check = mean_check.round().astype(int)
    return rounded_check.sort_values(ascending=False)
for df_go, df_name in dfs:
    # Put the label on its own line so it does not share a line with the
    # Series' index name.
    print(df_name, top_channels(df_go), sep='\n')
    print()

ORIGINAL DATAFRAME: channel
реклама у блогеров     11999
organic                 7681
социальные сети         5618
email-рассылки          5428
контекстная реклама     5310
Name: revenue, dtype: int64

REPLACED DATAFRAME: channel
реклама у блогеров     5792
organic                5692
социальные сети        5663
email-рассылки         5428
контекстная реклама    5310
Name: revenue, dtype: int64

REMOVED DATAFRAME: channel
социальные сети        5669
organic                5599
реклама у блогеров     5480
email-рассылки         5428
контекстная реклама    5310
Name: revenue, dtype: int64



### `Определить топ-3 региона по среднему чеку`

In [27]:
def top_regions(df):
    """Rank regions by the mean revenue of rows with positive revenue.

    Rows with ``revenue <= 0`` are ignored. The per-region mean is rounded
    to an integer and the Series is sorted from highest to lowest. Every
    region is returned, not only the first three.
    """
    paid_rows = df[df['revenue'] > 0]
    mean_check = paid_rows.groupby('region')['revenue'].mean()
    rounded_check = mean_check.round().astype(int)
    return rounded_check.sort_values(ascending=False)

for df_go, df_name in dfs:
    # Put the label on its own line so it does not share a line with the
    # Series' index name.
    print(df_name, top_regions(df_go), sep='\n')
    print()

ORIGINAL DATAFRAME: region
France           12653
UK                8381
United States     6014
Germany           5479
Name: revenue, dtype: int64

REPLACED DATAFRAME: region
UK               5734
France           5730
United States    5608
Germany          5479
Name: revenue, dtype: int64

REMOVED DATAFRAME: region
UK               5605
United States    5592
Germany          5479
France           5374
Name: revenue, dtype: int64



### `Определить топ-3 месяца по среднему чеку с разбивкой по регионам`

In [8]:
def top_3_months_by_avg_check_per_region(df):
    """Return, for each region, the three months with the highest average check.

    Only rows with positive ``revenue`` are used. The average check is the
    mean revenue per (region, month), rounded to an integer before ranking.

    Returns a DataFrame with ``region``, ``month`` and ``avg_check`` columns,
    ordered by region and then by descending average check.
    """
    paid_rows = df[df['revenue'] > 0]

    monthly_avg_check = paid_rows.groupby(['region', 'month']).agg(
        avg_check=('revenue', 'mean')
    )
    monthly_avg_check = monthly_avg_check.reset_index()

    # Round before ranking so the ordering matches the displayed values.
    rounded = monthly_avg_check['avg_check'].round().astype(int)
    monthly_avg_check['avg_check'] = rounded

    ranked = monthly_avg_check.sort_values(
        ['region', 'avg_check'], ascending=[True, False]
    )
    return ranked.groupby('region').head(3)

# Unlike the other cells, only the replaced-outlier frame is reported here;
# the loop over all three frames is left disabled below.
# for df_go, df_name in dfs:
#     print(df_name, top_3_months_by_avg_check_per_region(df_go)[['region', 'month', 'avg_check']])
#     print()
result = top_3_months_by_avg_check_per_region(df_replaced)
print(result)

           region  month  avg_check
0          France      5       7999
1          France      6       5570
5          France     10       5332
8         Germany      7       7499
7         Germany      6       5799
10        Germany      9       5249
12             UK      5       7999
17             UK     10       6142
15             UK      8       5635
18  United States      5       6477
21  United States      8       5713
19  United States      6       5666


### `Посчитать MAU по каждому месяцу с разбивкой по рекламным каналам и выделить топ-3 рекламных каналов по количеству уникальных пользователей в месяц`

In [18]:
def calculate_mau_by_channel(df):
    """Return the three channels with the most unique users for each month.

    Unique ``user_id`` values are counted per (month, channel); within every
    month the rows are ordered by descending count and the first three kept.

    Returns a DataFrame with ``month``, ``channel`` and ``unique_users``.
    NOTE(review): grouping is by ``month`` only - presumably the data covers
    a single year; confirm.
    """
    # Unique users per month and channel.
    per_month_channel = df.groupby(['month', 'channel'])['user_id'].nunique()
    mau_by_channel = per_month_channel.reset_index(name='unique_users')

    # Largest audiences first within each month, then keep three per month.
    ordered = mau_by_channel.sort_values(
        ['month', 'unique_users'], ascending=[True, False]
    )
    return ordered.groupby('month').head(3)

for df_go, df_name in dfs:
    # Put the label on its own line: printed on the same line, it pushes the
    # frame's header row out of alignment with its columns.
    print(df_name, calculate_mau_by_channel(df_go), sep='\n')
    print()

ORIGINAL DATAFRAME:     month              channel  unique_users
1       5              organic            76
4       5      социальные сети            41
3       5   реклама у блогеров            14
6       6              organic            59
9       6      социальные сети            53
7       6  контекстная реклама            28
14      7      социальные сети            57
11      7              organic            46
12      7  контекстная реклама            24
19      8      социальные сети            69
16      8              organic            61
17      8  контекстная реклама            37
24      9      социальные сети            67
21      9              organic            58
22      9  контекстная реклама            38
29     10      социальные сети            85
26     10              organic            47
27     10  контекстная реклама            27

REPLACED DATAFRAME:     month              channel  unique_users
1       5              organic            76
4       5     

In [19]:
# Column listing of the loaded frame (same names as printed right after loading).
df.columns

Index(['user_id', 'region', 'device', 'channel', 'session_start', 'session_end', 'sessiondurationsec', 'session_date', 'month', 'day', 'hour_of_day',
       'order_dt', 'revenue', 'payment_type', 'promo_code', 'final_price', 'time_of_day', 'payer', 'week'],
      dtype='object')

### `Составить таблицу в которой будет указано по рекламным каналам: количество пользователей, количество уникальных пользователей, количество платящих пользователей, сумма продаж и определить какой источник “принес” больше всего платящих пользователей и большую сумму продаж`

In [30]:
def analyze_channel_performance(df):
    """Summarise users and sales per advertising channel.

    Prints which channel has the most paying users and which has the highest
    total sales, then returns the per-channel table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``channel``, ``user_id``, ``payer`` and ``revenue``
        columns. Rows with ``payer == 1`` are treated as paying rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by channel, with columns ``users_count`` (non-null
        ``user_id`` rows), ``unique_users``, ``paying_users`` (distinct users
        with a paying row), ``total_sales``, and the dense descending ranks
        ``paying_users_rank`` and ``total_sales_rank``.
    """
    channel_performance = df.groupby('channel').agg(
        users_count=('user_id', 'count'),
        unique_users=('user_id', 'nunique'),
        total_sales=('revenue', 'sum'),
    )

    # Count paying users from the paying rows directly. Looking rows up by
    # index label inside an aggregation lambda breaks when the frame's index
    # is not unique (e.g. after a concat).
    paying_users = (
        df.loc[df['payer'] == 1]
        .groupby('channel')['user_id']
        .nunique()
        .reindex(channel_performance.index, fill_value=0)  # channels with no payers -> 0
    )
    # Keep the original column order: paying_users sits before total_sales.
    channel_performance.insert(2, 'paying_users', paying_users)

    channel_performance['paying_users_rank'] = channel_performance['paying_users'].rank(method='dense', ascending=False)
    channel_performance['total_sales_rank'] = channel_performance['total_sales'].rank(method='dense', ascending=False)

    # Nothing to report for a frame without channels.
    if not channel_performance.empty:
        top_paying_users_channel = channel_performance['paying_users'].idxmax()
        top_sales_channel = channel_performance['total_sales'].idxmax()

        print(f"Channel with most paying users: {top_paying_users_channel}")
        print(f"Channel with highest total sales: {top_sales_channel}")

    return channel_performance

for df_go, df_name in dfs:
    # Print the label first: analyze_channel_performance prints its own
    # summary lines, which would otherwise appear above the label they
    # belong to.
    print(df_name)
    print(analyze_channel_performance(df_go))
    print()

Channel with most paying users: социальные сети
Channel with highest total sales: organic
ORIGINAL DATAFRAME:                      users_count  unique_users  paying_users  total_sales  paying_users_rank  total_sales_rank
channel                                                                                                       
email-рассылки                24            23             6        37993                5.0               5.0
organic                      347           346            87       675916                2.0               1.0
контекстная реклама          162           159            42       238955                3.0               4.0
реклама у блогеров           101           100            29       347973                4.0               3.0
социальные сети              375           370           113       634889                1.0               2.0

Channel with most paying users: социальные сети
Channel with highest total sales: социальные сети
REPLACED DATAF