In [1]:
import pandas as pd
import polars as pl
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide pandas settings: wider console output, opt in to the
# no-silent-downcasting behaviour, and raise the column/row display limits.
for _option, _value in (
    ('display.width', 150),
    ('future.no_silent_downcasting', True),
    ('display.max_columns', 1000),
    ('display.max_rows', 1000),
):
    pd.set_option(_option, _value)

# Switch matplotlib over to seaborn's default styling.
sns.set()

In [2]:
# Columns parsed as datetimes while reading the CSV.
dates = [
    'session_start',
    'session_end',
    'session_date',
    'order_dt',
]
df = pd.read_csv('ecom_go_2.csv', parse_dates=dates)
# Polars frame built from the same pandas data.
df_pl = pl.from_pandas(df)

# Quick look at the size and column names of what was loaded.
print(df.shape)
print(list(df.columns))

(1009, 19)
['user_id', 'region', 'device', 'channel', 'session_start', 'session_end', 'sessiondurationsec', 'session_date', 'month', 'day', 'hour_of_day', 'order_dt', 'revenue', 'payment_type', 'promo_code', 'final_price', 'time_of_day', 'payer', 'week']


In [3]:
# Show dtypes and non-null counts for every column of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   user_id             1009 non-null   int64         
 1   region              1009 non-null   object        
 2   device              1009 non-null   object        
 3   channel             1009 non-null   object        
 4   session_start       1009 non-null   datetime64[ns]
 5   session_end         1009 non-null   datetime64[ns]
 6   sessiondurationsec  1009 non-null   int64         
 7   session_date        1009 non-null   datetime64[ns]
 8   month               1009 non-null   int64         
 9   day                 1009 non-null   int64         
 10  hour_of_day         1009 non-null   int64         
 11  order_dt            282 non-null    datetime64[ns]
 12  revenue             1009 non-null   int64         
 13  payment_type        282 non-null    object      

In [35]:
# Variant 1: the data exactly as loaded.
df_unchanged = df.copy()
print(df_unchanged.shape, df_unchanged['payer'].sum())

# Variant 2: same rows, but the two unusual revenue values are substituted
# (100000 -> 9999, 1 -> 4999) in a single mapping.
df_replaced = df.copy()
df_replaced['revenue'] = df_replaced['revenue'].replace({100000: 9999, 1: 4999})
print(df_replaced.shape, df_replaced['payer'].sum())

# Variant 3: rows carrying either unusual revenue value are dropped entirely.
to_remove = df.index[df['revenue'].isin([1, 100000])]
df_removed = df.drop(index=to_remove)
print(df_removed.shape, df_removed['payer'].sum())


# All three variants, in the order they are compared further down.
dfs = [df_unchanged, df_replaced, df_removed]

(1009, 19) 282
(1009, 19) 282
(1003, 19) 276


### `Рассчитать средний чек`

In [36]:
# List the distinct values that occur in the revenue column.
df['revenue'].unique()

array([  9999,      0,   4999,   5999,      1, 100000])

In [37]:
def calculate_average_revenue(df):
    """Print and return per-user revenue metrics for one dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``user_id`` column and a numeric ``revenue`` column.

    Returns
    -------
    tuple[int, int]
        ``(AOV, ARPU)``, each rounded to a whole number. A metric whose
        denominator is zero (no paying users, or no users at all) is
        reported as 0 instead of an overflowed/NaN-derived integer.

    Side effects
    ------------
    Prints the raw totals (revenue, unique users, unique paying users) and
    then the two formatted metrics.
    """
    total_revenue = df['revenue'].sum()

    total_users = df['user_id'].unique().size
    # "Paying" means at least one row with strictly positive revenue.
    total_paid_users = df.loc[df['revenue'] > 0, 'user_id'].unique().size
    print(total_revenue, total_users, total_paid_users)

    def _revenue_per(count):
        # Guard the empty case: dividing by zero would otherwise produce
        # inf/NaN and a meaningless integer after conversion.
        if count == 0:
            return 0
        # Built-in round() uses the same half-to-even rule as NumPy's round
        # and also works when the total is a plain Python number.
        return int(round(float(total_revenue) / count))

    # NOTE(review): the figure labelled AOV divides by *unique paying users*,
    # not by the number of orders, so a user with several orders is counted
    # once. Confirm this is the intended definition of the "average check".
    AOV = _revenue_per(total_paid_users)
    ARPU = _revenue_per(total_users)

    print(f'{AOV=:,.0f}, {ARPU=:,.0f}')
    return AOV, ARPU

# Print the revenue metrics for each dataset collected in `dfs`, in list order.
for df_go in dfs:
    calculate_average_revenue(df_go)

1935726 998 277
AOV=6,988, ARPU=1,940
1585718 998 277
AOV=5,725, ARPU=1,589
1535724 992 271
AOV=5,667, ARPU=1,548


In [15]:
# Total revenue summed over every row of df.
df['revenue'].sum()

np.int64(1935726)

277