# Initial Setup

In [10]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from typing import *

# Notebook-wide pandas display settings: raise the row/column display limits,
# widen the printed output, never truncate cell text, and opt in to the
# future "no silent downcasting" behaviour.
display_options = {
    'display.max_columns': 100,
    'display.max_rows': 100,
    'display.width': 150,
    'display.max_colwidth': None,
    'future.no_silent_downcasting': True,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Apply seaborn's default plotting theme to every figure drawn afterwards.
sns.set()

# Load the dataset; the path is resolved relative to the working directory.
df = pd.read_csv('restart_data.csv')

# Print the (rows, columns) shape, then show the first rows as the cell output.
print(df.shape)
df.head()

(1009, 18)


Unnamed: 0,user_id,region,device,channel,session_start,session_end,sessiondurationsec,session_date,month,day,hour_of_day,order_dt,revenue,payment_type,promo_code,final_price,time_of_day,payer
0,529697267522,United States,iPhone,социальные сети,2019-05-01 00:06:40,2019-05-01 00:07:06,26.0,2019-05-01,5,3,0,2019-05-01 00:06:40,9999.0,Mobile payments,0.0,9999.0,night,1
1,601292388085,United States,PC,organic,2019-05-01 06:56:16,2019-05-01 07:09:18,782.0,2019-05-01,5,3,7,,,,,,morning,0
2,852898876338,United States,Mac,социальные сети,2019-05-01 04:30:45,2019-05-01 04:34:56,251.0,2019-05-01,5,3,4,,,,,,night,0
3,998513020664,United States,iPhone,социальные сети,2019-05-01 18:53:42,2019-05-01 18:57:35,233.0,2019-05-01,5,3,18,,,,,,evening,0
4,240702200943,United States,Mac,социальные сети,2019-05-02 14:04:32,2019-05-02 14:09:51,319.0,2019-05-02,5,4,14,,,,,,day,0


In [11]:
# Explicit target dtype for each column that is cast below, listed as
# (column, dtype) pairs. The timestamp-like columns go to datetime64[ns];
# any column not listed here keeps the dtype it already has.
dtype_mapping = dict([
    ('user_id', 'int64'),
    ('region', 'object'),
    ('device', 'object'),
    ('channel', 'object'),
    ('session_start', 'datetime64[ns]'),
    ('session_end', 'datetime64[ns]'),
    ('sessiondurationsec', 'float64'),
    ('session_date', 'datetime64[ns]'),
    ('month', 'int64'),
    ('day', 'int64'),
    ('hour_of_day', 'int64'),
    ('order_dt', 'datetime64[ns]'),
    ('revenue', 'float64'),
    ('payment_type', 'object'),
    ('promo_code', 'float64'),
])

# Cast every listed column in one pass and rebind df to the converted frame.
df = df.astype(dtype_mapping)

In [12]:
# Data completeness by month: one row per (month, session_date) pair present
# in the data, with the number of rows (visits) recorded for that day.
daily_data = (
    df.groupby(['month', 'session_date'])
    .size()
    .reset_index(name='visits')
)

# Number of distinct dates that have at least one row, per month.
days_per_month = daily_data.groupby('month')['session_date'].count()
print("Days with data per month:")
print(days_per_month)

# Earliest and latest session_date observed in each month.
month_date_range = df.groupby('month')['session_date'].agg(['min', 'max'])
print("\nFirst and last date for each month:")
print(month_date_range)

# Per-day count of non-null revenue values and total revenue, kept so the
# launch month can be compared with later months.
daily_sales = (
    df.groupby(['month', 'session_date'])['revenue']
    .agg(['count', 'sum'])
    .reset_index()
)

Days with data per month:
month
5     31
6     30
7     31
8     30
9     30
10    31
Name: session_date, dtype: int64

First and last date for each month:
             min        max
month                      
5     2019-05-01 2019-05-31
6     2019-06-01 2019-06-30
7     2019-07-01 2019-07-31
8     2019-08-01 2019-08-31
9     2019-09-01 2019-09-30
10    2019-10-01 2019-10-31


In [13]:
# Tag every row with the ISO week number of its session date so the launch
# can be followed week by week. This adds a 'week' column to df.
iso_calendar = df['session_date'].dt.isocalendar()
df['week'] = iso_calendar.week

# The weekly and the daily view share one aggregation: non-null revenue
# count, mean and total revenue, and the number of distinct users.
sales_agg_spec = {
    'revenue': ['count', 'mean', 'sum'],
    'user_id': 'nunique',
}

# Weekly metrics, grouped by month as well so a week that straddles two
# months shows up once per month.
weekly_metrics = df.groupby(['month', 'week']).agg(sales_agg_spec).round(2)
print("Weekly metrics:")
print(weekly_metrics)

# Same metrics per calendar day, restricted to May (month == 5).
may_rows = df.loc[df['month'] == 5]
may_sales = may_rows.groupby('session_date').agg(sales_agg_spec).round(2)
print("\nMay daily sales:")
print(may_sales)

Weekly metrics:
           revenue                     user_id
             count      mean       sum nunique
month week                                    
5     18         1   9999.00    9999.0      20
      19         5   4999.00   24995.0      29
      20        10   4999.40   49994.0      37
      21        12  37666.00  451992.0      39
      22         4   6249.00   24996.0      21
6     22         5   6399.00   31995.0      11
      23         8   5874.00   46992.0      32
      24        14   5641.86   78986.0      26
      25         8   4999.00   39992.0      35
      26        14   5570.43   77986.0      50
7     27        13   5537.46   71987.0      41
      28        11   5635.36   61989.0      33
      29         8   5124.00   40992.0      25
      30         3   5332.33   15997.0      29
      31         8   5249.00   41992.0      19
8     31         9   5332.33   47991.0      23
      32        22   5726.27  125978.0      45
      33        23   5738.13  131977.0      

In [14]:
# Weekly patterns of key metrics. The channel aggregator is deliberately kept
# as a lambda: pandas labels the resulting column '<lambda>', and a named
# function would change that label.
weekly_agg_spec = {
    # sales metrics: non-null count, mean, spread and total
    'revenue': ['count', 'mean', 'std', 'sum'],
    # unique users
    'user_id': 'nunique',
    # promo usage (mean of the promo_code column)
    'promo_code': 'mean',
    # dominant channel: the first entry of value_counts(), i.e. the most frequent
    'channel': lambda channels: channels.value_counts().index[0],
}
weekly_detailed = df.groupby(['month', 'week']).agg(weekly_agg_spec).round(2)

print("Weekly detailed metrics:")
print(weekly_detailed)

# Distribution of high-value sales: per (month, week), count the rows whose
# revenue exceeds 9000. Missing revenue compares as False, so it is not counted.
revenue_by_week = df.groupby(['month', 'week'])['revenue']
value_dist = revenue_by_week.apply(
    lambda revenue: revenue.gt(9000).sum()
).reset_index(name='high_value_sales')

# Only show the weeks that actually contain a high-value sale.
has_high_value = value_dist['high_value_sales'] > 0
print("\nHigh-value sales distribution:")
print(value_dist.loc[has_high_value])

Weekly detailed metrics:
           revenue                               user_id promo_code              channel
             count      mean       std       sum nunique       mean             <lambda>
month week                                                                              
5     18         1   9999.00       NaN    9999.0      20       0.00      социальные сети
      19         5   4999.00      0.00   24995.0      29       0.40              organic
      20        10   4999.40   2980.68   49994.0      37       0.30              organic
      21        12  37666.00  46070.27  451992.0      39       0.58              organic
      22         4   6249.00   2500.00   24996.0      21       0.50              organic
6     22         5   6399.00   2073.64   31995.0      11       0.60              organic
      23         8   5874.00   1726.89   46992.0      32       0.00      социальные сети
      24        14   5641.86   1336.31   78986.0      26       0.50              orga

In [15]:
# List the distinct revenue values (NaN included) to spot unexpected price points.
df.loc[:, 'revenue'].unique()

array([9.999e+03,       nan, 4.999e+03, 5.999e+03, 1.000e+00, 1.000e+05])

In [18]:
# Inspect the rows carrying the two outlier revenue values (1 and 100,000).
# The heading previously mentioned only 100,000 although the filter also
# selects revenue == 1, so the printed label now names both values.
anomalous_revenue_values = [100000, 1]
print("Records with anomalous revenue (1 or 100,000):")
print(df[df['revenue'].isin(anomalous_revenue_values)])

Records with 100,000 revenue:
         user_id         region  device             channel       session_start         session_end  sessiondurationsec session_date  month  day  \
55  649969085224  United States      PC             organic 2019-05-15 23:16:53 2019-05-15 23:28:04               671.0   2019-05-15      5    3   
56  777891466332  United States  iPhone     социальные сети 2019-05-15 15:52:06 2019-05-15 16:18:32              1586.0   2019-05-15      5    3   
94   94160026069         France     Mac  реклама у блогеров 2019-05-21 08:17:29 2019-05-21 08:24:15               406.0   2019-05-21      5    2   
95  422703292792         France      PC             organic 2019-05-21 14:19:29 2019-05-21 14:48:00              1711.0   2019-05-21      5    2   
96  822055781011  United States  iPhone  реклама у блогеров 2019-05-21 05:50:33 2019-05-21 06:17:17              1604.0   2019-05-21      5    2   
99  744763558101             UK      PC             organic 2019-05-22 04:11:46 20