In [1]:
import pandas as pd
from datetime import datetime

In [2]:
df = pd.read_csv('KC_case_data.csv', parse_dates=['date'])

In [3]:
df.head()

Unnamed: 0,date,event,purchase_sum,os_name,device_id,gender,city,utm_source
0,2020-01-01,app_start,,android,669460,female,Moscow,-
1,2020-01-01,app_start,,ios,833621,male,Moscow,vk_ads
2,2020-01-01,app_start,,android,1579237,male,Saint-Petersburg,referal
3,2020-01-01,app_start,,android,1737182,female,Moscow,facebook_ads
4,2020-01-01,app_start,,ios,4029024,female,Moscow,facebook_ads


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2747968 entries, 0 to 2747967
Data columns (total 8 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   event         object        
 2   purchase_sum  float64       
 3   os_name       object        
 4   device_id     int64         
 5   gender        object        
 6   city          object        
 7   utm_source    object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 167.7+ MB


### 1. MAU Февраль

In [5]:
#MAU февраля
df['month'] = pd.DatetimeIndex(df['date']). month
df.groupby('month', as_index=False).agg({'device_id':'nunique'})

Unnamed: 0,month,device_id
0,1,99161
1,2,75032
2,3,74623


# 2. Кол-во установок в январе

In [6]:
df.query("event == 'app_install'").groupby('month', as_index=False).agg({'device_id':'count'})

Unnamed: 0,month,device_id
0,1,80297
1,2,38078
2,3,36222


# 3. Найти когорту с наибольшей конверсией в покупку в течение 7 дней

In [7]:
#дата установки by device
date_install = df.query("event == 'app_install'") \
                .groupby('device_id', as_index=False) \
                .agg({'date':'min'}) \
                .rename(columns={'date':'date_install'})

#дата покупки by device
date_purch = df.query("event == 'purchase'") \
                .groupby('device_id', as_index=False) \
                .agg({'date':'min'}) \
                .rename(columns={'date':'date_purchase'})

#объединяем в одну таблицу
groups = date_install.merge(date_purch, how='left', on='device_id')

#ДФ с кол-вом установок по дням
groups_by_day = groups.groupby('date_install', as_index=False)\
                        .agg({'device_id':'count'}) \
                        .rename(columns={'device_id':'installs'})

#находим разницу между днем покупки и установки 
groups['days_left'] = groups.date_purchase - groups.date_install
groups['days_left'] = groups['days_left'].apply(lambda x: x.days) #преобразование формата (с '30 days' на '30')

#ДФ с кол-вом покупок в течение 7 дней после установки
groups_purchase = groups.query("days_left < 8") \
                        .groupby('date_install', as_index=False) \
                        .agg({'device_id':'count'}) \
                        .rename(columns={'device_id':'purch_count'})

#объединили 2 ДФ
groups_by_day = groups_by_day.merge(groups_purchase)
groups_by_day['conversion_untill_7'] = round(groups_by_day.purch_count / groups_by_day.installs , 3)
groups_by_day.sort_values('conversion_untill_7', ascending=False)


Unnamed: 0,date_install,installs,purch_count,conversion_untill_7
0,2020-01-01,3579,1408,0.393
8,2020-01-09,1424,558,0.392
14,2020-01-15,4310,1650,0.383
13,2020-01-14,5173,1973,0.381
1,2020-01-02,3144,1186,0.377
...,...,...,...,...
77,2020-03-18,1171,235,0.201
88,2020-03-29,1117,223,0.200
81,2020-03-22,1261,251,0.199
87,2020-03-28,1091,209,0.192


### 4. Канал, который принес больше новых пользователей

In [8]:
df.query("event == 'app_install'")\
    .groupby('utm_source', as_index=False)\
    .agg({'device_id':'nunique'})\
    .rename(columns={'device_id':'unique_devices_count'})\
    .sort_values('unique_devices_count')

Unnamed: 0,utm_source,unique_devices_count
4,referal,9282
1,facebook_ads,13916
3,instagram_ads,20096
5,vk_ads,23189
2,google_ads,26286
6,yandex-direct,29368
0,-,32460


### 5. На каком этапе воронки отваливается бОльшая часть клиентов.

In [9]:
#оставим необходимые столбцы от основного ДФ
df_3 = df[['date','event','device_id']].query("event in ['search', 'choose_item', 'tap_basket', 'register']")

In [10]:
#ДФ с датой регистрации девайсов
devices_registred = df_3.query("event == 'register'")\
                        .groupby('device_id', as_index=False)\
                        .agg({'date':'min'})\
                        .rename(columns={'date':'reg_date'})
                           
df_3 = df_3.merge(devices_registred, on='device_id', how='left')
df_3

Unnamed: 0,date,event,device_id,reg_date
0,2020-01-01,choose_item,294193,2020-01-01
1,2020-01-01,choose_item,8658257,2020-01-01
2,2020-01-01,choose_item,10345186,2020-01-01
3,2020-01-01,choose_item,20209604,2020-01-01
4,2020-01-01,choose_item,22449838,2020-01-01
...,...,...,...,...
1703278,2020-03-31,register,2984778,2020-03-31
1703279,2020-03-31,register,27301864,2020-03-31
1703280,2020-03-31,register,1294285,2020-03-31
1703281,2020-03-31,register,3010574,2020-03-31


In [11]:
#дата регистрации < даты совершения события, то пользователь уже зарегистрирован

devices_reg = df_3.query("reg_date < date") #зарегистрирован
devices_unreg = df_3.query("reg_date >= date") #незарегистрирован

In [12]:
#рассчитываем воронку зарегистрированных пользователей
reg_funnel = devices_reg.groupby('event', as_index=False)\
                        .agg({'device_id':'count'})\
                        .rename(columns={'device_id':'devices_count'})\
                        .sort_values('devices_count')

reg_funnel['conversion'] = round(reg_funnel['devices_count'] / devices_reg['device_id'].count(),2)
reg_funnel

Unnamed: 0,event,devices_count,conversion
2,tap_basket,217256,0.23
0,choose_item,314473,0.33
1,search,417101,0.44


In [13]:
#рассчитываем воронку НЕзарегистрированных пользователей

unreg_funnel = devices_unreg.groupby('event', as_index=False)\
                        .agg({'device_id':'count'})\
                        .rename(columns={'device_id':'devices_count'})\
                        .sort_values('devices_count')

unreg_funnel['conversion'] = round(unreg_funnel['devices_count'] / devices_unreg['device_id'].count(),2)
unreg_funnel

Unnamed: 0,event,devices_count,conversion
1,register,78310,0.17
3,tap_basket,105031,0.23
0,choose_item,125994,0.28
2,search,148170,0.32


### 6. Пользователи, пришедшие с каких каналов, показали самую низкую конверсию в первую покупку?

In [14]:
df.head()
df_6 = df[['date','event','device_id','utm_source']]

In [15]:
first_purchase = df_6.query("event == 'purchase'").groupby('device_id', as_index=False).agg({'date':'min'})
first_start = df_6.query("event == 'app_start'").groupby('device_id', as_index=False).agg({'date':'min'})
first_start

Unnamed: 0,device_id,date
0,4013,2020-01-15
1,4014,2020-01-02
2,4016,2020-01-04
3,4018,2020-03-27
4,4046,2020-01-04
...,...,...
190879,35379281,2020-03-29
190880,35380796,2020-03-31
190881,35381595,2020-03-30
190882,35388218,2020-03-31


In [16]:
first_purchase = first_purchase.merge(df_6, on ='device_id', how='right')

In [17]:
first_purchase

Unnamed: 0,device_id,date_x,date_y,event,utm_source
0,669460,2020-01-07,2020-01-01,app_start,-
1,669460,2020-01-07,2020-01-01,search,-
2,669460,2020-01-07,2020-01-03,app_start,-
3,669460,2020-01-07,2020-01-03,choose_item,-
4,669460,2020-01-07,2020-01-03,search,-
...,...,...,...,...,...
2747963,29262521,NaT,2020-03-31,app_start,yandex-direct
2747964,29262521,NaT,2020-03-31,choose_item,yandex-direct
2747965,29262521,NaT,2020-03-31,search,yandex-direct
2747966,29262521,NaT,2020-03-31,tap_basket,yandex-direct
