# Project Settings

## Import libraries

In [None]:
import os
import datetime as dt
from operator import attrgetter

import helpers

# visualization
import matplotlib.pyplot as plt
# Optional - for a dark theme of graphs run the next line
plt.style.use("dark_background")

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

# to print all the outputs in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from scipy import stats
from scipy.stats import mode

# to show warnings only once:
import warnings; warnings.filterwarnings(action='once')

## Some additional settings

In [None]:
# Wider console output and longer cell contents before pandas truncates them.
# Full option names are used throughout: pandas resolves abbreviated names by
# pattern matching, which can break once a similarly named option is added.
pd.set_option('display.width', 1200)
pd.set_option('display.max_colwidth', 500)

# Cap how many columns and rows are printed for a DataFrame.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

# settings for Vizs
large = 22
med = 16
small = 12
params = {
    'axes.titlesize': large,
    'legend.fontsize': med,
    'figure.figsize': (17, 10),
    'axes.labelsize': med,
    'axes.titlesize': med,
    'xtick.labelsize': med,
    'ytick.labelsize': med,
    'figure.titlesize': large
}
plt.rcParams.update(params)
sns.set_style("white")
%matplotlib inline

In [None]:
# Template for figures that are not built through Plotly Express.
pio.templates.default = "simple_white"

# Plotly Express defaults shared by every px.* chart below.
px.defaults.template = "plotly_dark"
px.defaults.color_continuous_scale = px.colors.sequential.Blackbody
px.defaults.width = 800
px.defaults.height = 500
# The palette used to be assigned only to a plain variable, which Plotly
# Express never reads. The variable is kept for any later use and is now also
# registered as the px default so that it actually takes effect.
color_discrete_sequence = px.colors.sequential.Oryel
px.defaults.color_discrete_sequence = color_discrete_sequence

## Download data

In [None]:
# Folder that holds the pre-processed CSV extracts
source_dir = "./data/processed/"

# Names of the input files inside `source_dir`
file_users, file_costs = "users_processed.csv", "costs_processed.csv"

In [None]:
# Read the users extract: the three timestamp columns are parsed on load,
# `device` is stored as a categorical and `source_id` as an integer.
df_users = pd.read_csv(
    os.path.join(source_dir, file_users),
    # nrows=500,
    parse_dates=['session_start_ts', 'session_end_ts', 'buy_ts'],
    dtype={'device': 'category', 'source_id': 'int'},
)

In [None]:
df_users.info()

## Step 1. Add additional variables to the model of user dataset

### Variables based on visits, registrations, and buying activities

*Add `first_session_ts`, and `first_visit_date`*

*Let's assume that the first `session_start_ts` is the first user's visit and registration datetime*

In [None]:
# Earliest session start per user, one row per uid
_users = (
    df_users.groupby('uid')['session_start_ts']
    .min()
    .rename('first_session_ts')
    .reset_index()
)

In [None]:
# Attach each user's first session timestamp to every one of their rows
df_users = pd.merge(df_users, _users, on='uid', how='left')

*Add the visit date based on `session_start_ts` and the first visit date based on `first_session_ts`*

In [None]:
# Day-level columns derived from the session timestamps via the project helper.
# NOTE(review): helpers.get_day is defined outside this file; presumably it
# reduces a timestamp column to its calendar day (later cells use the `.dt`
# accessor on `visit_date`) — confirm.
df_users['visit_date'] = helpers.get_day(df_users, 'session_start_ts')
df_users['first_visit_date'] = helpers.get_day(df_users, 'first_session_ts')

*Add `first_buy_ts` (the first purchase timestamp) and the variables based on it*

In [None]:
# Earliest order timestamp per user (NaT for users who never ordered)
_buyers = (
    df_users.groupby('uid')['buy_ts']
    .min()
    .rename('first_buy_ts')
    .reset_index()
)

In [None]:
# Attach each user's first order timestamp to every one of their rows
df_users = pd.merge(df_users, _buyers, on='uid', how='left')

In [None]:
# Day-level columns derived from the order timestamps via the project helper.
# NOTE(review): helpers.get_day is defined outside this file; presumably it
# reduces a timestamp column to its calendar day — confirm.
df_users['order_date'] = helpers.get_day(df_users, 'buy_ts')
df_users['first_order_date'] = helpers.get_day(df_users, 'first_buy_ts')

### Add indicators (flags) based on the visits, first purchases

In [None]:
# Session-level 0/1 indicators.
# 1 on the session whose start equals the user's first session start
df_users['is_new_user'] = df_users['first_session_ts'].eq(df_users['session_start_ts']).astype('int64')
# 1 on the session whose order timestamp equals the user's first order timestamp
df_users['is_new_buyer'] = df_users['first_buy_ts'].eq(df_users['buy_ts']).astype('int64')
# 1 whenever the session carries an order timestamp
df_users['is_bought'] = df_users['buy_ts'].notna().astype('int64')

*Test Sample*

In [None]:
df_users[df_users['uid'] == 618178059910673496].sort_values(by='session_start_ts')
# df_users[df_users['uid'] == 6444568725642094739].sort_values(by='session_start_ts')

### Variables based on session duration

In [None]:
df_users['session_duration_sec'].describe()

df_users['session_duration_sec'].mode()

In [None]:
# plt.rcParams['figure.facecolor'] = 'gray'
plt.style.use('dark_background')
# plt.style.use('fivethirtyeight')
# plt.style.use('bmh')

df_users['session_duration_sec'].hist()

*Add baseline categories of session duration*

In [None]:
# Left-closed duration bins in seconds, shared by the interval column and the
# human-readable label column.
_duration_edges = [0, 60.0, 240.0, 444.16, 660.0, float("inf")]
_duration_names = [
    'up to 1 minute',
    'between 1 and 4 minutes',
    'between 4 and 7.4 minutes',
    'between 7.4 and 10 minutes',
    'more than 10 minutes',
]

df_users['session_dur_category'] = pd.cut(
    df_users['session_duration_sec'], bins=_duration_edges, right=False
)
df_users['session_category_name'] = pd.cut(
    df_users['session_duration_sec'], bins=_duration_edges, right=False, labels=_duration_names
)

*Calculate the time difference between the first visit and the first purchase*

In [None]:
# Delay between a user's first session and first order, then a quick look at
# its distribution (histogram + summary statistics).
# NOTE(review): helpers.time_diff is defined outside this file; presumably it
# returns (first argument - second argument) in the requested unit — confirm.
# NOTE(review): the column name spells "puchase"; later cells reference it by
# this exact name, so it must be renamed everywhere at once or not at all.
df_users['time_diff_1st_visit_1st_puchase_sec'] = helpers.time_diff(df_users['first_buy_ts'], df_users['first_session_ts'], 'seconds')
df_users['time_diff_1st_visit_1st_puchase_sec'].hist()
df_users['time_diff_1st_visit_1st_puchase_sec'].describe()

In [None]:
df_users.sample(10)

### Variables based on the revenue distribution

In [None]:
df_users['revenue'].describe()
df_users['revenue'].mode()

df_users['revenue'].hist()

In [None]:
df_users[df_users['revenue']>0]['revenue'].describe()
df_users[df_users['revenue']>0]['revenue'].mode()

In [None]:
df_users[df_users['revenue']>0]['revenue'].hist()

In [None]:
# Overall mean revenue (3 decimals), broadcast to every row
df_users['revenue_mean'] = round(df_users['revenue'].mean(), 3)

# Right-closed revenue bins; the first bin collects everything up to and including 0
_revenue_edges = [-np.inf, 0.0, 1.22, 1.83, 2.44, 4.28, np.inf]
df_users['revenue_category'] = pd.cut(df_users['revenue'], bins=_revenue_edges, right=True)


In [None]:
df_users[df_users['uid'] == 618178059910673496].sort_values(by='session_start_ts')

## Step 2. Make reports and calculate metrics.
Plot graphs to display how these metrics differ for various devices and ad sources and how they change in time.

### 2.1 Create analytical table based on user daily activities by aggregating the `df_users`

In [None]:
# list(df_users.columns)

In [None]:
# One row per visit date: unique users, session/order counts, revenue and
# duration totals, flag totals and the mean first-visit-to-first-purchase delay.
_daily_aggregations = {
    'uid': 'nunique',
    'session_start_ts': 'count',
    'buy_ts': 'count',
    'revenue': 'sum',
    'session_duration_sec': 'sum',
    'is_new_user': 'sum',
    'is_new_buyer': 'sum',
    'is_bought': 'sum',
    'time_diff_1st_visit_1st_puchase_sec': 'mean',
}
at_users_daily = (
    df_users.groupby(['visit_date'])
    .agg(_daily_aggregations)
    .reset_index()
    .sort_values(by=['visit_date'])
)

In [None]:
at_users_daily.sample()

In [None]:
# Give the aggregated columns metric-style names
_daily_column_names = {
    'uid': 'dau',
    'session_start_ts': 'sessions_count',
    'buy_ts': 'orders_count',
    'revenue': 'revenue_sum',
    'session_duration_sec': 'session_duration_sec_sum',
    'is_new_user': 'is_new_user_sum',
    'is_new_buyer': 'is_new_buyer_sum',
    'is_bought': 'n_buyers',
    'time_diff_1st_visit_1st_puchase_sec': 'diff_1st_visit_1st_purchase_mean_sec',
}
at_users_daily = at_users_daily.rename(columns=_daily_column_names)


In [None]:
at_users_daily.info()

Test control data after merging

In [None]:
df_users['revenue'].sum()
at_users_daily['revenue_sum'].sum()

df_users['session_start_ts'].count()
at_users_daily['sessions_count'].sum()

In [None]:
# Round the mean delay to whole seconds (the column stays float)
at_users_daily = at_users_daily.round({'diff_1st_visit_1st_purchase_mean_sec': 0})

*Add variables weekday, dayofweek, month and year based on the date*

In [None]:
# Calendar attributes derived from the daily visit date.
at_users_daily['visit_weekday'] = at_users_daily['visit_date'].dt.day_name()
at_users_daily['weekday_num'] = at_users_daily['visit_date'].dt.dayofweek

# ISO week number and the start timestamp of the weekly period containing the date
at_users_daily['visit_week_num'] = at_users_daily['visit_date'].dt.isocalendar().week
at_users_daily['week_start_monday'] = at_users_daily["visit_date"].dt.to_period('W').dt.start_time

at_users_daily['visit_month'] = helpers.get_month(at_users_daily['visit_date'])
# Calendar year rather than ISO year: `visit_year` is grouped together with
# the calendar month, and the ISO year differs from the calendar year for
# dates around New Year (e.g. 2018-12-31 belongs to ISO year 2019).
at_users_daily['visit_year'] = at_users_daily['visit_date'].dt.year

In [None]:
at_users_daily

Add variables to analytical table

In [None]:
# Daily sessions and orders per active user, rounded to 2 decimals
at_users_daily['num_sessions_per_user_mean'] = (
    at_users_daily['sessions_count'].div(at_users_daily['dau']).round(2)
)
at_users_daily['num_orders_per_user_mean'] = (
    at_users_daily['orders_count'].div(at_users_daily['dau']).round(2)
)

In [None]:
at_users_daily.head()

### 2.2 Product metrics

User engagement metric: DAU, WAU, MAU

`DAU - the number of daily active unique users`

`A mean DAU`

In [None]:
at_users_daily['dau'].mean().astype(int)

In [None]:
# DAU over time; axis titles are blanked because the chart title names the metric.
fig = px.line(
    at_users_daily,
    x='visit_date',
    y='dau',
    title='DAU - the number of daily active unique users',
    labels={'visit_date': '', 'dau': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)

`DAU vs New users`

In [None]:
# DAU and new users on one chart. With a list passed as `y`, Plotly Express
# treats the data as wide-form and names the y-axis 'value', so that is the
# key to blank out (the previous 'dau' key matched nothing).
fig = px.line(
    at_users_daily,
    x='visit_date',
    y=['dau', 'is_new_user_sum'],
    title='DAU vs New users',
    labels={'visit_date': '', 'value': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)

> DAU metric

- As we can see from the DAU metric, the daily amount of visitors does not remain constant over the year
- Average daily visits - **930** users.
- In addition, the number of active (unique) users rose sharply on November 24, 2017, it was a Black Friday.
- Visits also dropped suddenly and significantly on March 31, 2018. It can be assumed that this happened for external reasons. For example, the [football match "Spartak Moscow" - "Tosno" was held on March 31, 2018](https://www.eurosport.com/football/russian-premier-league/2017-2018/live-spartak-moscow-fc-tosno_mtc956622/live.shtml).
- It also seems like there is a weekly sharp drop that depends on the day of the week.

`WAU - the number of weekly active unique users`

In [None]:

# Calendar attributes derived from each session's visit date.
df_users['visit_weekday'] = df_users['visit_date'].dt.day_name()
df_users['weekday_num'] = df_users['visit_date'].dt.dayofweek

# ISO week number and the start timestamp of the weekly period containing the date
df_users['visit_week_num'] = df_users['visit_date'].dt.isocalendar().week
df_users['week_start_monday'] = df_users["visit_date"].dt.to_period('W').dt.start_time

df_users['visit_month'] = helpers.get_month(df_users['visit_date'])
# Calendar year rather than ISO year: `visit_year` is grouped together with
# the calendar month (e.g. for MAU), and the ISO year differs from the calendar
# year for dates around New Year (e.g. 2018-12-31 belongs to ISO year 2019).
df_users['visit_year'] = df_users['visit_date'].dt.year

In [None]:
# Unique users per week, then the average WAU truncated to an integer
wau = (
    df_users.groupby('week_start_monday')
    .agg(wau=('uid', 'nunique'))
    .reset_index()
)
wau['wau'].mean().astype(int)

In [None]:
# Weekly active users as bars; axis titles are blanked
fig = px.bar(
    data_frame=wau,
    x='week_start_monday',
    y='wau',
    title='WAU - Weekly active users',
    labels=dict.fromkeys(['wau', 'week_start_monday'], ''),
)
fig.update_yaxes(showgrid=False, showline=False)

In [None]:
# Unique users and session counts for every (year, week, weekday) combination
_visit_weekday = (
    df_users
    .groupby(['visit_year', 'week_start_monday', 'visit_weekday', 'weekday_num'])
    .agg(
        nunique_users=('uid', 'nunique'),
        num_sessions=('session_start_ts', 'count'),
    )
    .reset_index()
    .sort_values(by=['weekday_num'])
)
_visit_weekday.head()

In [None]:
# Stacked area of unique users per week, split by weekday
fig = px.area(
    _visit_weekday,
    x='week_start_monday',
    y='nunique_users',
    color='visit_weekday',
    title='WAU - Weekly active users by visit weekday',
    labels={'nunique_users': '', 'visit_weekday': '', 'week_start_monday': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.area figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)


In [None]:
# Mean number of unique users per weekday and year, truncated to an integer
_tmp_visit_weekday = (
    _visit_weekday
    .groupby(['visit_year', 'visit_weekday', 'weekday_num'])
    .agg(nunique_users_mean=('nunique_users', 'mean'))
    .astype({'nunique_users_mean': int})
    .reset_index()
    .sort_values(by=['visit_year', 'weekday_num'])
)
_tmp_visit_weekday

In [None]:
# Grouped bars: mean unique users per weekday, one bar colour per year
fig = px.bar(
    data_frame=_tmp_visit_weekday,
    x='visit_weekday',
    y='nunique_users_mean',
    color='visit_year',
    barmode="group",
    text='nunique_users_mean',
    title='Mean of active users by visit weekday',
    labels=dict.fromkeys(['nunique_users_mean', 'visit_weekday', 'visit_year'], ''),
)
fig.update_yaxes(showgrid=False)

# fig.show()

> WAU metric
 
- Average weekly visits: 5807 users

- User activities depend on the day of the week

`MAU - Monthly visits`

In [None]:
# Unique users per (year, month), the average MAU truncated to an integer,
# and the monthly table itself
mau = (
    df_users.groupby(['visit_year', 'visit_month'])
    .agg(mau=('uid', 'nunique'))
    .reset_index()
)
mau['mau'].mean().astype(int)
mau

In [None]:
# Monthly active users as bars coloured by year. The y column is 'mau', so
# that is the label key to blank (the previous 'dau' key matched nothing and
# left the axis titled "mau").
fig = px.bar(
    mau,
    x='visit_month',
    y='mau',
    title='MAU - Monthly active users',
    labels={'mau': '', 'visit_month': '', 'visit_year': ''},
    text='mau',
    color='visit_year',
)
fig.update_yaxes(showgrid=False).update_traces(textangle=1, selector=dict(type='bar'), textfont_color='white')
# fig.show()

*MAU - Average monthly visits - 23469 users.*

>**Conclusion**

- Daily, weekly, and monthly user activities are 930, 5807, and 23469 respectively.

- There are dependencies of user activity on the day of the week, and on the month of year.

- Users were the most active on the 5th day of the week, and in November and December.

- Users were less active on weekends, as well as in the summer months.

- User visits are influenced by external factors such as international holidays and local events.

`User stickiness` - `DAU/WAU/MAU Ratio`

In [None]:
# Stickiness ratios in percent, each computed from the period averages

# DAU/WAU ratio
round(at_users_daily['dau'].mean() / wau['wau'].mean() * 100, 2)

# DAU/MAU ratio
round(at_users_daily['dau'].mean() / mau['mau'].mean() * 100, 2)

# WAU/MAU ratio
round(wau['wau'].mean() / mau['mau'].mean() * 100, 2)

Selecting the Right User Metric:

https://medium.com/sequoia-capital/selecting-the-right-user-metric-de95015aa38

* One important point to remember is that the ratio of daily active users to weekly active users (DAU/WAU) can never be less than 1/7 (14.29%).
* Additionally, the ratio of daily active users to monthly active users (DAU/MAU) can never be lower than 1/28 (3.57%), where MAU is calculated over 28 days.
* Finally, the ratio of weekly active users to monthly active users (WAU/MAU) can never be below 1/4 (25%).

*If the DAU/WAU (daily active users/weekly active users) ratio is at least 60%, it indicates that the product is used more than four days per week, making it a daily usage product.*


So, for this project we recommend monitoring user activity on a monthly basis.

However, business stakeholders are interested in the day-to-day activities of users. That's why we should create an analytical table on a daily basis.

#### 2.2 User's sessions

`How many sessions are there per day?`

(One user might have more than one session.)

In [None]:
at_users_daily['sessions_count'].mean().astype(int)

In [None]:
# Number of sessions per day; axis titles are blanked
fig = px.line(
    at_users_daily,
    x='visit_date',
    y='sessions_count',
    title='Number of Daily user sessions',
    labels={'visit_date': '', 'sessions_count': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)

# fig.show()

> *The average number of sessions per day is 1027. It's clear that the total number of sessions increases as the number of users increases, and vice versa.*

`The dynamics of total sessions per day vs the dynamics of DAU`

In [None]:
# Daily sessions and DAU drawn on the same axes for comparison
fig = px.line(
    at_users_daily,
    x='visit_date',
    y=['sessions_count', 'dau'],
    title='Number of daily user sessions vs DAU',
    labels={'visit_date': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)

# fig.show()

> *The line chart shows that the number of sessions per day is almost the same as the number of visits.*

`Mean number of sessions per user`

In [None]:
at_users_daily['num_sessions_per_user_mean'].mean().round(3)

In [None]:
# Average number of sessions per active user, day by day
fig = px.line(
    at_users_daily,
    x='visit_date',
    y='num_sessions_per_user_mean',
    title='Daily average number of sessions per user',
    labels={'visit_date': '', 'num_sessions_per_user_mean': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)
# fig.update_yaxes(showgrid=False, rangemode="tozero")

>*Average daily sessions per user - 1.097*

The daily average number of sessions per user is from 1.03 to 1.27

On March 28 in 2018 the average number of sessions per user equals 1

`Average number of sessions by weekday`

In [None]:
# Mean daily session count per (year, weekday number), truncated to an integer
_tmp = (
    at_users_daily.groupby(['visit_year', 'weekday_num'])['sessions_count']
    .mean()
    .astype(int)
    .rename('sessions_count_mean')
    .reset_index()
    .sort_values(by=['visit_year', 'weekday_num'])
)

In [None]:
# Grouped bars: mean sessions per weekday number, one bar colour per year.
# The y-axis is hidden because each bar carries its value as text.
fig = px.bar(
    data_frame=_tmp,
    x='weekday_num',
    y='sessions_count_mean',
    color='visit_year',
    barmode="group",
    text='sessions_count_mean',
    title='Average number of sessions by weekday',
    labels=dict.fromkeys(['weekday_num', 'visit_year', 'sessions_count_mean'], ''),
)
(
    fig.update_yaxes(visible=False, showgrid=False)
    .update_xaxes(showgrid=False)
    .update_traces(textangle=1, selector=dict(type='bar'), textfont_color='white')
)

# fig.show()

>On weekends, the average number of sessions is less than on weekdays.

In [None]:
# Mean session length per day, truncated to whole seconds
at_users_daily['session_duration_sec_mean'] = (
    at_users_daily['session_duration_sec_sum']
    .div(at_users_daily['sessions_count'])
    .astype(int)
)

In [None]:
at_users_daily.head()

`What is the average session length per period?`

In [None]:
# Mean session length per day, one line per year
fig = px.line(
    at_users_daily,
    x='visit_date',
    y='session_duration_sec_mean',
    color='visit_year',
    title='Average length of sessions over time, seconds',
    labels={'visit_date': '', 'session_duration_sec_mean': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)

`Number of sessions by device`

In [None]:
# Total number of sessions per device
_sessions_by_device = (
    df_users.groupby('device')
    .agg({'session_start_ts': 'count'})
    .astype(int)
    .reset_index()
)
_sessions_by_device

In [None]:
# Bars with the total session count per device, value printed on each bar
fig = px.bar(
    data_frame=_sessions_by_device,
    x='device',
    y='session_start_ts',
    text='session_start_ts',
    title='Total number of sessions by device',
    labels=dict.fromkeys(['device', 'session_start_ts'], ''),
)
fig.show()

`Average number of sessions and session duration by device over time`

In [None]:
# Daily session count and mean session length per device.
# `device` is categorical, so `observed=False` is pinned explicitly: every
# (date, device) combination gets a row even when that device had no sessions
# on that date. The pandas default for `observed` is changing, and relying on
# it would silently alter this table and the charts built from it.
_sessions_by_device_over_time = (
    df_users.groupby(['visit_date', 'device'], observed=False)
    .agg({
        'session_duration_sec': 'mean',
        'session_start_ts': 'count',
    })
    .reset_index()
    .rename(columns={
        'session_start_ts': 'sessions_count',
        'session_duration_sec': 'session_duration_sec_mean',
    })
)
# Combinations without sessions have no mean (NaN); show them as 0 seconds
_sessions_by_device_over_time['session_duration_sec_mean'] = (
    _sessions_by_device_over_time['session_duration_sec_mean'].fillna(0).astype(int)
)


In [None]:
_sessions_by_device_over_time.head()

In [None]:
# Daily number of sessions, one line per device
fig = px.line(
    _sessions_by_device_over_time,
    x='visit_date',
    y='sessions_count',
    color='device',
    title='Number of sessions by device over time',
    labels={'visit_date': '', 'sessions_count': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)

`Average length of sessions by device over time, seconds`

In [None]:
# Daily mean session length in seconds, one line per device
fig = px.line(
    _sessions_by_device_over_time,
    x='visit_date',
    y='session_duration_sec_mean',
    color='device',
    title='Average length of sessions by device over time, seconds',
    labels={'visit_date': '', 'session_duration_sec_mean': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)

* The number of sessions in the "browser" is consistently higher compared to the "mobile" and "unknown" version.
* In addition, it was noticeable that the sessions in the "browser" version lasted about 100 seconds longer than in the "mobile" version.

`Number of sessions and unique users by session duration category name`

In [None]:
# Unique users and session counts per session-duration category.
# The grouping key is categorical (built with pd.cut), so `observed=False` is
# pinned explicitly: empty duration categories keep a zero row regardless of
# the pandas version's default for `observed`.
_session_category_name = (
    df_users.groupby(['session_category_name'], observed=False)
    .agg({
        'uid': 'nunique',
        'session_start_ts': 'count',
    })
    .reset_index()
    .rename(columns={
        'uid': 'nunique_users',
        'session_start_ts': 'sessions_count',
    })
)
_session_category_name

In [None]:
# Sessions vs unique users per duration category, side by side. With a list
# passed as `y`, Plotly Express treats the data as wide-form and names the
# y-axis 'value', so that is the key to blank out (the previous
# 'sessions_count' key matched nothing).
fig = px.bar(
    _session_category_name,
    x='session_category_name',
    y=['sessions_count', 'nunique_users'],
    title='Number of session by number of unique users and session duration category name',
    labels={'value': '', 'session_category_name': ''},
    barmode='group',
)
(fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)
 .update_traces(textangle=1, selector=dict(type='bar'), textfont_color='white')
)

`Average daily sessions by source_id`

In [None]:
# Mean session length (whole seconds) and session count per traffic source
_sessions_source_id = (
    df_users.groupby(['source_id'])
    .agg(
        session_duration_sec_mean=('session_duration_sec', 'mean'),
        sessions_count=('session_start_ts', 'count'),
    )
    .reset_index()
    .sort_values(by='source_id')
)
_sessions_source_id = _sessions_source_id.astype({'session_duration_sec_mean': int})

_sessions_source_id

In [None]:
# Mean session length per source, value printed on each bar
fig = px.bar(
    data_frame=_sessions_source_id,
    x='source_id',
    y='session_duration_sec_mean',
    text='session_duration_sec_mean',
    title='Average session duration by source Id, seconds',
    labels=dict.fromkeys(['source_id', 'session_duration_sec_mean'], ''),
)
(
    fig.update_yaxes(showgrid=False)
    .update_xaxes(showgrid=False)
    .update_traces(textangle=1, selector=dict(type='bar'), textfont_color='white')
)

In [None]:
# Total sessions per source, value printed on each bar
fig = px.bar(
    data_frame=_sessions_source_id,
    x='source_id',
    y='sessions_count',
    text='sessions_count',
    title='Total Sessions by source Id',
    labels=dict.fromkeys(['source_id', 'sessions_count'], ''),
)
(
    fig.update_yaxes(showgrid=False)
    .update_xaxes(showgrid=False)
    .update_traces(textangle=1, selector=dict(type='bar'), textfont_color='white')
)

>Conclusion:

* The average number of sessions per day is 1076. And Average daily sessions per user - 1.097
* Visitors prefer the `browser` version to the `mobile` version.
* The amount of daily sessions differs depending on `source_id`.
* International holidays and local events affect the number of daily sessions as well as user visits.
* 4th and 3rd `source_id` have the most number of sessions for the entire period.
* Average daily session duration of 1st and 3rd `source_id` is noticeably longer.

#### 2.3 Cohorts & Retention

How often do users come back?

To answer this question we need to calculate the retention rate using cohort analysis. 

*We have data for a year, so it makes sense to build cohorts based on monthly user activity.* 

`Cohorts Lifetime` 

https://towardsdatascience.com/a-step-by-step-introduction-to-cohort-analysis-in-python-a2cbbd8460ea

In [None]:
# Month of each order, and the month of the user's first order (their cohort)
df_users['order_month'] = df_users['buy_ts'].dt.to_period('M')
df_users['cohort_month'] = (
    df_users.groupby('uid')['buy_ts']
    .transform('min')
    .dt.to_period('M')
)

In [None]:
df_users.sample(10)

In [None]:
# df_users[df_users['uid'] == 17818797011639758931]

In [None]:
# Unique buyers per (cohort month, order month), plus the number of months
# elapsed between the cohort month and the order month
df_cohort = (
    df_users.groupby(['cohort_month', 'order_month'])['uid']
    .nunique()
    .rename('n_users')
    .reset_index()
)
df_cohort['period_number'] = (
    (df_cohort['order_month'] - df_cohort['cohort_month']).map(attrgetter('n'))
)

In [None]:
df_cohort

In [None]:
# Cohorts as rows, months since first purchase as columns, buyer counts as values
cohort_pivot = pd.pivot_table(
    df_cohort,
    index='cohort_month',
    columns='period_number',
    values='n_users',
)

In [None]:
cohort_pivot

In [None]:

with sns.axes_style("dark"):
    # A single Axes is used: the previous 1x2 grid drew only into its second
    # panel and left the first one empty.
    fig, ax = plt.subplots(figsize=(12, 8))

    # Buyers per cohort and period (absolute counts, not retention shares).
    # fmt='.0f' prints whole numbers; the default '.2g' format shortens
    # counts to two significant digits (e.g. 2e+03).
    sns.heatmap(cohort_pivot,
                mask=cohort_pivot.isnull(),
                annot=True,
                fmt='.0f',
                cmap='RdYlGn',
                ax=ax)
    ax.set_title('Monthly Cohorts: User Buyers', fontsize=16)
    ax.set(xlabel='# of periods',
           ylabel='')

    fig.tight_layout()

In [None]:
# Larger view of the same cohort count matrix with cell borders.
plt.figure(figsize=(20, 13))
plt.title('Cohorts: monthly active users from each cohort')
# fmt='.0f' prints whole numbers; the default '.2g' format shortens counts
# to two significant digits (e.g. 2e+03).
sns.heatmap(cohort_pivot,
            annot=True,
            fmt='.0f',
            linewidths=1,
            linecolor='gray')

`Retention Rate calculation`

To obtain the retention matrix, we need to divide the values in each row by the row's first value, which is the cohort size — all customers who made their first purchase in the given month.

In [None]:
# Period 0 holds each cohort's size; dividing every row by it gives retention shares
cohort_size = cohort_pivot.iloc[:, 0]
retention_matrix = cohort_pivot.div(cohort_size, axis='index').round(2)
retention_matrix

In [None]:
with sns.axes_style("dark"):
    # A single Axes is used: the previous 1x2 grid drew only into its second
    # panel and left the first one empty.
    fig, ax = plt.subplots(figsize=(12, 8))

    # retention matrix, annotated as whole percentages
    sns.heatmap(retention_matrix,
                mask=retention_matrix.isnull(),
                annot=True,
                fmt='.0%',
                cmap='RdYlGn',
                ax=ax)
    ax.set_title('Monthly Cohorts: User Retention', fontsize=16)
    ax.set(xlabel='# of periods',
           ylabel='')

    fig.tight_layout()

>***Conclusion:***
- The first-month retention rate decreases for each new cohort.
- The '09-2017' cohort has the highest retention rate.
- In 2018, the retention rate is significantly lower than in 2017.
- There was no retention in the '05-2018' cohort in the first month.

### 2.4 Sales

 `When do people start buying?`

Let's find the difference between `first_order_date` and `first_visit_date`

In [None]:
# Days between the first visit and the first order, as a float
_first_order_delay = df_users['first_order_date'] - df_users['first_visit_date']
df_users['conversion_day'] = _first_order_delay / pd.Timedelta(days=1)

In [None]:
# Bucket every row by its conversion delay, then split the helper's result
# into a name column and an id column.
# NOTE(review): helpers.conversion_group is defined outside this file; the
# second line assumes it returns a 2-item sequence (name, id) per value — confirm.
df_users['conversion_group'] = df_users['conversion_day'].apply(helpers.conversion_group)
df_users[['conversion_group_name', 'conversion_group_id']] = pd.DataFrame(df_users['conversion_group'].tolist(), index=df_users.index)

`Conversion Rate`

The conversion rate is the share of users who change their status during visit. So, let's calculate it:

In [None]:
# Orders, sessions and unique users per conversion group.
# 'uid' is now aggregated as well: the rename below has always mapped
# 'uid' -> 'n_users', but the column was never produced, so that entry had
# no effect. The conversion formula itself is unchanged.
_conversion_rate = (
    df_users.groupby(['conversion_group_name', 'conversion_group_id'])
    .agg({
        'uid': 'nunique',
        'buy_ts': 'count',
        'session_start_ts': 'count',
    })
    .reset_index()
    .sort_values(by='conversion_group_id', ascending=True)
    .rename(columns={
        'uid': 'n_users',
        'session_start_ts': 'n_sessions',
        'buy_ts': 'n_orders',
    })
)
# Orders per session within each group, in percent
_conversion_rate['conversion_%'] = (_conversion_rate['n_orders'] / _conversion_rate['n_sessions'] * 100).round(2)

_conversion_rate

Why is the conversion rate so high?

This occurs when a visitor’s conversion takes place on your landing page during a time period later than the time of their visit. In this particular case, our analytics will display the visit and the conversion with different timestamps and therefore, for a particular period of time, you will see a conversion rate of over 100%.

https://help.instapage.com/hc/en-us/articles/115010682767-Why-is-my-conversion-rate-over-100-

In [None]:
# Conversion rate per conversion group, value printed on each bar
fig = px.bar(
    data_frame=_conversion_rate,
    x='conversion_group_name',
    y='conversion_%',
    text='conversion_%',
    title='CR - Conversion Rate by categories, %',
    labels=dict.fromkeys(['conversion_%', 'conversion_group_name'], ''),
)
fig.update_yaxes(showgrid=False, showline=False)

`Conversion Rate over time`

In [None]:
# Daily orders per session, in percent (2 decimals)
at_users_daily['conversion_rate_%'] = (
    at_users_daily['orders_count']
    .div(at_users_daily['sessions_count'])
    .mul(100)
    .round(2)
)

In [None]:
avg_daily_conversion_rate = at_users_daily['conversion_rate_%'].mean().round(2)
avg_daily_conversion_rate

In [None]:
# Daily conversion rate with a horizontal line at the overall daily average
fig = px.line(
    at_users_daily,
    x='visit_date',
    y='conversion_rate_%',
    title='Conversion rate % over time',
    labels={'visit_date': '', 'conversion_rate_%': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
(
    fig.add_hline(y=avg_daily_conversion_rate)
    .update_yaxes(showgrid=False)
    .update_xaxes(showgrid=False)
)

# fig.show()

`Average Conversion Rate by weekdays`

In [None]:
# Mean daily conversion rate per (year, weekday), rounded to 2 decimals
_tmp_conversion_weekday = (
    at_users_daily
    .groupby(['visit_year', 'visit_weekday', 'weekday_num'])
    .agg(conversion_rate_mean=('conversion_rate_%', 'mean'))
    .round(2)
    .reset_index()
    .sort_values(by=['visit_year', 'weekday_num'])
)
_tmp_conversion_weekday

In [None]:
# Grouped bars: mean conversion rate per weekday and year, with a horizontal
# line at the overall daily average
fig = px.bar(
    data_frame=_tmp_conversion_weekday,
    x='visit_weekday',
    y='conversion_rate_mean',
    color='visit_year',
    barmode='group',
    text='conversion_rate_mean',
    title='Average Conversion Rate by weekday',
    labels=dict.fromkeys(['visit_weekday', 'conversion_rate_mean'], ''),
)
(
    fig.add_hline(y=avg_daily_conversion_rate)
    .update_yaxes(showgrid=False)
    .update_xaxes(showgrid=False)
    .update_traces(textangle=1, selector=dict(type='bar'), textfont_color='white')
)

>***Conclusion:***
- Most often, users make their first purchase on the day of their first visit. The conversion on the first day of the user visit is almost 65%.
- Over the next 7 days and 30 days, the conversion dropped sharply
- 70.08% of visitors have never bought
- The conversion rate is not significantly affected by weekdays
- On Monday and Thursday the conversion rate drops below the average conversion.


### 2.5 Orders

`Number of Orders vs DAU over time`

In [None]:
# Daily orders and DAU on the same axes. The x column is 'visit_date' (the
# previous 'visit_day' key was a typo and matched nothing), and with a list
# passed as `y` Plotly Express names the y-axis 'value', so those are the
# keys to blank out.
fig = px.line(
    at_users_daily,
    x='visit_date',
    y=['orders_count', 'dau'],
    title='Number of orders vs DAU over time',
    labels={'visit_date': '', 'value': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)

`Average number of orders per user over time`

In [None]:
# Average number of orders per active user, day by day
fig = px.line(
    at_users_daily,
    x='visit_date',
    y='num_orders_per_user_mean',
    title='Average number of orders per user over time',
    labels={'visit_date': '', 'num_orders_per_user_mean': ''},
)
# The former update_traces(..., selector=dict(type='bar')) call was removed:
# a px.line figure has no bar traces, so the call never changed anything.
fig.update_yaxes(showgrid=False).update_xaxes(showgrid=False)

`Average number of orders by weekday`

In [None]:
# Mean number of daily orders within every year/weekday combination,
# ordered by year and then by weekday number.
_tmp_orders_weekday = (
    at_users_daily
    .groupby(['visit_year', 'visit_weekday', 'weekday_num'])
    .agg({'orders_count': 'mean'})
    .reset_index()
    .rename(columns={'orders_count': 'n_orders_mean'})
    .sort_values(by=['visit_year', 'weekday_num'])
)

# Round to the nearest whole order before casting: a bare `astype(int)`
# truncates, so a mean of e.g. 12.9 would be reported as 12.
_tmp_orders_weekday['n_orders_mean'] = (
    _tmp_orders_weekday['n_orders_mean'].round().astype(int)
)
_tmp_orders_weekday

In [None]:
# Grouped bars: mean number of orders per weekday, one colour per year.
fig = px.bar(
    _tmp_orders_weekday,
    x='visit_weekday',
    y='n_orders_mean',
    color='visit_year',
    barmode='group',
    text='n_orders_mean',
    title='Average number of orders by weekday',
    labels={'visit_weekday': '', 'n_orders_mean': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

`Number of orders by device over time`

In [None]:
# Per (year, month, device): distinct visitors and number of orders.
# `count` ignores missing `buy_ts` values, so only rows that carry a
# purchase timestamp contribute to `n_orders`.
_orders_monthly_by_device = (
    df_users
    .groupby(['visit_year', 'visit_month', 'device'])
    .agg({'uid': 'nunique', 'buy_ts': 'count'})
    .reset_index()
    .rename(columns={'uid': 'nunique_users', 'buy_ts': 'n_orders'})
)
_orders_monthly_by_device

In [None]:
# Stacked bars: monthly order counts split by device.
fig = px.bar(
    _orders_monthly_by_device,
    x='visit_month',
    y='n_orders',
    color='device',
    barmode='stack',
    title='Monthly number of orders by device',
    labels={'device': '', 'n_orders': '', 'visit_month': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

`Number of orders by source Id over time`

In [None]:
# Per (year, month, source_id): distinct visitors and number of orders.
# `count` ignores missing `buy_ts` values, so only rows with a purchase are counted.
_orders_monthly_by_sourse_id = (
    df_users
    .groupby(['visit_year', 'visit_month', 'source_id'])
    .agg({'uid': 'nunique', 'buy_ts': 'count'})
    .reset_index()
    .rename(columns={'uid': 'nunique_users', 'buy_ts': 'n_orders'})
)
_orders_monthly_by_sourse_id

In [None]:
# Stacked area chart: monthly order counts split by traffic source.
fig = px.area(
    _orders_monthly_by_sourse_id,
    x='visit_month',
    y='n_orders',
    color='source_id',
    # The title previously said "by device" (copied from the device chart),
    # although the series are coloured by source_id.
    title='Monthly number of orders by source Id',
    labels={'source_id': '', 'n_orders': '', 'visit_month': ''},
)
# The bar selector matches no trace on an area chart; it is kept only for
# parity with the other chart cells.
(
    fig.update_yaxes(showgrid=False)
    .update_xaxes(showgrid=False)
    .update_traces(textangle=1, selector=dict(type='bar'), textfont_color='white')
)

### 2.6 Create Monthly Analytical Table

In [None]:
# list(df_users.columns)

In [None]:
# Monthly analytical table: one row per (year, month) with activity,
# purchase and revenue totals.
at_users_monthly = (
    df_users
    .groupby(['visit_year', 'visit_month'])
    .agg({
        'uid': 'nunique',               # distinct users active in the month
        'session_start_ts': 'count',    # number of sessions
        'session_duration_sec': 'sum',
        'buy_ts': 'count',              # non-missing purchase timestamps
        'is_new_user': 'sum',
        'is_new_buyer': 'sum',
        'revenue': 'sum',
    })
    .reset_index()
    .rename(columns={
        'uid': 'mau',
        'session_start_ts': 'sessions_count',
        'session_duration_sec': 'sessions_duration_total_sec',
        'buy_ts': 'orders_count',
        'is_new_user': 'new_users',
        'is_new_buyer': 'new_buyers',
        'revenue': 'revenue_total',
    })
)
at_users_monthly

Add marketing metrics

Add  `profit` and `profit_cum` variables to monthly analytical table

*There is no margin information in the project description. To complete the task, we assume an average sales margin of 50%.*

In [None]:
# No margin figure is available, so profit is modelled as a flat 50% of revenue.
at_users_monthly['profit'] = at_users_monthly['revenue_total'].mul(0.5).round(2)
# Running total of the monthly profit, accumulated in row order.
at_users_monthly['profit_cum'] = at_users_monthly['profit'].cumsum()

`Monthly Conversion Rate`

In [None]:
# Orders as a percentage of sessions, per month.
at_users_monthly['conversion_rate_%'] = (
    at_users_monthly['orders_count']
    .div(at_users_monthly['sessions_count'])
    .mul(100)
    .round(2)
)

`CLTV - Customer Lifetime Value Metrics`

https://blog.hubspot.com/service/how-to-calculate-customer-lifetime-value

- AOV - Average Order Value
- AOFR - Average Order Frequency Rate
- CV - Customer (User) Value
- ACLS - Average Customer Lifespan
- CLTV - Customer Lifetime Value = Customer Value * Average Customer Lifespan

In [None]:
# AOV: revenue per order.
at_users_monthly['avg_order_value'] = (
    at_users_monthly['revenue_total'].div(at_users_monthly['orders_count']).round(2)
)
# AOFR: orders per monthly active user.
at_users_monthly['avr_order_frequency_rate'] = (
    at_users_monthly['orders_count'].div(at_users_monthly['mau']).round(2)
)
# CV = AOV * AOFR, computed from the two already-rounded columns.
at_users_monthly['customer_value'] = (
    at_users_monthly['avg_order_value']
    .mul(at_users_monthly['avr_order_frequency_rate'])
    .round(2)
)
# NOTE(review): this is revenue per active user, not a duration. Confirm it is
# the intended stand-in for "average customer lifespan" in the CLTV formula.
at_users_monthly['avg_customer_lifetime_span'] = (
    at_users_monthly['revenue_total'].div(at_users_monthly['mau']).round(2)
)
# Running sum over the months of (customer value * lifespan term).
at_users_monthly['avg_customer_ltv_cum'] = (
    at_users_monthly['customer_value']
    .mul(at_users_monthly['avg_customer_lifetime_span'])
    .cumsum()
    .round(2)
)

In [None]:
# Display the monthly table to inspect the profit, conversion and CLTV columns.
at_users_monthly

In [None]:
# Mean of the monthly AOV values, rounded to two decimals.
avg_order_value_mean = at_users_monthly.loc[:, 'avg_order_value'].mean().round(2)
avg_order_value_mean

`Profit vs Profit cumulative`

In [None]:
# Monthly profit next to its running total; the list passed as `y` produces
# one line per column (wide-form input).
fig = px.line(
    at_users_monthly,
    x='visit_month',
    y=['profit', 'profit_cum'],
    title='Profit over time',
    labels={'visit_month': '', 'orders_count': '', 'value': '', 'variable': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

`Number of monthly orders`

In [None]:
# One bar per month, labelled with the order count.
fig = px.bar(
    at_users_monthly,
    x='visit_month',
    y='orders_count',
    text='orders_count',
    title='Monthly number of orders',
    labels={'visit_month': '', 'orders_count': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

`Average Order Value over time`

In [None]:
# Unrounded mean of the monthly AOV values; used as the reference line in the next chart.
avg_order_value_mean = at_users_monthly.loc[:, 'avg_order_value'].mean()
avg_order_value_mean

In [None]:
# Monthly average order value, one labelled bar per month.
fig = px.bar(
    at_users_monthly,
    x='visit_month',
    y='avg_order_value',
    text='avg_order_value',
    title='Average Order Value over time',
    labels={'visit_month': '', 'avg_order_value': ''},
)
# The horizontal line marks the mean of the monthly values.
# Single expression so the figure is rendered once.
(
    fig.add_hline(y=avg_order_value_mean)
    .update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

>Conclusion

- The maximum number of orders per day is 819 and the minimum is 0. From the line graph, we can see that the daily number of orders depends on the DAU.
- It is clear that the more visitors are attracted daily, the more orders we have.
- The number of orders fluctuates based on the month and season.
- According to the data, December 2017 was the most profitable month with a total of 6218 orders.
- The number of orders varies depending on the month.
- At the same time, the monthly average order value is above the overall average in only 3 months: June 2017, July 2017, and April 2018.

### 2.7 Marketing

In [None]:
# Load the processed marketing-costs file; `costs_date` is parsed as a
# datetime while reading.
df_costs = pd.read_csv(
    filepath_or_buffer=os.path.join(source_dir, file_costs),
    parse_dates=['costs_date'],
)

In [None]:
# Quick look at the costs table: schema and dtypes, five random rows,
# and the number of distinct values per column.
df_costs.info()
df_costs.sample(n=5)
df_costs.nunique()

`How much money was spent - Overall?`

In [None]:
# Total spend, followed by the average value of a single cost record.
df_costs.costs.sum()
df_costs.costs.mean().round(2)

# Average spend per date: all records of a date are summed before averaging.
df_costs.groupby('costs_date').costs.sum().mean().round(2)


`How much money was spent - Per source overall?`

In [None]:
# Total and mean cost per traffic source.
_costs_per_source_total = (
    df_costs
    .groupby('source_id')
    .agg({'costs': ['sum', 'mean']})
    .reset_index()
)
# Flatten the two-level header by joining both levels with '_':
# ('costs', 'sum') -> 'costs_sum', and ('source_id', '') -> 'source_id_'.
_costs_per_source_total.columns = [
    '_'.join(levels) for levels in _costs_per_source_total.columns
]
_costs_per_source_total

In [None]:
# Total spend per source; 'source_id_' carries a trailing underscore because
# of the header flattening applied to the aggregated table.
fig = px.bar(
    _costs_per_source_total,
    x='source_id_',
    y='costs_sum',
    text='costs_sum',
    title='Costs per source Id overall',
    labels={'source_id_': '', 'costs_sum': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

`Tracking Daily Expenses Over Time`

In [None]:
# Total spend per date across all sources.
_costs_daily = (
    df_costs
    .groupby('costs_date')
    .agg({'costs': 'sum'})
    .reset_index()
)
_costs_daily.head(5)

In [None]:
# Daily total spend as a time series.
fig = px.line(
    _costs_daily,
    x='costs_date',
    y='costs',
    title='Daily money spent over time',
    labels={'costs_date': '', 'costs': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

`Tracking Monthly Expenses Over Time by Source Id`

In [None]:
# Month key derived by the project helper from the cost date.
df_costs['costs_month'] = helpers.get_month(df_costs.costs_date)
# NOTE(review): this is the ISO-8601 year, which can differ from the calendar
# year for dates in the first or last days of a year. It must match however
# `visit_year` is derived, because both are used as merge keys — confirm.
df_costs['costs_year'] = df_costs.costs_date.dt.isocalendar()['year']

In [None]:
# Total spend per (year, month, source_id).
_costs_monthly_by_source_id = (
    df_costs
    .groupby(['costs_year', 'costs_month', 'source_id'])
    .agg({'costs': 'sum'})
    .reset_index()
)
_costs_monthly_by_source_id

In [None]:
# Stacked area chart: monthly marketing spend split by source.
fig = px.area(
    _costs_monthly_by_source_id,
    x='costs_month',
    y='costs',
    color='source_id',
    title='Monthly Marketing costs over time by source Id',
    labels={'costs_month': '', 'costs': '', 'source_id': 'Source Id'},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

>***Conclusion:***

- There are 7 source IDs.
- Total spend is 329131.62.
- Average daily costs are 904.21
- Average monthly costs are 14790.5 and range from minimum (14790.5) in August 2017 to maximum (30487.6) in January 2018.
- The most expensive `source_id` is the 3rd; the least expensive are the 9th and 10th.

`Monthly acquisition cost by Source Id`

In [None]:
# User activity per (year, month, source_id); named aggregation produces the
# final column names directly.
at_users_monthly_by_source_id = (
    df_users
    .groupby(['visit_year', 'visit_month', 'source_id'])
    .agg(
        nunique_users=('uid', 'nunique'),
        sessions_count=('session_start_ts', 'count'),
        orders_count=('buy_ts', 'count'),     # non-missing purchase timestamps
        revenue_sum=('revenue', 'sum'),
        new_users=('is_new_user', 'sum'),
        new_buyers=('is_new_buyer', 'sum'),
    )
    .reset_index()
)
at_users_monthly_by_source_id.head()


In [None]:
# Attach the monthly spend of each source to the user-activity rows.
# A left join keeps every activity row; rows without a matching cost record
# get missing values in the cost columns.
report_cac = pd.merge(
    at_users_monthly_by_source_id,
    _costs_monthly_by_source_id,
    how='left',
    left_on=['visit_year', 'visit_month', 'source_id'],
    right_on=['costs_year', 'costs_month', 'source_id'],
)

In [None]:
# Spot-check five random rows of the merged activity/costs table.
report_cac.sample(5)

`Calculating CAC - customer acquisition costs`

Customer Acquisition Cost = Cost of Sales and Marketing divided by the Number of New Customers Acquired.

https://blog.hubspot.com/service/what-does-cac-stand-for

In [None]:
# CAC per source and month: spend divided by the number of new users.
# A row with zero new users yields inf (or NaN when the spend is also missing).
report_cac['cac'] = report_cac['costs'].div(report_cac['new_users']).round(2)

In [None]:
# Grouped bars: CAC per month, one bar per source.
fig = px.bar(
    report_cac,
    x='costs_month',
    y='cac',
    color='source_id',
    barmode='group',
    title='CAC - customer acquisition costs by source Id',
    labels={'visit_month': '', 'cac': '', 'source_id': '', 'costs_month': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

`LTV vs CAC ratio`

In [None]:
# Total spend per (year, month) across all sources.
df_costs_monthly = (
    df_costs
    .groupby(['costs_year', 'costs_month'])
    .agg(costs_total=('costs', 'sum'))
    .reset_index()
)

In [None]:
# Add the monthly spend to the monthly analytical table (left join on
# year and month). Re-running this cell merges the cost columns a second time.
at_users_monthly = pd.merge(
    at_users_monthly,
    df_costs_monthly,
    how='left',
    left_on=['visit_year', 'visit_month'],
    right_on=['costs_year', 'costs_month'],
)

In [None]:
# Monthly CAC: total spend divided by the number of new users in the month.
at_users_monthly['cac'] = (
    at_users_monthly['costs_total'].div(at_users_monthly['new_users']).round(2)
)

In [None]:
# Ratio of the cumulative LTV column to the month's CAC.
at_users_monthly['ltv_cac_ratio'] = (
    at_users_monthly['avg_customer_ltv_cum'].div(at_users_monthly['cac']).round(2)
)

In [None]:
# Monthly LTV:CAC ratio, one labelled bar per month.
fig = px.bar(
    at_users_monthly,
    x='visit_month',
    y='ltv_cac_ratio',
    text='ltv_cac_ratio',
    title='LTV vs CAC ratio',
    labels={'visit_month': '', 'ltv_cac_ratio': ''},
)
# The line at 3 marks the 3:1 target ratio mentioned in the accompanying text.
# Single expression so the figure is rendered once.
(
    fig.add_hline(y=3)
    .update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

*Ideally, LTV: CAC ratio should be 3:1 — in other words, the value of your customers should be three times the cost of acquiring them.*

`Churn Rate`

Churn Rate = (Users at the beginning of the month - Users at the end of the month) / Users at the beginning of the month

https://www.zoho.com/subscriptions/guides/what-is-customer-lifetime-value-clv.html#:~:text=Average%20Customer%20Lifespan%20(ACL)%20is,the%20total%20number%20of%20customers.

The other approach - what is churn rate in ecommerce?
A churn rate is the percentage of your existing customers who do not reorder.

https://www.shopify.com/blog/churn-rate-in-ecommerce#:~:text=What%20is%20churn%20rate%20in,a%20customer%20won't%20return.

Generally, a lower churn rate is better, as it means more customers are reordering

In [None]:
# MAU of the previous row; the first row has no predecessor and becomes NaN.
at_users_monthly['mau_previous'] = at_users_monthly['mau'].shift(periods=1)

# First approach (not used): relative drop in MAU,
#   (mau_previous - mau) / mau_previous

# Second approach (used): this month's orders as a percentage of the
# previous month's MAU.
# NOTE(review): as written this measures orders relative to last month's
# users, i.e. the higher the value the more ordering activity. It is not
# "the share of customers who do not reorder" — confirm the intended formula.
at_users_monthly['churn_rate'] = (
    at_users_monthly['orders_count']
    .div(at_users_monthly['mau_previous'])
    .mul(100)
    .round(2)
)

In [None]:
# Monthly churn-rate column, one labelled bar per month.
fig = px.bar(
    at_users_monthly,
    x='visit_month',
    y='churn_rate',
    text='churn_rate',
    title='Churn Rate',
    labels={'visit_month': '', 'churn_rate': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

`ROMI - Return on Marketing Investment`

ROMI = (Revenue - Marketing Expenses) / Marketing Expenses

https://www.dashly.io/blog/roi-formula/

In [None]:
# ROMI in percent: (revenue - marketing costs) / marketing costs * 100.
at_users_monthly['romi'] = (
    at_users_monthly['revenue_total']
    .sub(at_users_monthly['costs_total'])
    .div(at_users_monthly['costs_total'])
    .mul(100)
    .round(2)
)

In [None]:
# Monthly ROMI, one labelled bar per month.
fig = px.bar(
    at_users_monthly,
    x='visit_month',
    y='romi',
    text='romi',
    title='Monthly ROMI - Return on Marketing Investment',
    labels={'visit_month': '', 'romi': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

`ROAS - the Return Ratio of your Advertising Spend`

ROAS = Revenue Total / Advertising Total Costs

In [None]:
# ROAS in percent: revenue / marketing costs * 100.
at_users_monthly['roas'] = (
    at_users_monthly['revenue_total']
    .div(at_users_monthly['costs_total'])
    .mul(100)
    .round(2)
)

In [None]:
# Monthly ROAS, one labelled bar per month.
fig = px.bar(
    at_users_monthly,
    x='visit_month',
    y='roas',
    text='roas',
    title='Monthly ROAS - the Return Ratio of Advertising Spend',
    labels={'visit_month': '', 'roas': ''},
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

In [None]:
# Revenue and marketing costs side by side for every month (wide-form input:
# one bar series per column listed in `y`).
fig = px.bar(
    at_users_monthly,
    x='visit_month',
    y=['revenue_total', 'costs_total'],
    barmode='group',
    title='Revenue vs Costs, monthly',
    labels={
        'variable': '',
        'value': '',
        'revenue_total': 'Revenue Total',
        'costs_total': 'Costs Total',
        'visit_month': '',
    },
)
# Single expression so the figure is rendered once.
(
    fig.update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .update_traces(textangle=1, textfont_color='white', selector={'type': 'bar'})
)

In [None]:
# Display the monthly table, now including the marketing metrics
# (CAC, LTV:CAC ratio, churn rate, ROMI, ROAS).
at_users_monthly

>Conclusion

- The CAC depends on the `source_id`.
- The most expensive sources are 1, 2 and 3.
- Source IDs 2 and 3 show the strongest seasonal dependency.
- Ideally, LTV: CAC ratio should be 3:1 — in other words, the value of your customers should be three times the cost of acquiring them. However we see that this ratio is less than 1.
- Revenue does not cover the costs of advertising. 
- Each month, the ROMI (Return on Marketing Investment) has negative values due to advertising costs exceeding revenue.
- ROAS - The Return Ratio of Advertising Spends is less than 60%

### 2.8 Save Analytical tables for daily and monthly user activities

In [None]:
# Most granular export table: one row per year, month, source, device,
# session category and conversion group. Named aggregation produces the
# final column names directly.
at_users_monthly_by_source_and_device = (
    df_users
    .groupby([
        'visit_year',
        'visit_month',
        'source_id',
        'device',
        'session_category_name',
        'conversion_group_name',
    ])
    .agg(
        nunique_users=('uid', 'nunique'),
        sessions_total=('session_start_ts', 'count'),
        session_duration_sec_sum=('session_duration_sec', 'sum'),
        is_new_user_sum=('is_new_user', 'sum'),
        is_new_buyer_sum=('is_new_buyer', 'sum'),
        is_bought_sum=('is_bought', 'sum'),
        revenue_sum=('revenue', 'sum'),
    )
    .reset_index()
)

In [None]:
# Spot-check ten random rows of the aggregated table.
at_users_monthly_by_source_and_device.sample(10)

In [None]:
# Sanity checks: compare totals before and after the aggregation.

# Revenue: raw total vs aggregated total.
df_users['revenue'].sum()
at_users_monthly_by_source_and_device['revenue_sum'].sum()

# Sessions: raw count vs aggregated total.
df_users['session_start_ts'].count()
at_users_monthly_by_source_and_device['sessions_total'].sum()

# Users: these two numbers are not expected to match. The same user can visit
# from different sources and devices within one day/month and is then counted
# once in every group they appear in.
df_users['uid'].nunique()
at_users_monthly_by_source_and_device['nunique_users'].sum()

In [None]:
# Export table: total spend per (year, month, source_id).
at_costs_monthly = (
    df_costs
    .groupby(['costs_year', 'costs_month', 'source_id'])
    .agg({'costs': 'sum'})
    .reset_index()
)
at_costs_monthly

In [None]:
# The right-hand merge keys duplicate visit_year / visit_month; remove them before export.
at_users_monthly = at_users_monthly.drop(columns=['costs_year', 'costs_month'])

In [None]:
# Final monthly table plus two totals.
at_users_monthly
# Sum of the monthly MAU values: a user active in several months is counted in each of them.
at_users_monthly.mau.sum()
# Total marketing spend over all months present in the table.
at_users_monthly.costs_total.sum()

In [None]:
# The test numbers after aggregation

df_costs['costs'].sum()
at_costs_monthly['costs'].sum()

In [None]:
# Output folder for the analytical tables.
path_to_save = './reports/'

# File names, one per exported table.
file_at_users_daily = 'at_users_daily.csv'
file_at_users_monthly = 'at_users_monthly.csv'
file_at_users_monthly_by_source_and_device = 'at_users_monthly_by_source_and_device.csv'
file_at_costs_monthly = 'at_costs_monthly.csv'

In [None]:
# Write the four analytical tables as comma-separated UTF-8 files without the index column.

# `to_csv` does not create missing directories, so make sure the target
# folder exists; `exist_ok=True` makes the cell safe to re-run.
os.makedirs(path_to_save, exist_ok=True)

at_users_monthly_by_source_and_device.to_csv(
    os.path.join(path_to_save, file_at_users_monthly_by_source_and_device),
    sep=',', encoding='utf-8', index=False,
)
at_costs_monthly.to_csv(
    os.path.join(path_to_save, file_at_costs_monthly),
    sep=',', encoding='utf-8', index=False,
)

at_users_daily.to_csv(
    os.path.join(path_to_save, file_at_users_daily),
    sep=',', encoding='utf-8', index=False,
)
at_users_monthly.to_csv(
    os.path.join(path_to_save, file_at_users_monthly),
    sep=',', encoding='utf-8', index=False,
)

## Step 3. General conclusion

## End