# Online Sales Analytics: Customer Analytical Table and Monthly Metrics Report


## Import modules


In [1]:
import os
import sys

import pandas as pd
import datetime as dt

# To print all the outputs in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Display the full text of every DataFrame cell. `None` is the documented
# "unlimited" value for this option; a positive integer is a maximum width in
# characters beyond which pandas truncates the cell with an ellipsis.
pd.set_option('display.max_colwidth', None)

# Display all columns of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# To show warnings only once:
import warnings; warnings.filterwarnings(action='once')

# Set path
# Add ../tools/ to the module search path so that the project-local
# `viztools` and `helpers` modules imported below can be found.
# The relative path is resolved against the current working directory.
sys.path.insert(1, '../tools/')
import viztools as vt
import helpers as hp

viztools.py is being imported into module
helpers.py is being imported into module


## Load processed datasets


In [2]:
# Folder that holds the pre-processed CSV inputs.
path_to_processed_data = '../data/processed/'

# File names inside that folder: sales lines, calendar table, marketing spend.
file_sales, file_calendar_tbl, file_marketing_tbl = (
    'online_sales_details.csv',
    'calendar_tbl.csv',
    'marketing_spend.csv',
)

Online sales details


In [3]:
# Read the processed sales extract and parse both date columns to datetimes.
sales = pd.read_csv(os.path.join(path_to_processed_data, file_sales)).assign(
    transaction_date=lambda frame: pd.to_datetime(frame['transaction_date']),
    transaction_month=lambda frame: pd.to_datetime(frame['transaction_month']),
)

# Work on a copy so that `sales` keeps the data exactly as loaded.
df_sales = sales.copy()

Marketing costs


In [4]:
# Read the marketing spend table and parse its `date` column to datetimes.
# NOTE(review): the frame has 365 rows, yet the first rows shown below are the
# first day of consecutive months - confirm the day/month order of the
# source dates.
df_marketing = pd.read_csv(
    os.path.join(path_to_processed_data, file_marketing_tbl)
).assign(date=lambda frame: pd.to_datetime(frame['date']))

# Schema overview first, then a sample of rows.
df_marketing.info()
df_marketing.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           365 non-null    datetime64[ns]
 1   offline_spend  365 non-null    int64         
 2   online_spend   365 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 8.7 KB


Unnamed: 0,date,offline_spend,online_spend
0,2019-01-01,4500,2424.5
1,2019-02-01,4500,3480.36
2,2019-03-01,4500,1576.38
3,2019-04-01,4500,2928.55
4,2019-05-01,4500,4055.3


Because we only have online sales data, we do not include the `offline_spend`
column in our analysis.


## Customer Analytical table creation


In [5]:
# Customer analytical table: one row per customer per transaction day, holding
# distinct counts of transactions / categories / SKUs and sums of quantity,
# delivery charges, discounts, amounts and coupon-used flags.
# Named aggregation produces the final column names directly.
at_sales = (
    df_sales
    .groupby(['transaction_month', 'transaction_date', 'customer_id'])
    .agg(
        transactions_cntd=('transaction_id', 'nunique'),
        product_category_cntd=('product_category', 'nunique'),
        products_cntd=('product_sku', 'nunique'),
        quantity_sum=('quantity', 'sum'),
        delivery_charges_sum=('delivery_charges', 'sum'),
        discount_amount_sum=('discount_amount', 'sum'),
        total_amount_sum=('total_amount', 'sum'),
        coupons_used=('is_coupon_status_used', 'sum'),
    )
    .reset_index()
    .sort_values(by='transaction_date', ascending=True)
)

#### Validate data after aggregating by checking control sum


In [6]:
# Control sums: each raw total must survive the aggregation into `at_sales`.
raw_total_amount = df_sales['total_amount'].sum()
at_total_amount = at_sales['total_amount_sum'].sum()

raw_quantity = df_sales['quantity'].sum()
at_quantity = at_sales['quantity_sum'].sum()

raw_transactions = df_sales['transaction_id'].nunique()
at_transactions = at_sales['transactions_cntd'].sum()

# Show every pair (raw value first, aggregated value second) for the reader.
raw_total_amount
at_total_amount

raw_quantity
at_quantity

raw_transactions
at_transactions

# Fail loudly instead of relying on a visual comparison. Float sums may differ
# in the last bits depending on summation order, so compare with a relative
# tolerance; the integer totals must match exactly.
assert abs(raw_total_amount - at_total_amount) <= 1e-6 * max(1.0, abs(raw_total_amount)), \
    'total_amount changed during aggregation'
assert raw_quantity == at_quantity, 'quantity changed during aggregation'
assert raw_transactions == at_transactions, 'transaction count changed during aggregation'

3645802.5700000003

3645802.5700000003

99627

99627

26631

26631

In [7]:
# Inspect column dtypes and non-null counts of the aggregated table.
at_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3208 entries, 0 to 3207
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   transaction_month      3208 non-null   datetime64[ns]
 1   transaction_date       3208 non-null   datetime64[ns]
 2   customer_id            3208 non-null   int64         
 3   transactions_cntd      3208 non-null   int64         
 4   product_category_cntd  3208 non-null   int64         
 5   products_cntd          3208 non-null   int64         
 6   quantity_sum           3208 non-null   int64         
 7   delivery_charges_sum   3208 non-null   float64       
 8   discount_amount_sum    3208 non-null   float64       
 9   total_amount_sum       3208 non-null   float64       
 10  coupons_used           3208 non-null   int64         
dtypes: datetime64[ns](2), float64(3), int64(6)
memory usage: 300.8 KB


#### Add the first and the last transaction dates


In [8]:
# Broadcast each customer's earliest and latest transaction date onto all of
# that customer's rows. Both series are computed before either is assigned.
dates_by_customer = at_sales.groupby(['customer_id'])['transaction_date']
first_dates = dates_by_customer.transform('min')
last_dates = dates_by_customer.transform('max')

at_sales['first_transaction_date'] = first_dates
at_sales['last_transaction_date'] = last_dates

#### Add indicators of the first and the last customer activities


In [9]:
# 1 on the row whose date is the customer's first transaction date, else 0.
at_sales['is_new_user'] = (
    at_sales['transaction_date'].eq(at_sales['first_transaction_date']).astype(int)
)
# 1 on the row whose date is the customer's last transaction date, else 0.
at_sales['is_last_purchase'] = (
    at_sales['transaction_date'].eq(at_sales['last_transaction_date']).astype(int)
)

In [10]:
# Spot check: all rows of one customer in chronological order, to eyeball the
# first/last dates and the two indicator columns.
at_sales.loc[at_sales['customer_id'].eq(14713)].sort_values(by='transaction_date')

Unnamed: 0,transaction_month,transaction_date,customer_id,transactions_cntd,product_category_cntd,products_cntd,quantity_sum,delivery_charges_sum,discount_amount_sum,total_amount_sum,coupons_used,first_transaction_date,last_transaction_date,is_new_user,is_last_purchase
929,2019-04-01,2019-04-21,14713,14,5,20,54,171.47,134.497,2557.483,8,2019-04-21,2019-09-30,1,0
1503,2019-06-01,2019-06-23,14713,12,6,16,32,148.74,240.003,1164.147,10,2019-04-21,2019-09-30,0,0
2414,2019-09-01,2019-09-30,14713,23,6,21,71,412.58,139.095,3390.955,13,2019-04-21,2019-09-30,0,1


Generally, an order may encompass one or more transactions. Each transaction is
a record of an action that is taken on an order. However, the `order ID` is not
present in the datasets.

Therefore, in the Exploratory Data Analysis we will consider per-transaction
metrics instead: the average transaction value, the average quantity of items
per transaction, etc.


## Product Data Analysis

_We will utilize various data analytics metrics to extract valuable information
from the available data. To accomplish this, we will generate a report with
monthly-level aggregation._


### Preparing report


In [11]:
# Monthly report: distinct customers plus the monthly totals of the
# customer/day measures held in `at_sales`. Named aggregation sets the output
# column names; only the grouping key still needs renaming to `month`.
report = (
    at_sales
    .groupby(['transaction_month'])
    .agg(
        num_customers=('customer_id', 'nunique'),
        num_transactions=('transactions_cntd', 'sum'),
        quantity_items_sold=('quantity_sum', 'sum'),
        delivery_charges_sum=('delivery_charges_sum', 'sum'),
        discount_amount_sum=('discount_amount_sum', 'sum'),
        total_amount_sum=('total_amount_sum', 'sum'),
        num_coupons_used=('coupons_used', 'sum'),
        num_new_users=('is_new_user', 'sum'),
    )
    .reset_index()
    .sort_values(by='transaction_month', ascending=True)
    .rename(columns={'transaction_month': 'month'})
)

report

Unnamed: 0,month,num_customers,num_transactions,quantity_items_sold,delivery_charges_sum,discount_amount_sum,total_amount_sum,num_coupons_used,num_new_users
0,2019-01-01,232,2190,8161,53423.2,8515.453,305718.147,1434,232
1,2019-02-01,141,1813,6734,47453.1,15638.496,276769.684,1189,114
2,2019-03-01,229,2207,9023,54951.94,31110.072,307172.668,1510,166
3,2019-04-01,236,2113,8022,41451.19,9007.977,301089.883,1457,151
4,2019-05-01,236,2120,8262,40364.35,17096.238,280802.552,1423,119
5,2019-06-01,254,2194,8170,43052.44,28288.458,298113.802,1519,140
6,2019-07-01,252,2359,10806,55435.8,10286.504,349513.636,1890,100
7,2019-08-01,276,2420,9389,49014.73,18399.58,298456.99,1796,116
8,2019-09-01,195,2108,8035,44077.33,25545.582,271918.008,1407,77
9,2019-10-01,210,2095,7808,49499.2,8986.311,310779.799,1318,89


Validating the aggregation result


In [12]:
# Control sums: each raw total must survive the monthly aggregation into `report`.
raw_total_amount = df_sales['total_amount'].sum()
report_total_amount = report['total_amount_sum'].sum()

raw_quantity = df_sales['quantity'].sum()
report_quantity = report['quantity_items_sold'].sum()

raw_transactions = df_sales['transaction_id'].nunique()
report_transactions = report['num_transactions'].sum()

# Show every pair (raw value first, report value second) for the reader.
raw_total_amount
report_total_amount

raw_quantity
report_quantity

raw_transactions
report_transactions

# Fail loudly instead of relying on a visual comparison. Float sums may differ
# in the last bits depending on summation order, so compare with a relative
# tolerance; the integer totals must match exactly.
assert abs(raw_total_amount - report_total_amount) <= 1e-6 * max(1.0, abs(raw_total_amount)), \
    'total_amount changed during monthly aggregation'
assert raw_quantity == report_quantity, 'quantity changed during monthly aggregation'
assert raw_transactions == report_transactions, 'transaction count changed during monthly aggregation'

3645802.5700000003

3645802.57

99627

99627

26631

26631

### Add the number of unique categories and unique product SKUs sold to the report


In [13]:
# Per month: number of distinct SKUs and distinct categories sold, counted
# from the line-level sales data.
df_products = (
    df_sales
    .groupby(['transaction_month'])
    .agg(
        num_products_unique=('product_sku', 'nunique'),
        num_categories_unique=('product_category', 'nunique'),
    )
    .reset_index()
    .sort_values(by='transaction_month', ascending=True)
    .rename(columns={'transaction_month': 'month'})
)
df_products

Unnamed: 0,month,num_products_unique,num_categories_unique
0,2019-01-01,647,20
1,2019-02-01,638,20
2,2019-03-01,727,20
3,2019-04-01,697,19
4,2019-05-01,705,20
5,2019-06-01,714,20
6,2019-07-01,728,20
7,2019-08-01,599,19
8,2019-09-01,593,20
9,2019-10-01,561,20


In [14]:
# Left-join the per-month product counts onto the report by `month`.
report = report.merge(df_products, how='left', on='month')
report

Unnamed: 0,month,num_customers,num_transactions,quantity_items_sold,delivery_charges_sum,discount_amount_sum,total_amount_sum,num_coupons_used,num_new_users,num_products_unique,num_categories_unique
0,2019-01-01,232,2190,8161,53423.2,8515.453,305718.147,1434,232,647,20
1,2019-02-01,141,1813,6734,47453.1,15638.496,276769.684,1189,114,638,20
2,2019-03-01,229,2207,9023,54951.94,31110.072,307172.668,1510,166,727,20
3,2019-04-01,236,2113,8022,41451.19,9007.977,301089.883,1457,151,697,19
4,2019-05-01,236,2120,8262,40364.35,17096.238,280802.552,1423,119,705,20
5,2019-06-01,254,2194,8170,43052.44,28288.458,298113.802,1519,140,714,20
6,2019-07-01,252,2359,10806,55435.8,10286.504,349513.636,1890,100,728,20
7,2019-08-01,276,2420,9389,49014.73,18399.58,298456.99,1796,116,599,19
8,2019-09-01,195,2108,8035,44077.33,25545.582,271918.008,1407,77,593,20
9,2019-10-01,210,2095,7808,49499.2,8986.311,310779.799,1318,89,561,20


### Add marketing costs to the report


In [15]:
# Reminder of the marketing table's layout before aggregating it by month.
df_marketing.head(5)

Unnamed: 0,date,offline_spend,online_spend
0,2019-01-01,4500,2424.5
1,2019-02-01,4500,3480.36
2,2019-03-01,4500,1576.38
3,2019-04-01,4500,2928.55
4,2019-05-01,4500,4055.3


In [16]:
# Truncate every spend date to the first day of its month.
# The pandas period accessor is used instead of a NumPy `datetime64[M]` cast so
# that the key is produced as a regular pandas datetime column, the same kind
# as the sales month column (created with pd.to_datetime) it is merged with later.
df_marketing['month'] = df_marketing['date'].dt.to_period('M').dt.to_timestamp()

In [17]:
# Monthly marketing costs: the sum of `online_spend` per month.
df_marketing_monthly = (
    df_marketing
    .groupby(['month'])
    .agg(marketing_costs=('online_spend', 'sum'))
    .reset_index()
    .sort_values(by='month', ascending=True)
)

df_marketing_monthly

Unnamed: 0,month,marketing_costs
0,2019-01-01,47957.13
1,2019-02-01,52169.73
2,2019-03-01,53360.17
3,2019-04-01,60971.9
4,2019-05-01,58299.3
5,2019-06-01,56556.44
6,2019-07-01,58170.91
7,2019-08-01,57501.34
8,2019-09-01,58383.08
9,2019-10-01,59172.41


In [18]:
# Left-join the monthly marketing costs onto the report by `month`.
report = pd.merge(report, df_marketing_monthly, on='month', how='left')

# Normalise the key to a pandas datetime using the report's OWN values.
# It must not be refilled from df_marketing['date']: that frame has a different
# length and would only be matched to the report by row index, so the months
# shown would depend on whatever dates happen to sit in its first rows.
report['month'] = pd.to_datetime(report['month'])
report

Unnamed: 0,month,num_customers,num_transactions,quantity_items_sold,delivery_charges_sum,discount_amount_sum,total_amount_sum,num_coupons_used,num_new_users,num_products_unique,num_categories_unique,marketing_costs
0,2019-01-01,232,2190,8161,53423.2,8515.453,305718.147,1434,232,647,20,47957.13
1,2019-02-01,141,1813,6734,47453.1,15638.496,276769.684,1189,114,638,20,52169.73
2,2019-03-01,229,2207,9023,54951.94,31110.072,307172.668,1510,166,727,20,53360.17
3,2019-04-01,236,2113,8022,41451.19,9007.977,301089.883,1457,151,697,19,60971.9
4,2019-05-01,236,2120,8262,40364.35,17096.238,280802.552,1423,119,705,20,58299.3
5,2019-06-01,254,2194,8170,43052.44,28288.458,298113.802,1519,140,714,20,56556.44
6,2019-07-01,252,2359,10806,55435.8,10286.504,349513.636,1890,100,728,20,58170.91
7,2019-08-01,276,2420,9389,49014.73,18399.58,298456.99,1796,116,599,19,57501.34
8,2019-09-01,195,2108,8035,44077.33,25545.582,271918.008,1407,77,593,20,58383.08
9,2019-10-01,210,2095,7808,49499.2,8986.311,310779.799,1318,89,561,20,59172.41


## Customers


### Number of Customers by Month


In [19]:
# Bar chart of `num_customers` per `month`. Arguments stay positional, in the
# original order, because the helper's signature is defined in viztools.
vt.plot_bar_graph(
    report,
    'month',
    'num_customers',
    'num_customers',
    'Number of Customers by Month',
)

### Number of New Customers by month


In [20]:
# Bar chart of `num_new_users` per `month`. Arguments stay positional, in the
# original order, because the helper's signature is defined in viztools.
vt.plot_bar_graph(
    report,
    'month',
    'num_new_users',
    'num_new_users',
    'Number of New Customers by month',
)

### New customers rate


In [21]:
# Share of the month's customers that are new, in percent with one decimal.
report['new_customers_rate'] = (
    report['num_new_users'].div(report['num_customers']).mul(100).round(1)
)

### MAU (Average Monthly Active Users / Customers)


In [22]:
# Mean of the monthly customer counts, truncated to an integer.
mean_monthly_customers = report['num_customers'].mean()
mau = mean_monthly_customers.astype(int)
mau

226

## Orders


### Average Order Value


In [23]:
# Average value per transaction: monthly total amount / number of
# transactions, rounded to two decimals.
report['aov'] = report['total_amount_sum'].div(report['num_transactions']).round(2)

### Average number of Orders per Customer by month


In [24]:
# Transactions per customer in the month.
# NOTE(review): the integer cast drops the fractional part rather than
# rounding - confirm that truncation is intended for an average.
report['avg_num_orders_per_customer'] = (
    report['num_transactions'].div(report['num_customers']).astype(int)
)

### Average number of purchased items per Order


In [25]:
# Items sold per transaction in the month.
# NOTE(review): the integer cast drops the fractional part rather than
# rounding - confirm that truncation is intended for an average.
report['avg_items_per_order'] = (
    report['quantity_items_sold'].div(report['num_transactions']).astype(int)
)

## Acquisition Metrics


#### Customer Acquisition Cost (CAC)

_The cost of acquiring a new customer through marketing and sales efforts_


In [26]:
# CAC: the month's marketing costs divided by the number of new customers,
# cast to an integer (the fractional part is dropped).
cost_per_new_customer = report['marketing_costs'].div(report['num_new_users'])
report['cac'] = cost_per_new_customer.astype('int')

vt.plot_bar_graph(
    report,
    axis_x='month',
    axis_y='cac',
    label='cac',
    title='Customer Acquisition Cost (CAC)',
)

## Retention Metrics

A good example of cohort retention analysis:

https://towardsdatascience.com/strategy-analytics-in-python-1-churn-analysis-82cb0247fe6f

In this notebook, however, we only add monthly retention metrics to the report.


### User Retention Rate

_The percentage of users who continue using the product after a specific time
period. Often monthly._

$$\text{Retention rate} = \frac{\text{customers in the current period} - \text{new customers in the current period}}{\text{customers in the previous period}}$$


In [27]:
# Returning customers of the month relative to the previous month's customer
# count. The first month has no predecessor; its missing value is set to 1.
returning_customers = report['num_customers'] - report['num_new_users']
customers_previous_month = report['num_customers'].shift(1)

report['retention_rate'] = (
    (returning_customers / customers_previous_month).fillna(1).round(2)
)

### Churn Rate

_The percentage of users who stop using the product within a specific time
period, e.g., monthly._


In [28]:
# Churn is the complement of retention: 1 - retention_rate.
report['churn_rate'] = report['retention_rate'].rsub(1)

### Customer Lifetime

_The average time it takes for a user to stop using the product._

$$\text{Customer Lifetime} = \frac{1}{\text{Churn Rate}}$$


In [29]:
def _lifetime_from_churn(churn_rate):
    """Return 1 / churn_rate, or 1 when the churn rate is exactly zero."""
    if churn_rate == 0:
        return 1
    return 1 / churn_rate


# Customer lifetime per month, rounded to one decimal.
report['clt'] = report['churn_rate'].apply(_lifetime_from_churn).round(1)

In [30]:
# Show only the retention-related columns side by side.
retention_columns = ['month', 'num_customers', 'retention_rate', 'churn_rate', 'clt']
report.loc[:, retention_columns]

Unnamed: 0,month,num_customers,retention_rate,churn_rate,clt
0,2019-01-01,232,1.0,0.0,1.0
1,2019-02-01,141,0.12,0.88,1.1
2,2019-03-01,229,0.45,0.55,1.8
3,2019-04-01,236,0.37,0.63,1.6
4,2019-05-01,236,0.5,0.5,2.0
5,2019-06-01,254,0.48,0.52,1.9
6,2019-07-01,252,0.6,0.4,2.5
7,2019-08-01,276,0.63,0.37,2.7
8,2019-09-01,195,0.43,0.57,1.8
9,2019-10-01,210,0.62,0.38,2.6


## Revenue Metrics


### Average Revenue Per Account (ARPA)

_The average revenue generated per account (customer) within a specific time
frame. For example, monthly._


In [31]:
# ARPA: the month's total amount divided by the number of customers, cast to
# an integer (the fractional part is dropped).
revenue_per_customer = report['total_amount_sum'].div(report['num_customers'])
report['arpa'] = revenue_per_customer.astype('int')

vt.plot_bar_graph(
    report,
    axis_x='month',
    axis_y='arpa',
    label='arpa',
    title='Average Revenue Per Account (ARPA)',
)

### Customer Lifetime Value

_The total revenue a user generates during their entire relationship with the
product._

$$\text{CLTV} = \text{Customer Lifetime} \times \text{ARPA}$$


In [32]:
# CLTV = ARPA * customer lifetime, cast to an integer.
# Both inputs are the already truncated / rounded report columns.
report['cltv'] = report['arpa'].mul(report['clt']).astype('int')

### Customer Profitability

_The difference between the lifetime value of a customer (CLTV) and the cost of
acquiring them_


In [33]:
# Lifetime value minus acquisition cost, per month.
report['customer_profitability'] = report['cltv'].sub(report['cac'])

### CLTV/CAC Ratio

_The Customer Lifetime Value (CLTV) to Customer Acquisition Cost (CAC) ratio
measures the relationship between the lifetime value of a customer and the cost
of acquiring that customer._


In [34]:
# How many times the lifetime value covers the acquisition cost, one decimal.
report['cltv_cac_ratio'] = report['cltv'].div(report['cac']).round(1)

## Revenue Churn

_The revenue lost due to customer cancellations, downgrades, or nonrenewals
within a specific time period, typically a month/year_


In [35]:
# Revenue attributed to churn: the month's total amount scaled by its churn
# rate, cast to an integer.
report['revenue_churn'] = report['total_amount_sum'].mul(report['churn_rate']).astype('int')

In [36]:
# Month-over-month relative change of the total amount. The first month has
# no predecessor, so its missing value becomes 0. Rounded to two decimals.
# NOTE(review): pct_change is positive when revenue GROWS, which is the
# opposite sign of what a "churn rate" suggests - confirm the intended meaning.
monthly_revenue = report['total_amount_sum']
report['revenue_churn_rate'] = monthly_revenue.pct_change().fillna(0).round(2)

In [37]:
# Display the finished monthly report with all derived metric columns.
report

Unnamed: 0,month,num_customers,num_transactions,quantity_items_sold,delivery_charges_sum,discount_amount_sum,total_amount_sum,num_coupons_used,num_new_users,num_products_unique,num_categories_unique,marketing_costs,new_customers_rate,aov,avg_num_orders_per_customer,avg_items_per_order,cac,retention_rate,churn_rate,clt,arpa,cltv,customer_profitability,cltv_cac_ratio,revenue_churn,revenue_churn_rate
0,2019-01-01,232,2190,8161,53423.2,8515.453,305718.147,1434,232,647,20,47957.13,100.0,139.6,9,3,206,1.0,0.0,1.0,1317,1317,1111,6.4,0,0.0
1,2019-02-01,141,1813,6734,47453.1,15638.496,276769.684,1189,114,638,20,52169.73,80.9,152.66,12,3,457,0.12,0.88,1.1,1962,2158,1701,4.7,243557,-0.09
2,2019-03-01,229,2207,9023,54951.94,31110.072,307172.668,1510,166,727,20,53360.17,72.5,139.18,9,4,321,0.45,0.55,1.8,1341,2413,2092,7.5,168944,0.11
3,2019-04-01,236,2113,8022,41451.19,9007.977,301089.883,1457,151,697,19,60971.9,64.0,142.49,8,3,403,0.37,0.63,1.6,1275,2040,1637,5.1,189686,-0.02
4,2019-05-01,236,2120,8262,40364.35,17096.238,280802.552,1423,119,705,20,58299.3,50.4,132.45,8,3,489,0.5,0.5,2.0,1189,2378,1889,4.9,140401,-0.07
5,2019-06-01,254,2194,8170,43052.44,28288.458,298113.802,1519,140,714,20,56556.44,55.1,135.88,8,3,403,0.48,0.52,1.9,1173,2228,1825,5.5,155019,0.06
6,2019-07-01,252,2359,10806,55435.8,10286.504,349513.636,1890,100,728,20,58170.91,39.7,148.16,9,4,581,0.6,0.4,2.5,1386,3465,2884,6.0,139805,0.17
7,2019-08-01,276,2420,9389,49014.73,18399.58,298456.99,1796,116,599,19,57501.34,42.0,123.33,8,3,495,0.63,0.37,2.7,1081,2918,2423,5.9,110429,-0.15
8,2019-09-01,195,2108,8035,44077.33,25545.582,271918.008,1407,77,593,20,58383.08,39.5,128.99,10,3,758,0.43,0.57,1.8,1394,2509,1751,3.3,154993,-0.09
9,2019-10-01,210,2095,7808,49499.2,8986.311,310779.799,1318,89,561,20,59172.41,42.4,148.34,9,3,664,0.62,0.38,2.6,1479,3845,3181,5.8,118096,0.14


### Save the customer analytical table and the report to CSV files


In [38]:
# Target folder and file name for the customer analytical table.
path_to_save_at, file_at_sales = '../data/processed/', 'at_sales.csv'

# Hand the frame to the project helper (presumably writes a CSV at
# path + file name; see helpers.write_df_to_csv).
hp.write_df_to_csv(at_sales, path_to_save_at, file_at_sales)


In [39]:
# Target folder and file name for the monthly report.
path_to_save_report, file_report = '../reports_and_dashboards/', 'annual_report.csv'

# Hand the frame to the project helper (presumably writes a CSV at
# path + file name; see helpers.write_df_to_csv).
hp.write_df_to_csv(report, path_to_save_report, file_report)

## The next step of the analysis is building a dashboard in Tableau to present the findings and insights.
