In [83]:
import random
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

##  Load Customer and Purchase information

In [46]:
customer_data = pd.read_csv('./data/ab/user_demographics_v1.csv')

customer_data.head()

# minor cleanup
customer_data['uid'] = customer_data['uid'].astype(int)
customer_data['reg_date'] = pd.to_datetime(customer_data['reg_date'])

customer_data.head()

Unnamed: 0,uid,reg_date,device,gender,country,age
0,54030035,2017-06-29,and,M,USA,19
1,72574201,2018-03-05,iOS,F,TUR,22
2,64187558,2016-02-07,iOS,M,USA,16
3,92513925,2017-05-25,and,M,BRA,41
4,99231338,2017-03-26,iOS,M,FRA,59


In [56]:
purchases = pd.read_csv('./data/ab/purchase_data_v1.csv')

# minor cleanup
purchases['uid'] = purchases['uid'].astype(int)
purchases['date'] = pd.to_datetime(purchases['date'])

purchases.head()

Unnamed: 0,date,uid,sku,price
0,2017-07-10,41195147,sku_three_499,499
1,2017-07-15,41195147,sku_three_499,499
2,2017-11-12,41195147,sku_four_599,599
3,2017-09-26,91591874,sku_two_299,299
4,2017-12-01,91591874,sku_four_599,599


## Join dataframes to get insight

In [57]:
purchase_data = purchases.merge(customer_data, on=['uid'], how='inner')
purchase_data.head()

Unnamed: 0,date,uid,sku,price,reg_date,device,gender,country,age
0,2017-07-10,41195147,sku_three_499,499,2017-06-26,and,M,BRA,17
1,2017-07-15,41195147,sku_three_499,499,2017-06-26,and,M,BRA,17
2,2017-11-12,41195147,sku_four_599,599,2017-06-26,and,M,BRA,17
3,2017-09-26,91591874,sku_two_299,299,2017-01-05,and,M,TUR,17
4,2017-12-01,91591874,sku_four_599,599,2017-01-05,and,M,TUR,17


In [58]:
subdata_grp = purchase_data.groupby(by=['country', 'device'], axis=0, as_index=False)
data_aggr = subdata_grp.price.agg({
    'price': ['max', 'mean', 'median', 'min'],
    'age': ['max', 'mean', 'min'],
})
print(data_aggr)

   country device price                         age                
                    max        mean median min  max        mean min
0      BRA    and   899  412.985594    499  99  899  412.985594  99
1      BRA    iOS   899  404.739300    299  99  899  404.739300  99
2      CAN    and   899  406.826087    499  99  899  406.826087  99
3      CAN    iOS   899  386.573964    299  99  899  386.573964  99
4      DEU    and   899  402.474903    299  99  899  402.474903  99
5      DEU    iOS   899  417.639798    499  99  899  417.639798  99
6      FRA    and   899  418.377163    499  99  899  418.377163  99
7      FRA    iOS   899  382.921569    299  99  899  382.921569  99
8      TUR    and   899  433.913793    499  99  899  433.913793  99
9      TUR    iOS   899  390.176471    299  99  899  390.176471  99
10     USA    and   899  399.000000    299  99  899  399.000000  99
11     USA    iOS   899  410.684783    499  99  899  410.684783  99


In [59]:
# Group the data 
grouped_purchase_data = purchase_data.groupby(by = ['device', 'gender'])

# Aggregate the data
purchase_summary = grouped_purchase_data.agg({'price': ['mean', 'median', 'std']})

# Examine the results
print(purchase_summary)

                    price                   
                     mean median         std
device gender                               
and    F       400.747504    299  179.984378
       M       416.237308    499  195.001520
iOS    F       404.435330    299  181.524952
       M       405.272401    299  196.843197


## Extracting KPIs

Now we will analyze the dataset to extract some KPIs

In [60]:
# Check most recent date
print("Newest record: {0} \nOldest record: {1}".format(
    purchase_data.date.max(),
    purchase_data.date.min()
))

Newest record: 2018-04-21 00:00:00 
Oldest record: 2015-03-02 00:00:00


In [61]:
# Latest lapse date
current_date = pd.to_datetime('2018-04-21')
max_lapse_date = current_date - timedelta(days=7)

# restrict users by lapse date


## KPI: Avg amount paid

We will calculate the average amount paid per purchase within a user's first 28 days. 

This KPI can provide a sense of the popularity of different in-app purchase price points to users within their first month.

In [62]:
# Compute max_purchase_date
max_purchase_date = current_date - timedelta(days=28)


# Filter to only include users who registered before our max date
purchase_data_filt = purchase_data[purchase_data.reg_date < max_purchase_date]

purchase_data_filt.head()


Unnamed: 0,date,uid,sku,price,reg_date,device,gender,country,age
0,2017-07-10,41195147,sku_three_499,499,2017-06-26,and,M,BRA,17
1,2017-07-15,41195147,sku_three_499,499,2017-06-26,and,M,BRA,17
2,2017-11-12,41195147,sku_four_599,599,2017-06-26,and,M,BRA,17
3,2017-09-26,91591874,sku_two_299,299,2017-01-05,and,M,TUR,17
4,2017-12-01,91591874,sku_four_599,599,2017-01-05,and,M,TUR,17


In [66]:
# Filter to contain only purchases within the first 28 days of registration
purchase_data_filt = purchase_data_filt[(purchase_data_filt.date <=
                                         purchase_data_filt.reg_date + 
                                         timedelta(days=28))]


# Output the mean price paid per purchase
print("Mean price paid per purchase: {0} cents".format(purchase_data_filt.price.mean()))

Mean price paid per purchase: 415.7741935483871 cents


Our average price is 414 cents which is below $4.99 

It seems that our purchasers tend towards the lower priced set of options.

## KPI: Average purchase price by cohort

P.S: A `cohort` is a group of subjects who share a defining characteristic (typically subjects who experienced a common event in a selected time period, such as birth or graduation). 

In [67]:
# Set the max registration date to be one month before today
max_reg_date = current_date - timedelta(days=28)

# Find the month 1 values:
month1 = np.where((purchase_data.reg_date < max_reg_date) &
                    (purchase_data.date < purchase_data.reg_date + timedelta(days=28)),
                  purchase_data.price, 
                  np.NaN)
                 
# Update the value in the DataFrame 
purchase_data['month1'] = month1

In [68]:
# Group the data by gender and device 
purchase_data_upd = purchase_data.groupby(by=['gender', 'device'], as_index=False) 

# Aggregate the month1 and price data 
purchase_summary = purchase_data_upd.agg(
                        {'month1': ['mean', 'median'],
                        'price': ['mean', 'median']})

In [69]:
purchase_summary

Unnamed: 0_level_0,gender,device,month1,month1,price,price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,median,mean,median
0,F,and,390.758242,299.0,400.747504,299
1,F,iOS,429.434783,499.0,404.43533,299
2,M,and,417.285714,499.0,416.237308,499
3,M,iOS,434.39823,499.0,405.272401,299
