In [14]:
import pandas as pd
import numpy as np

In [15]:
# Load the customer table; the resulting frame has the columns `id` and `created`.
customers = pd.read_csv(filepath_or_buffer='./customers.csv')
customers.head()

Unnamed: 0,id,created
0,35410,2015-07-03 22:01:11
1,35417,2015-07-03 22:11:23
2,35412,2015-07-03 22:02:52
3,35413,2015-07-03 22:05:02
4,35424,2015-07-03 22:21:55


In [16]:
# Sanity check: (rows, columns) of the customer table.
customers.shape

(25716, 2)

In [17]:
# Load the order table; `user_id` is the key that is later matched against customers.id.
orders = pd.read_csv(filepath_or_buffer='./orders.csv')
orders.head()

Unnamed: 0,id,order_number,user_id,created
0,1709,36,344,2014-10-28 00:20:01
1,1406,7,608,2014-10-14 23:44:53
2,1716,6,2296,2014-10-28 17:47:07
3,1426,2,1225,2014-10-15 18:33:38
4,1415,6,797,2014-10-15 02:07:16


In [18]:
# Timestamp of each user's earliest order, indexed by user_id.
# `created` is parsed to datetime *before* taking the minimum so the comparison
# is chronological rather than a lexicographic comparison of strings, and the
# built-in groupby `min` is used instead of passing np.min to agg (newer pandas
# versions warn when a NumPy reducer is passed as the aggregation callable).
# `assign` returns a new frame, so `orders` itself is left untouched.
first_order_by_customer = (
    orders
    .assign(created=pd.to_datetime(orders['created']))
    .groupby('user_id')['created']
    .min()
    .to_frame('first_order')
)
first_order_by_customer.head()

Unnamed: 0_level_0,first_order
user_id,Unnamed: 1_level_1
2,2014-05-16 00:13:50
3,2014-05-21 20:39:23
5,2014-05-18 01:36:30
10,2014-05-26 21:10:14
11,2014-06-03 22:25:15


In [19]:
# Attach the first-order timestamp to each customer by matching customers.id
# against the user_id index. The inner join keeps only customers that have at
# least one order.
customers_first_order = customers.merge(
    first_order_by_customer,
    how='inner',
    left_on='id',
    right_index=True,
)
customers_first_order.head()

Unnamed: 0,id,created,first_order
4,35424,2015-07-03 22:21:55,2015-07-03 23:37:49
9,35399,2015-07-03 21:30:36,2015-07-03 22:17:24
22,35414,2015-07-03 22:09:04,2015-07-03 22:51:05
28,35452,2015-07-04 00:20:15,2015-07-04 00:30:13
33,35442,2015-07-03 23:33:17,2015-07-04 00:05:48


In [20]:
# Sanity check: number of customers left after the inner join with first orders.
customers_first_order.shape

(5356, 3)

For this exercise, group the customers into week-long (7-day) cohorts, and then calculate how many distinct customers ordered within X days of their signup date, where X is a multiple of 7. Older cohorts will have more buckets: 0-6 days, 7-13 days, 14-20 days, etc.

In [21]:
# Parse `created` into a real datetime column; the weekly binning into cohorts
# is done later by pd.Grouper on this column.
customers_first_order['customer_cohort'] = pd.to_datetime(customers_first_order['created'])
customers_first_order.head()

Unnamed: 0,id,created,first_order,customer_cohort
4,35424,2015-07-03 22:21:55,2015-07-03 23:37:49,2015-07-03 22:21:55
9,35399,2015-07-03 21:30:36,2015-07-03 22:17:24,2015-07-03 21:30:36
22,35414,2015-07-03 22:09:04,2015-07-03 22:51:05,2015-07-03 22:09:04
28,35452,2015-07-04 00:20:15,2015-07-04 00:30:13,2015-07-04 00:20:15
33,35442,2015-07-03 23:33:17,2015-07-04 00:05:48,2015-07-03 23:33:17


In [22]:
# Count distinct customers per (signup week, first-order week) pair.
# NOTE(review): both keys are calendar-week bins (freq='W'), so the second level
# is the calendar week of the first order, not "days since signup" buckets
# (0-6, 7-13, ...) as the exercise text describes -- confirm this is intended.
n_distinct_customers = (
    customers_first_order
    .groupby([
        pd.Grouper(key='customer_cohort', freq='W'),
        pd.Grouper(key='first_order', freq='W'),
    ])
    .agg({'id': pd.Series.nunique})
    .rename(columns={'id': 'n_unique_customers'})
)
n_distinct_customers.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n_unique_customers
customer_cohort,first_order,Unnamed: 2_level_1
2015-01-04,2015-01-04,16
2015-01-04,2015-01-11,4
2015-01-04,2015-01-18,3
2015-01-04,2015-01-25,1
2015-01-04,2015-02-01,1


In [23]:
# Total number of ordering customers per signup cohort, obtained by summing the
# per-first-order-week counts over the first index level.
# `DataFrame.sum(level=...)` was deprecated and later removed from pandas; the
# supported spelling is groupby(level=...).sum(). Selecting the
# 'n_unique_customers' column explicitly keeps this cell correct even when it
# is re-run after `n_distinct_customers` has gained additional columns.
sum_by_cohort = (
    n_distinct_customers['n_unique_customers']
    .groupby(level=0)
    .sum()
    .to_frame('cohort_unique_customers')
)
sum_by_cohort.head()

Unnamed: 0_level_0,cohort_unique_customers
customer_cohort,Unnamed: 1_level_1
2015-01-04,32
2015-01-11,389
2015-01-18,394
2015-01-25,469
2015-02-01,234


In [26]:
# Attach each cohort's total to every (cohort, first-order week) row.
# `sum_by_cohort` is the left operand so that its column comes first.
# Only the 'n_unique_customers' column of the right-hand frame is used: this
# cell rebinds `n_distinct_customers`, so without the selection a re-run would
# merge the totals into the already-merged frame and yield suffixed duplicates.
n_distinct_customers = sum_by_cohort.merge(
    n_distinct_customers[['n_unique_customers']],
    left_index=True,
    right_index=True,
)
n_distinct_customers.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cohort_unique_customers,n_unique_customers
customer_cohort,first_order,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-04,2015-01-04,32,16
2015-01-04,2015-01-11,32,4
2015-01-04,2015-01-18,32,3
2015-01-04,2015-01-25,32,1
2015-01-04,2015-02-01,32,1


In [27]:
# Long -> wide: one row per signup cohort, one column per first-order week.
# `distinct_customers_pivot` keeps the long-form rows (newest cohort first);
# the pivoted table is only displayed, it is not stored.
distinct_customers_pivot = (
    n_distinct_customers
    .reset_index()
    .sort_values(by='customer_cohort', ascending=False)
)
distinct_customers_pivot.pivot(
    index='customer_cohort',
    columns='first_order',
    values='n_unique_customers',
)

first_order,2015-01-04,2015-01-11,2015-01-18,2015-01-25,2015-02-01,2015-02-08,2015-02-15,2015-02-22,2015-03-01,2015-03-08,...,2015-05-10,2015-05-17,2015-05-24,2015-05-31,2015-06-07,2015-06-14,2015-06-21,2015-06-28,2015-07-05,2015-07-12
customer_cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-04,16.0,4.0,3.0,1.0,1.0,1.0,,1.0,,1.0,...,1.0,,,,,,,1.0,,
2015-01-11,,82.0,72.0,55.0,26.0,20.0,14.0,10.0,11.0,14.0,...,4.0,3.0,4.0,2.0,2.0,2.0,1.0,5.0,4.0,2.0
2015-01-18,,,152.0,85.0,21.0,21.0,7.0,10.0,10.0,11.0,...,5.0,3.0,5.0,3.0,4.0,7.0,6.0,1.0,1.0,2.0
2015-01-25,,,,156.0,67.0,57.0,20.0,20.0,17.0,23.0,...,5.0,3.0,4.0,4.0,1.0,5.0,2.0,2.0,2.0,2.0
2015-02-01,,,,,114.0,32.0,14.0,14.0,7.0,8.0,...,3.0,2.0,7.0,1.0,3.0,,1.0,,2.0,
2015-02-08,,,,,,80.0,17.0,11.0,5.0,5.0,...,3.0,2.0,,2.0,2.0,1.0,2.0,,1.0,
2015-02-15,,,,,,,66.0,15.0,7.0,13.0,...,3.0,2.0,,,3.0,1.0,1.0,2.0,2.0,1.0
2015-02-22,,,,,,,,61.0,20.0,11.0,...,1.0,,2.0,,,,1.0,1.0,,
2015-03-01,,,,,,,,,84.0,53.0,...,5.0,2.0,2.0,1.0,1.0,2.0,2.0,5.0,2.0,3.0
2015-03-08,,,,,,,,,,183.0,...,8.0,6.0,5.0,3.0,2.0,3.0,5.0,6.0,4.0,


In [78]:
# One row per (customer, order): inner-join every order onto its customer by
# matching customers.id against orders.user_id. Column names present in both
# tables get an '_order' suffix on the order side.
customers_orders = customers.merge(
    orders.set_index('user_id'),
    how='inner',
    left_on='id',
    right_index=True,
    suffixes=('', '_order'),
)

In [79]:
# Preview the joined customer/order rows.
customers_orders.head()

Unnamed: 0,id,created,id_order,order_number,created_order
4,35424,2015-07-03 22:21:55,27970,1,2015-07-03 23:37:49
9,35399,2015-07-03 21:30:36,27940,1,2015-07-03 22:17:24
22,35414,2015-07-03 22:09:04,27949,1,2015-07-03 22:51:05
28,35452,2015-07-04 00:20:15,27988,1,2015-07-04 00:30:13
33,35442,2015-07-03 23:33:17,27980,1,2015-07-04 00:05:48


In [80]:
# Parse the customer's `created` value into a datetime column used as the cohort key.
customers_orders['customer_cohort'] = pd.to_datetime(customers_orders['created'])

In [81]:
# NOTE(review): abandoned experiment kept as dead code; no 'created_date' column
# appears in customers_orders in the cells shown here -- confirm and remove.
# customers_orders.index = customers_orders['created_date']
# customers_orders.head()

In [83]:
# Convert the order timestamp to datetime so it can be binned with pd.Grouper.
customers_orders['created_order'] = pd.to_datetime(customers_orders['created_order'])

In [84]:
# Group every order row by (signup week, order week); both keys are weekly bins.
grouped = customers_orders.groupby(
    [
        pd.Grouper(key='customer_cohort', freq='W'),
        pd.Grouper(key='created_order', freq='W'),
    ]
)

In [94]:
# Distinct customer ids per (signup week, order week) group.
n_unique_customer_orders_by_cohort = (
    grouped
    .agg({'id': pd.Series.nunique})
    .rename(columns={'id': 'distinct_customers'})
)

In [104]:
# Sum of the per-order-week distinct-customer counts for the cohort labelled
# 2015-01-04. A customer who ordered in several weeks is counted once per week.
(
    n_unique_customer_orders_by_cohort
    .loc['2015-01-04']['distinct_customers']
    .sum()
)

146

In [113]:
# Same two-level weekly grouping as `grouped`; this only builds the
# DataFrameGroupBy object without aggregating it. The traceback stored with
# this cell mentions reset_index, which this code does not call, so that
# output appears to be stale.
customers_orders.groupby(
    [
        pd.Grouper(key='customer_cohort', freq='W'),
        pd.Grouper(key='created_order', freq='W'),
    ]
)

AttributeError: Cannot access callable attribute 'reset_index' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [111]:
# Distinct ordering customers per weekly signup cohort.
# Uses the upper-case 'W' frequency alias, matching every other cohort grouping
# in this notebook; the lower-case spelling is a deprecated alias in recent
# pandas releases.
(
    customers_orders
    .groupby([pd.Grouper(key='customer_cohort', freq='W')])
    .agg({'id': pd.Series.nunique})
)

Unnamed: 0_level_0,id
customer_cohort,Unnamed: 1_level_1
2015-01-04,32
2015-01-11,389
2015-01-18,394
2015-01-25,469
2015-02-01,234
2015-02-08,147
2015-02-15,135
2015-02-22,110
2015-03-01,239
2015-03-08,333
