In [1]:
## Importing necessary libraries
import sys
import sqlite3
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

# defining functions module path
module_path = str(Path.cwd().parents[0] / "src")
if module_path not in sys.path:
    sys.path.append(module_path)

from functions import normalize, clean_data

In [2]:
# Getting the transactions data from sqlite
conn = sqlite3.connect("../data/bootcamp_db") # creating connection

In [3]:
# Quering the data from the online_transactions_fixed table
query = """
select *
from online_transactions_fixed
"""
ot = pd.read_sql(query, conn)

In [4]:
# The function clean_data performed a serie of cleaning steps defined on EDA notebook
online_trans = clean_data(ot)

In [5]:
online_trans.describe()

Unnamed: 0,old_index,quantity,price,total_order_value,invoice_year,invoice_month,invoice_day,invoice_weekday
count,365180.0,365180.0,365180.0,365180.0,365180.0,365180.0,365180.0,365180.0
mean,201388.23323,12.879829,2.905252,21.919158,2010.934356,7.65009,15.011531,2.592034
std,115651.048694,43.577928,7.381186,95.735167,0.24766,3.405954,8.635549,1.920402
min,0.0,1.0,0.001,0.001,2010.0,1.0,1.0,0.0
25%,101147.75,2.0,1.25,4.95,2011.0,5.0,7.0,1.0
50%,202427.5,6.0,1.95,11.9,2011.0,8.0,15.0,2.0
75%,301305.25,12.0,3.75,19.8,2011.0,11.0,22.0,4.0
max,399806.0,4800.0,1599.26,38970.0,2011.0,12.0,31.0,6.0


#### In the previous notebook (EDA.ipynb) the data was grouped by customer id and the values of Recency, Frequency, and Monetary Value were computed. 
- Recency was defined as the difference between the current day and the day of the customer's last purchase. As the data is from 2011, and a customer whose recency is bigger than a year can be considered as churned, I chosed the "current day" as the last day of the last purchase on the data set, meaning that the minimum value that recency can take is 0.
- Frequency was define as the number of transactions that a customer performed.
- Monetary Value is simply the sum of the total order value per customer on the dataset

In this notebook, I am going to implement a customer segmentation analysis focusing on an percentile-based grouping RFM score

In [6]:
# Importing customer data
cust_data = pd.read_csv("../data/customer_data.csv")

In [7]:
cust_data.describe()

Unnamed: 0,loyalty_time,mean_nstock,mean_nitem,mon_value,frequency,recency
count,4247.0,4247.0,4247.0,4247.0,4247.0,4247.0
mean,133.676948,21.560731,224.903868,1884.727573,85.985401,88.463386
std,132.639094,19.285931,305.002619,8210.321624,215.913912,98.735861
min,0.0,1.0,1.0,0.85,1.0,0.0
25%,0.0,9.041667,90.583333,297.2,16.0,15.5
50%,101.0,16.5,158.235294,635.68,39.0,46.0
75%,257.0,28.0,265.483333,1534.09,94.0,134.0
max,372.0,300.647059,7824.0,279138.02,7499.0,373.0


### Let's calculated the RFM score based on a percentile grouping

In [8]:
r_quartiles = pd.qcut(cust_data['mon_value'], q=4, labels=range(4, 0, -1))
f_quartiles = pd.qcut(cust_data['frequency'], q=4, labels=range(1,5))
m_quartiles = pd.qcut(cust_data['mon_value'], q=4, labels=range(1,5))

In [9]:
cust_data['R'] = r_quartiles
cust_data['F'] = f_quartiles
cust_data['M'] = m_quartiles

In [10]:
def rfm_segm(x):
    return str(x['R']) + str(x['F']) + str(x['M'])
cust_data['RFM_segment'] = cust_data.apply(rfm_segm, axis=1) 

In [11]:
cust_data['RFM_score'] = cust_data[['R', 'F', 'M']].sum(axis=1)

In [12]:
cust_data.sort_values("RFM_segment")

Unnamed: 0,customer_id,loyalty_time,mean_nstock,mean_nitem,mon_value,frequency,recency,R,F,M,RFM_segment,RFM_score
3325,u16986,0,1.500000,520.000000,1873.20,3.0,29,1,1,4,114,6
3113,u16698,26,2.500000,420.000000,1998.00,5.0,226,1,1,4,114,6
3094,u16671,136,2.000000,291.666667,1692.27,6.0,27,1,1,4,114,6
3550,u17317,43,4.000000,555.000000,2562.18,8.0,45,1,1,4,114,6
2647,u16041,110,2.000000,528.000000,3166.40,6.0,30,1,1,4,114,6
...,...,...,...,...,...,...,...,...,...,...,...,...
3730,u17561,0,45.000000,144.000000,165.12,45.0,18,4,3,1,431,8
2563,u1592,0,19.333333,31.333333,163.55,58.0,153,4,3,1,431,8
2537,u15877,16,106.000000,172.000000,239.31,106.0,1,4,4,1,441,9
1937,u1506,0,28.500000,64.000000,293.00,114.0,7,4,4,1,441,9


In [17]:
# Let's inspect number of customers in each segment
cust_data.groupby('RFM_segment').size().sort_values(ascending=False)

RFM_segment
144    767
411    724
322    553
233    542
421    242
312    234
243    228
223    217
332    212
134    207
431     93
213     74
342     63
124     57
114     31
441      3
dtype: int64

In [20]:
cust_data.groupby('RFM_segment').get_group('441')

Unnamed: 0,customer_id,loyalty_time,mean_nstock,mean_nitem,mon_value,frequency,recency,R,F,M,RFM_segment,RFM_score
1937,u1506,0,28.5,64.0,293.0,114.0,7,4,4,1,441,9
2537,u15877,16,106.0,172.0,239.31,106.0,1,4,4,1,441,9
3512,u17254,10,55.5,125.5,271.19,111.0,4,4,4,1,441,9


In [24]:
cust_data['RFM_score'].describe()

count    4247.000000
mean        7.497528
std         1.118268
min         6.000000
25%         6.000000
50%         7.000000
75%         8.000000
max         9.000000
Name: RFM_score, dtype: float64