# Customer Segmentation

#### Three types of cohorts:
+ Time cohorts
+ Behavior cohorts
+ Size cohorts

## Time cohorts

In [None]:
def get_month(x):
    """Return a datetime for the first day of the month that ``x`` falls in.

    Only ``x.year`` and ``x.month`` are read; the day is pinned to 1 and the
    time of day is left at midnight.
    """
    return dt.datetime(year=x.year, month=x.month, day=1)
# Truncate every invoice date to the first day of its month
online['InvoiceMonth'] = online['InvoiceDate'].apply(get_month)
# Column labels are case-sensitive: the customer key is spelled 'CustomerID'
# everywhere else in this file, so 'customerID' would raise a KeyError.
grouping = online.groupby('CustomerID')['InvoiceMonth']
# A customer's cohort month is the earliest invoice month in their group
online['cohortMonth'] = grouping.transform('min')
online.head()

In [None]:
def get_date_int(df, column):
    """Extract year, month and day integer values from a datetime column.

    Reads ``df[column]`` through its ``.dt`` accessor and returns the three
    components as a ``(year, month, day)`` tuple.
    """
    parts = df[column].dt
    return parts.year, parts.month, parts.day

In [None]:
# Assign time offset value: number of months between each invoice month and
# the row's cohort month.
invoice_year, invoice_month, _ = get_date_int(online, 'InvoiceMonth')
# Fixed helper name: the function defined in this file is get_date_int
cohort_year, cohort_month, _ = get_date_int(online, 'cohortMonth')
years_diff = invoice_year - cohort_year
months_diff = invoice_month - cohort_month
# +1 makes the index 1-based: the cohort month itself gets CohortIndex 1
online['CohortIndex'] = years_diff * 12 + months_diff + 1

In [None]:
# Count distinct customers for every (cohort month, cohort index) pair and
# reshape the result into a table with one row per cohort month and one
# column per cohort index.
# The cohort column is created as 'cohortMonth' (lower-case 'c') earlier in
# this file; column labels are case-sensitive, so the same spelling is used.
grouping = online.groupby(['cohortMonth', 'CohortIndex'])
cohort_data = grouping['CustomerID'].nunique()
cohort_data = cohort_data.reset_index()
cohort_counts = cohort_data.pivot(index='cohortMonth',
                                  columns='CohortIndex',
                                  values='CustomerID')

#### Calculate the retention rates

# The first column of the cohort table is used as each cohort's size
cohort_sizes = cohort_counts.iloc[:, 0]
# Divide every row by its own cohort size to get retention as a fraction
retention = cohort_counts.div(cohort_sizes, axis='index')
# Display as percentages
retention.round(3) * 100

#### Visualize the retention rates

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the retention table as an annotated heatmap
plt.figure(figsize=(10, 8))
plt.title('Retention rates')
sns.heatmap(data=retention,
            annot=True,
            fmt='.0%',  # annotate each cell as a whole-number percentage
            vmin=0.0,
            vmax=0.5,
            cmap='BuGn')
plt.show()

## RFM Segmentation

RFM values can be grouped in several ways:
+ percentiles
+ pareto 80/20 cut
+ custom - based on business knowledge

In [None]:
# Based on quartiles: split 'Spend' into four quantile bins labelled 1 to 4
# (the keyword is `labels`; a misspelled keyword makes pd.qcut raise TypeError)
spend_quartiles = pd.qcut(data['Spend'], q=4, labels=range(1, 5))
data['Spend_quartile'] = spend_quartiles
data.sort_values('Spend')

In [None]:
# Recency labels run in reverse order (4 down to 1): the lower the recency,
# the higher the quartile number.
r_labels = list(range(4, 0, -1))
# String labels can be used instead; this assignment replaces the numeric
# labels above, so the string labels are the ones passed to qcut.
r_labels = ['Active', 'Lapsed', 'Inactive', 'Churned']
recency_quartiles = pd.qcut(x=data['Recency_Days'], q=4, labels=r_labels)
data['Recency_Quartile'] = recency_quartiles
data.sort_values(by='Recency_Days')

#### Create a hypothetical snapshot_date

In [None]:
# Snapshot date: one day after the latest invoice date in the data.
# Uses the `dt` alias under which this file refers to the datetime module,
# and Series.max(), which skips missing values instead of comparing them.
snapshot_date = online['InvoiceDate'].max() + dt.timedelta(days=1)

In [None]:
# Aggregate data on a customer level, then rename the aggregated columns
# for easier interpretation.
datamart = (
    online.groupby(['CustomerID'])
    .agg({
        # whole days between the snapshot date and the latest invoice
        'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
        # number of invoice rows
        'InvoiceNo': 'count',
        # total of the TotalSum column
        'TotalSum': 'sum',
    })
    .rename(columns={
        'InvoiceDate': 'Recency',
        'InvoiceNo': 'Frequency',
        'TotalSum': 'MonetaryValue',
    })
)

In [None]:
### creating quartiles
# Frequency is cut into four quantile bins labelled 1 to 4 and stored as 'F'.
# NOTE(review): the RFM score cell in this file also reads 'R' and 'M'
# columns, which are not created in the visible part of the file.
f_labels = range(1, 5)
f_quartiles = pd.qcut(datamart['Frequency'], q=4, labels=f_labels)
datamart = datamart.assign(F=f_quartiles.values)

#### Build RFM Segment and RFM score

In [None]:
# RFM_Segment: the per-row result of join_rfm (presumably the concatenated
# R, F and M quartile values; join_rfm is not defined in the visible part of
# this file, so confirm against its definition).
# RFM_Score: the row-wise sum of the 'R', 'F' and 'M' quartile columns.
datamart['RFM_Segment'] = datamart.apply(join_rfm, axis='columns')
datamart['RFM_Score'] = datamart[['R', 'F', 'M']].sum(axis='columns')

In [5]:
# Printing a range shows its repr rather than the individual values
print(range(4,0,-1))

range(4, 0, -1)


In [6]:
# A bare range expression is echoed back as its repr
range(4,0,-1)

range(4, 0, -1)

In [None]:
# Calculate the size of each RFM segment and show the ten largest
(datamart.groupby('RFM_Segment')
         .size()
         .sort_values(ascending=False)
         .head(10))

In [None]:
# Select the rows in the bottom RFM segment '111' and view the first 5
datamart[datamart['RFM_Segment'].eq('111')][:5]

In [None]:
# Per RFM score: mean Recency, mean Frequency, and mean plus row count of
# MonetaryValue, rounded to one decimal place.
# The whole chain is wrapped in parentheses so that `.round(1)` can sit on
# its own line; outside brackets an indented `.round(1)` is a syntax error.
(datamart.groupby('RFM_Score')
         .agg({
             'Recency': 'mean',
             'Frequency': 'mean',
             'MonetaryValue': ['mean', 'count']})
         .round(1))

In [None]:
# Group customers into named segments
# use RFM score to group customers into Gold, Silver and Bronze segments

def segment_me(df):
    """Map a row's RFM score to a named segment.

    Reads ``df['RFM_Score']`` and returns:

    * ``'Gold'``   when the score is 9 or more,
    * ``'Silver'`` when the score is at least 5 and below 9,
    * ``'Bronze'`` otherwise.
    """
    score = df['RFM_Score']
    if score >= 9:
        return 'Gold'
    # Lower bound is inclusive (>= 5). An equality test here would send
    # scores 6-8 to Bronze even though they are below the Gold threshold.
    if score >= 5:
        return 'Silver'
    return 'Bronze'
    
# Apply segment_me to every row and store the returned segment name
datamart['General_segment'] = datamart.apply(segment_me, axis='columns')

### Data pre-processing for K-Means Clustering