# Part 2: Segmentation with Clustering

In [None]:
import datetime
import numpy as np
import pandas as pd
import scipy

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In this tutorial, we are going to apply several clustering algorithms and, most importantly, analyze and interpret the results. We are going to use the cleaned and preprocessed data from our first class. However, the data is not yet fully ready for use: we first need to convert it to a customer-level dataset.

In [None]:
# Read the cleaned transaction data from disk.
data = pd.read_csv('data/data_cleared.csv')

# Store invoice numbers with object dtype and parse the invoice timestamps.
data = data.assign(
    InvoiceNo=data['InvoiceNo'].astype('O'),
    InvoiceDate=pd.to_datetime(data['InvoiceDate']),
)
data.head()

For the first model, we are going to create an RFM table as we did [in the previous lesson.](https://github.com/Tigran-Karamyan/customer_behaviour/blob/master/Week_6_Basics_of_Segmentation_RFM.ipynb) 

In [None]:
# Base table for RFM: total spend per customer and invoice timestamp.
dt = (
    data
    .groupby(['CustomerID', 'InvoiceDate'], as_index=False)['TotalPrice']
    .sum()
)
dt.head()

In [None]:
# Reference date against which recency is measured.
now = datetime.datetime(2011, 12, 10)

# One row per customer:
#   recency   - days between `now` and the customer's latest invoice date
#   frequency - number of rows the customer has in `dt`
#   monetary  - mean TotalPrice over those rows
rfm = (
    dt.groupby('CustomerID')
      .agg({'InvoiceDate': lambda dates: (now - dates.max()).days,
            'CustomerID': 'count',
            'TotalPrice': 'mean'})
      .rename(columns={'InvoiceDate': 'recency',
                       'CustomerID': 'frequency',
                       'TotalPrice': 'monetary'})
      .reset_index()
)
rfm.head()

In [None]:
# Compute the Pearson correlation coefficients between the RFM features.
# CustomerID is an identifier, not a feature, so it is dropped before the
# correlation matrix is built; otherwise it shows up as a row/column of the heatmap.
plt.figure(figsize = (10, 8))
s = sns.heatmap(rfm.drop(columns='CustomerID').corr(),
               annot = True, 
               cmap = 'RdBu',
               vmin = -1, 
               vmax = 1)

s.set_yticklabels(s.get_yticklabels(), rotation = 0, fontsize = 12)
s.set_xticklabels(s.get_xticklabels(), rotation = 90, fontsize = 12)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Standardize the RFM features; the customer identifier is not a feature.
cluster_data = rfm.drop('CustomerID', axis=1)
scaler = StandardScaler()
data_stand = scaler.fit_transform(X=cluster_data)

In [None]:
# Display the unscaled feature table (rfm without the CustomerID column).
cluster_data

In [None]:
# Display the output of scaler.fit_transform for comparison with the raw values.
data_stand

In [None]:
# Wrap the standardized array in a DataFrame so its columns carry feature names.
x = pd.DataFrame(data=data_stand,
                 columns=['recency', 'frequency', 'monetary'])

## ${\textbf{Hierarchical Clustering}}$

Hierarchical clustering starts by treating each observation as a separate cluster. Then, it repeatedly executes the following two steps: 
1. identify the two clusters that are closest together 
2. merge these two clusters. This iterative process continues until all the clusters are merged into a single cluster.

The main output of Hierarchical Clustering is a dendrogram, which shows the hierarchical relationship between the clusters.

[Ward Method](https://jbhender.github.io/Stats506/F18/GP/Group10.html)

In [None]:
# Build the hierarchical clustering linkage matrix with Ward's method.
hier_clust = linkage(data_stand, 'ward')

In [None]:
# Draw the hierarchical clustering result as a dendrogram.
# The tree is truncated by level (p = 10) and leaf labels are hidden.
plt.figure(figsize=(12, 9))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Observations')
plt.ylabel('Distance')
dendrogram(hier_clust, p=10, truncate_mode='level', no_labels=True)
plt.show()

The dendrogram shows that we have 3 distinct clusters. Hierarchical clustering is neither the most popular nor the most advanced segmentation method, but it gives us some preliminary insight into our data and customers, and it helps us understand how to use other machine learning algorithms such as K-Means.

## ${\textbf{K-means Clustering}}$

To process the learning data, the K-means algorithm in data mining starts with a first group of randomly selected centroids, which are used as the beginning points for every cluster, and then performs iterative (repetitive) calculations to optimize the positions of the centroids. 

[About Silhouette Score](https://towardsdatascience.com/silhouette-coefficient-validating-clustering-techniques-e976bb81d10c#:~:text=Silhouette%20Coefficient%20or%20silhouette%20score%20is%20a%20metric%20used%20to,each%20other%20and%20clearly%20distinguished.)

In [None]:
# Fit K-means for k = 1..10 and record the Within Cluster Sum of Squares
# (the fitted model's inertia_) for each k.
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data_stand)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: WCSS against the number of clusters.
plt.figure(figsize=(10, 8))
plt.plot(range(1, 11), wcss, linestyle='--', marker='o')
plt.title('K-means Clustering')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final RFM model: 4 clusters, k-means++ initialisation, fixed seed.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',
    random_state=42,
)
kmeans.fit(data_stand)

Using the Elbow method, we see that we can distinguish 4 clusters. Now let's interpret the results.

### ${\textbf{Results}}$

In [None]:
# Attach each customer's cluster id to a copy of the unscaled features.
segm_labels = cluster_data.assign(Segments=kmeans.labels_)
segm_labels.head()

In [None]:
# Segment profile: mean of every feature within each cluster.
profiling = (
    segm_labels
    .groupby(['Segments'], as_index=False)
    .mean()
)
profiling

In [None]:
# Human-readable names for the K-means cluster ids.
segment_names = {0: 'promising',
                 1: 'champions',
                 2: 'lost',
                 3: 'high spenders'}

# Name both sides first; values that are already names pass through unchanged.
# Sizes are then matched to profile rows by segment (not by row position), and
# re-running this cell cannot turn the labels into NaN.
profile_segments = profiling['Segments'].map(lambda label: segment_names.get(label, label))
segment_sizes = (segm_labels['Segments']
                 .map(lambda label: segment_names.get(label, label))
                 .value_counts())

# Create new columns: Segment size and Segment proportion (percent of all customers)
profiling['Segment_size'] = profile_segments.map(segment_sizes)
profiling['Segment_prop'] = (profiling['Segment_size'] / profiling['Segment_size'].sum() * 100).round(2)

# Add the segment labels to our table
profiling['Segments'] = profile_segments

profiling

In [None]:
# Replace the numeric cluster ids with descriptive segment names.
segment_names = {0: 'promising',
                 1: 'champions',
                 2: 'lost',
                 3: 'high spenders'}

# .get(label, label) leaves already-named rows untouched, so running this cell
# a second time does not overwrite the column with NaN (as a plain dict map would).
segm_labels['Segments'] = segm_labels['Segments'].map(lambda label: segment_names.get(label, label))

segm_labels.head()

In [None]:
# Monetary value vs. frequency, coloured by segment.
x_axis = segm_labels['monetary']
y_axis = segm_labels['frequency']
plt.figure(figsize = (10, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.scatterplot(x = x_axis, y = y_axis, hue = segm_labels['Segments'], palette = ['g', 'r', 'c', 'm'])
plt.title('Segmentation K-means')
plt.show()

In [None]:
# Monetary value vs. recency, coloured by segment.
x_axis = segm_labels['monetary']
y_axis = segm_labels['recency']
plt.figure(figsize = (10, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.scatterplot(x = x_axis, y = y_axis, hue = segm_labels['Segments'], palette = ['g', 'r', 'c', 'm'])
plt.title('Segmentation K-means')
plt.show()

In [None]:
# Frequency vs. recency, coloured by segment.
x_axis = segm_labels['frequency']
y_axis = segm_labels['recency']
plt.figure(figsize = (10, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.scatterplot(x = x_axis, y = y_axis, hue = segm_labels['Segments'], palette = ['g', 'r', 'c', 'm'])
plt.title('Segmentation K-means')
plt.show()

# Add more features

Now let's add some more features and see whether the results change.

In [None]:
# Per customer and invoice timestamp: total spend, total quantity,
# and the number of InvoiceNo entries in the group.
dt = (
    data
    .groupby(['CustomerID', 'InvoiceDate'], as_index=False)
    .agg({'TotalPrice': 'sum', 'Quantity': 'sum', 'InvoiceNo': 'count'})
)

# Roll the table up to one row per customer.
# NOTE(review): GapBetweenOrders is the number of days between the customer's
# first and last invoice date (a span), not an average gap between consecutive
# orders - confirm that this is the intended feature.
customer_data = dt.groupby(['CustomerID']).agg(
    AvgQuantity=('Quantity', 'mean'),
    AvgDifferentProducts=('InvoiceNo', 'mean'),
    Recency=('InvoiceDate', lambda dates: (now - dates.max()).days),
    Frequency=('CustomerID', 'count'),
    Monetary_Value=('TotalPrice', 'mean'),
    GapBetweenOrders=('InvoiceDate', lambda dates: (dates.max() - dates.min()).days),
)

customer_data.head()

In [None]:
# Re-fit the scaler on the extended customer feature set.
data_stand = scaler.fit_transform(X=customer_data)

# Record the inertia (WCSS) of K-means for k = 1..10.
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data_stand)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot for the extended feature set: WCSS against the number of clusters.
plt.figure(figsize=(10, 8))
plt.plot(range(1, 11), wcss, linestyle='--', marker='o')
plt.title('K-means Clustering')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final model on the extended features: 3 clusters, fixed seed.
kmeans = KMeans(
    n_clusters=3,
    random_state=42,
)
kmeans.fit(data_stand)

In [None]:
# Attach each customer's cluster id to a copy of the unscaled features.
segm_labels = customer_data.assign(Segments=kmeans.labels_)
segm_labels.head()

In [None]:
# Segment profile: mean of every feature within each cluster.
profiling = segm_labels.groupby(['Segments'], as_index=False).mean()

# Look the sizes up by segment value instead of relying on the profile's row
# index happening to equal the cluster ids.
segment_sizes = segm_labels['Segments'].value_counts()
profiling['Segment_size'] = profiling['Segments'].map(segment_sizes)
# Share of all customers in each segment, in percent.
profiling['Segment_prop'] = (profiling['Segment_size'] / profiling['Segment_size'].sum() * 100).round(2)

profiling

In [None]:
# Add the segment labels to our table
segment_names = {0: 'lost',
                 1: 'promising',
                 2: 'champions'}

# .get(label, label) leaves already-named rows untouched, so running this cell
# a second time does not overwrite the column with NaN (as a plain dict map would).
profiling['Segments'] = profiling['Segments'].map(lambda label: segment_names.get(label, label))

profiling

In [None]:
# Replace the numeric cluster ids with descriptive segment names.
segment_names = {0: 'lost',
                 1: 'promising',
                 2: 'champions'}

# .get(label, label) leaves already-named rows untouched, so running this cell
# a second time does not overwrite the column with NaN (as a plain dict map would).
segm_labels['Segments'] = segm_labels['Segments'].map(lambda label: segment_names.get(label, label))

segm_labels.head()

In [None]:
# Monetary value vs. average quantity, coloured by segment.
x_axis = segm_labels['Monetary_Value']
y_axis = segm_labels['AvgQuantity']
plt.figure(figsize = (10, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.scatterplot(x = x_axis, y = y_axis, hue = segm_labels['Segments'], palette = ['g', 'r', 'c'])
plt.title('Segmentation K-means')
plt.show()

In [None]:
# Recency vs. frequency, coloured by segment.
x_axis = segm_labels['Recency']
y_axis = segm_labels['Frequency']
plt.figure(figsize = (10, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.scatterplot(x = x_axis, y = y_axis, hue = segm_labels['Segments'], palette = ['g', 'r', 'c'])
plt.title('Segmentation K-means')
plt.show()

In [None]:
# GapBetweenOrders vs. frequency, coloured by segment.
x_axis = segm_labels['GapBetweenOrders']
y_axis = segm_labels['Frequency']
plt.figure(figsize = (10, 8))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.scatterplot(x = x_axis, y = y_axis, hue = segm_labels['Segments'], palette = ['g', 'r', 'c'])
plt.title('Segmentation K-means')
plt.show()
