In [None]:
import datetime
import numpy as np
import pandas as pd
import scipy

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In this tutorial, we are going to apply several clustering algorithms and, most importantly, analyze and interpret the results. We will use the cleaned and preprocessed data from our first class. However, the data is not yet fully ready for use: we first need to convert it into a customer-level dataset.

In [None]:
# Read the cleaned transactions, then fix the two column types in one chained step:
# invoice numbers are stored as generic objects and invoice dates as real timestamps.
data = pd.read_csv('data/data_cleared.csv').assign(
    InvoiceNo=lambda frame: frame['InvoiceNo'].astype('O'),
    InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']),
)
data.head()

For the first model, we are going to create an RFM table as we did [in the previous lesson.](https://github.com/LilitYolyan/customer_behavior_analysis/blob/master/Week_4_Basics_of_Segmentation_RFM.ipynb) 

In [None]:
# First step of the RFM table: collapse the line items into one row per
# (customer, invoice timestamp) holding the total value of that order.
order_keys = ['CustomerID', 'InvoiceDate']
dt = data.groupby(order_keys, as_index=False)['TotalPrice'].sum()
dt.head()

In [None]:
# Reference date used to measure how long ago each customer last ordered.
now = datetime.datetime(2011, 12, 10)

# One row per customer:
#   recency   - whole days between `now` and the latest order
#   frequency - number of order rows for the customer in `dt`
#   monetary  - mean order value
rfm = (
    dt.groupby('CustomerID')
      .agg(recency=('InvoiceDate', lambda date: (now - date.max()).days),
           frequency=('CustomerID', 'count'),
           monetary=('TotalPrice', 'mean'))
      .reset_index()
)
rfm.head()

In [None]:
# Compute Pearson correlation coefficient for the features in our data set.
# CustomerID is an identifier, not a feature, so it is left out of the matrix:
# its correlation with the RFM columns carries no meaning.
rfm_features = rfm.drop(columns='CustomerID')

plt.figure(figsize = (10, 8))
s = sns.heatmap(rfm_features.corr(),
               annot = True, 
               cmap = 'RdBu',
               vmin = -1, 
               vmax = 1)

# Keep the tick labels readable: horizontal on the y-axis, vertical on the x-axis.
s.set_yticklabels(s.get_yticklabels(), rotation = 0, fontsize = 12)
s.set_xticklabels(s.get_xticklabels(), rotation = 90, fontsize = 12)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Standardize the RFM columns; the customer id is dropped first because it is not a feature.
cluster_data = rfm.drop('CustomerID', axis=1)
scaler = StandardScaler()
data_stand = scaler.fit(cluster_data).transform(cluster_data)

# Add more features

Now let's add some more features and see whether the results change.

In [None]:
def _days_since_last_order(dates):
    """Whole days between the reference date `now` and the latest date in `dates`."""
    return (now - dates.max()).days


def _days_first_to_last_order(dates):
    """Whole days between the earliest and the latest date in `dates`."""
    return (dates.max() - dates.min()).days


# Order level: one row per (customer, invoice timestamp) with the order value,
# the number of units and the number of invoice lines.
dt = data.groupby(['CustomerID', 'InvoiceDate'], as_index=False).agg(
    TotalPrice=('TotalPrice', 'sum'),
    Quantity=('Quantity', 'sum'),
    InvoiceNo=('InvoiceNo', 'count'),
)

# Customer level: averages over the customer's orders plus the RFM-style measures.
# GapBetweenOrders is the span from the first to the last order, in days.
customer_data = dt.groupby(['CustomerID']).agg(
    AvgQuantity=('Quantity', 'mean'),
    AvgDifferentProducts=('InvoiceNo', 'mean'),
    Recency=('InvoiceDate', _days_since_last_order),
    Frequency=('CustomerID', 'count'),
    Monetary_Value=('TotalPrice', 'mean'),
    GapBetweenOrders=('InvoiceDate', _days_first_to_last_order),
)

customer_data.head()

In [None]:
# Standardize the extended customer-level features.
# A fresh scaler is created here so this cell does not depend on (or silently refit)
# the scaler that was fitted on the RFM table in an earlier cell.
scaler = StandardScaler()
data_stand = scaler.fit_transform(customer_data)

In [None]:
# Sanity check on the standardized array: one row per customer, one column per engineered feature.
data_stand.shape

### ${\textbf{PCA}}$

Principal Component Analysis, or PCA, is a dimensionality-reduction method that is often used to reduce the dimensionality of large data sets, by transforming a large set of variables into a smaller one that still contains most of the information in the large set. 

In [None]:
# Fit PCA with every component kept, then look at the share of variance each one explains.
pca = PCA().fit(data_stand)
pca.explained_variance_ratio_

In [None]:
# Cumulative share of variance explained as components are added one by one.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

plt.figure(figsize = (12,9))
# The x-axis is derived from the fitted PCA rather than hard-coded, so the plot
# keeps working when features are added to or removed from the input table.
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker = 'o', linestyle = '--')
plt.title('Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

As a rule of thumb, we keep the smallest number of components whose cumulative explained variance reaches about 80%.

In [None]:
# Refit PCA, this time keeping only the first three components.
pca = PCA(n_components = 3).fit(data_stand)
pca

In [None]:
# Loadings of the fitted PCA: one row per retained component, one column per input feature.
pca.components_

### ${\textbf{PCA Results}}$

In [None]:
# Label the loadings so they can be read: rows are components, columns are the original features.
component_labels = ['Component 1', 'Component 2', 'Component 3']
feature_labels = list(customer_data.columns)

df_pca_comp = pd.DataFrame(pca.components_, index = component_labels, columns = feature_labels)
df_pca_comp

In [None]:
# Heatmap of the component loadings (rows: components, columns: original features).
ax = sns.heatmap(df_pca_comp,
                 vmin = -1, 
                 vmax = 1,
                 cmap = 'RdBu',
                 annot = True)

# The row labels already come from the DataFrame index and seaborn centres them on
# each row, so we only restyle them. Forcing tick positions 0, 1, 2 would move the
# labels onto the cell edges instead of the cell centres.
ax.set_yticklabels(ax.get_yticklabels(), rotation = 45, fontsize = 9)
plt.show()

In [None]:
# Project every customer onto the retained principal components and check the resulting shape.
pca_results = pca.transform(data_stand)
pca_results.shape

### ${\textbf{K-means clustering with PCA}}$

In [None]:
# We fit K means using the transformed data from the PCA, once for every
# number of clusters from 1 to 10, and record the within-cluster sum of
# squares (inertia) of each fit for the elbow plot.
# n_init is set explicitly because scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit gives different results on different versions.
wcss = []
for i in range(1,11):
    kmeans_pca = KMeans(n_clusters = i, init = 'k-means++', n_init = 10, random_state = 42)
    kmeans_pca.fit(pca_results)
    wcss.append(kmeans_pca.inertia_)

In [None]:
# Plot the Within Cluster Sum of Squares for the K-means PCA model. Here we make a decision about the number of clusters:
# we look for the "elbow" after which adding clusters no longer reduces WCSS by much.
# It looks like four is the best option.

# wcss holds one value per cluster count starting at 1, so the x-axis is derived
# from its length and cannot drift out of sync with the fitting loop.
cluster_counts = range(1, len(wcss) + 1)

plt.figure(figsize = (10,8))
plt.plot(cluster_counts, wcss, marker = 'o', linestyle = '--')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.title('K-means with PCA Clustering')
plt.show()

In [None]:
# Final model: four clusters on the PCA scores.
# n_init is pinned to 10 because scikit-learn changed its default to 'auto';
# stating it keeps the segmentation the same across library versions.
kmeans_pca = KMeans(n_clusters = 4, init = 'k-means++', n_init = 10, random_state = 42)
kmeans_pca.fit(pca_results)

In [None]:
# Put the component scores next to the original customer features and attach the cluster id.
# The customer index is reset (and dropped) so both frames line up row by row.
component_cols = ['Component 1', 'Component 2', 'Component 3']
pca_frame = pd.DataFrame(pca_results, columns = component_cols)

final_data = (
    pd.concat([customer_data.reset_index(drop = True), pca_frame], axis = 1)
      .assign(Segments = kmeans_pca.labels_)
)

In [None]:
# Inspect the combined table: customer features, component scores and the segment id.
final_data

In [None]:
# Copy of the customer-level table (CustomerID index kept) with the cluster id attached.
segm_labels = customer_data.assign(Segments = kmeans_pca.labels_)
segm_labels.head()

In [None]:
# Segment profile: the mean of every column per segment, plus each segment's
# size and its share of all customers (in percent, two decimals).
segment_groups = final_data.groupby('Segments')

profiling = segment_groups.mean().reset_index()

# Both results come from the same groupby and are therefore in the same segment
# order, so the sizes are assigned by position. Assigning the Series directly
# would align on index labels, which only works while the segment ids happen
# to equal the row numbers 0..k-1.
profiling['Segment_size'] = segment_groups.size().to_numpy()
profiling['Segment_prop'] = (profiling['Segment_size'] / profiling['Segment_size'].sum() * 100).round(2)

profiling

In [None]:
# Add the segment labels to our table.
# NOTE(review): this id -> name assignment presumably comes from reading the
# profiling table of this particular fit; re-check it whenever the clustering changes.
segment_names = {0:'promising', 
                 1:'new_and_promising',
                 2:'champions', 
                 3:'lost'}

# Values that are not cluster ids (for example names written by an earlier run of
# this cell) are kept as they are, so re-running the cell does not turn the
# column into NaN.
profiling['Segments'] = profiling['Segments'].map(lambda seg: segment_names.get(seg, seg))

profiling

In [None]:
# Add the segment labels to our table.
# NOTE(review): this id -> name assignment presumably comes from reading the
# profiling table of this particular fit; re-check it whenever the clustering changes.
segment_names = {0:'promising', 
                 1:'new_and_promising',
                 2:'champions', 
                 3:'lost'}

# Values that are not cluster ids (for example names written by an earlier run of
# this cell) are kept as they are, so re-running the cell does not turn the
# column into NaN.
final_data['Segments'] = final_data['Segments'].map(lambda seg: segment_names.get(seg, seg))

final_data.head()

In [None]:
# Scatter the customers in the plane of the first two principal components,
# coloured by segment (Component 2 on the x-axis, Component 1 on the y-axis).
x_axis = final_data['Component 2']
y_axis = final_data['Component 1']
plt.figure(figsize = (10, 8))
# x and y must be passed by keyword: recent seaborn versions no longer accept
# them as positional arguments.
sns.scatterplot(x = x_axis, y = y_axis, hue = final_data['Segments'], palette = ['g', 'r', 'c', 'm'])
plt.title('Clusters by PCA Components')
plt.show()

As we can see, PCA helps us obtain better-quality segments with clearer differences between them. We can now use these segments to design a better business strategy or make more personalized decisions.