In this programme we want to create segments of customers based on relevant buying behaviour. We are using a data set from here: http://archive.ics.uci.edu/ml/machine-learning-databases/00352/
More info on the data set can be found here: http://archive.ics.uci.edu/ml/datasets/Online+Retail

Each transaction has a customer ID and associated information about that purchase, such as the price and quantity of the products bought.

In [130]:
import tensorflow as tf
import pandas as pd
import numpy as np
import json
import itertools
from sklearn.mixture import GMM
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

#Load the customer data file into a pandas DataFrame
try:
    datafile = pd.read_csv('Retail_Data.csv')
except Exception:
    #Everything below needs datafile, so report the problem and stop here
    #instead of carrying on to an unrelated NameError further down
    print("The data cannot load")
    raise

#Create a placeholder for the no of segments we want to create
#NOTE(review): this is declared as float32 although a cluster count is a whole
#number, and it is not referenced by the cells that follow - confirm it is needed
no_clusters = tf.placeholder(tf.float32, name = 'no_clusters')

#View basic info about the data (count, mean, std, min, quartiles, max per column)
stats = datafile.describe()
stats



Unnamed: 0,CustomerID,total spend,No. of purchases,Average value of purchase
count,4372.0,4372.0,4372.0,4372.0
mean,15299.677722,322.008226,93.053294,7.978683
std,1722.390705,1284.783098,232.471608,144.963316
min,12346.0,0.0,1.0,0.0
25%,13812.75,52.865,17.0,2.22
50%,15300.5,130.095,42.0,2.94
75%,16778.25,302.3325,102.0,3.9
max,18287.0,41376.33,7983.0,8055.78


In [27]:
#Pick a handful of rows, by index label, to inspect by eye
sample_rows = [1, 20, 58, 287, 1025, 4000]
samples = datafile.loc[sample_rows]
print(samples)

      CustomerID  total spend  No. of purchases  Average value of purchase
1          13047       798.30               196                       4.07
20         13408      1614.50               501                       3.22
58         15485       273.62                86                       3.18
287        17059      1371.39               516                       2.66
1025       15059       754.77               239                       3.16
4000       13607       308.76                98                       3.15


In [28]:
#Get the titles of the columns (a pandas Index of column labels)
#The outlier step further down looks columns up by position in this Index
column_names = datafile.columns
print(column_names)


Index(['CustomerID', 'total spend', 'No. of purchases',
       'Average value of purchase'],
      dtype='object')


In [30]:
#Apply a natural log scale to the data so it has a better distribution
#Every column is transformed, CustomerID included
#NOTE(review): np.log(0) is -inf, and the summary statistics above show a minimum
#of 0 for 'total spend' and 'Average value of purchase' - confirm how those rows
#are meant to be handled
log_data = np.log(datafile)
log_samples = np.log(samples)
print(log_samples)



      CustomerID  total spend  No. of purchases  Average value of purchase
1       9.476314     6.682484          5.278115                   1.403643
20      9.503607     7.386781          6.216606                   1.169381
58      9.647627     5.611740          4.454347                   1.156881
287     9.744433     7.223580          6.246107                   0.978326
1025    9.619731     6.626413          5.476464                   1.150572
4000    9.518340     5.732564          4.584967                   1.147402


  


Assign the value of the 25th percentile for the given feature to Q1. Use np.percentile for this.
Assign the value of the 75th percentile for the given feature to Q3. Again, use np.percentile.
Calculate the outlier step for the given feature (1.5 times the interquartile range, Q3 - Q1) and assign it to step.
Optionally remove data points from the dataset by adding indices to the outliers list.

In [132]:
#Flag outliers in each feature with the 1.5 * IQR rule, then drop those rows

#Containers for the per-feature results, keyed by column name where applicable
outliers_list = []
q1_list = {}
q3_list = {}
step_list = {}

#Columns 1 to 3 are the features; column 0 (CustomerID) is skipped
for feature in column_names[1:4]:
    values = log_data[feature]
    #25th and 75th percentiles of this feature
    q1, q3 = np.percentile(values, [25, 75])
    q1_list[feature] = [q1]
    q3_list[feature] = [q3]
    #The step is 1.5 times the interquartile range
    step = 1.5*(q3 - q1)
    step_list[feature] = step
    #Any row outside [q1 - step, q3 + step] is an outlier for this feature
    in_range = values.between(q1 - step, q3 + step)
    outliers_list.append(list(values[~in_range].index))

#Flatten the per-feature lists into one list of row labels, then de-duplicate
#because a row can be an outlier in more than one feature
outliers = [row for rows in outliers_list for row in rows]
u_outliers = list(set(outliers))

print("Q1 thresholds", q1_list)
print("Q3 thresholds", q3_list)
print("List of steps", step_list)
print("list of unique outliers", u_outliers)

#Remove every row that was an outlier in at least one feature
good_data = log_data.drop(u_outliers)

Q1 thresholds {'total spend': [3.9677404060726249], 'No. of purchases': [2.8332133440562162], 'Average value of purchase': [0.79750719588418817]}
Q3 thresholds {'total spend': [5.7115265934774113], 'No. of purchases': [4.6249728132842707], 'Average value of purchase': [1.3609765531356006]}
List of steps {'total spend': 2.6156792811071794, 'No. of purchases': 2.6876392038420818, 'Average value of purchase': 0.84520403587711868}
list of unique outliers [514, 4, 3589, 518, 7, 8, 9, 1547, 524, 13, 2059, 4111, 3089, 1042, 19, 2067, 4114, 4118, 4122, 3099, 1051, 3101, 3102, 542, 3105, 2082, 1571, 3110, 2599, 1574, 1578, 2602, 557, 3117, 559, 48, 3629, 2615, 4156, 2622, 1090, 67, 3140, 69, 3652, 1608, 1096, 1611, 2126, 4178, 1106, 2643, 3154, 598, 604, 93, 3677, 1116, 1630, 3166, 4190, 3172, 4196, 4200, 108, 2157, 3692, 4210, 3193, 122, 2685, 1150, 1666, 132, 1669, 1668, 2181, 4231, 4232, 3211, 1165, 2702, 141, 1169, 2196, 4244, 1688, 2713, 1177, 2203, 2202, 3228, 1182, 671, 3743, 4258, 2729,

Now we will use k-means clustering to find segments in good_data and get a list of customer IDs for each of the segments. We will use the number of clusters with the greatest silhouette score.

In [183]:
#Remove customer ID so it isnt used for the clustering
#axis is passed by keyword: newer pandas releases no longer accept it positionally
final_data = good_data.drop('CustomerID', axis=1)

#Fit k-means with 2 clusters and assign every customer to one of them
#NOTE(review): no random_state is set, so which cluster receives label 0 may
#differ between runs - confirm whether the segments need to be reproducible
kmeans = KMeans(n_clusters=2).fit(final_data)
preds = kmeans.predict(final_data)
#Silhouette score of this clustering (computed here but not displayed)
score = silhouette_score(final_data, preds, metric='euclidean')
centers = kmeans.cluster_centers_

#The clustering ran on log-scaled data, so exponentiate the centres to read
#them in the original units
true_centers = np.exp(centers)
print(true_centers)

#Get the indices for the 2 segments
#np.where returns a one-element tuple holding the array of matching positions
segment_1_index = np.where(preds == 0)
segment_2_index = np.where(preds == 1)

#Select each segment's rows (CustomerID included) by position and undo the
#log transform
segment_1_customer_list = good_data.iloc[segment_1_index[0]]
segment_1_customers = np.exp(segment_1_customer_list)

segment_2_customer_list = good_data.iloc[segment_2_index[0]]
segment_2_customers = np.exp(segment_2_customer_list)

segment_1_customers.describe()

[[ 347.83379466  117.24490576    2.96665568]
 [  50.216463     17.07382497    2.94127452]]


Unnamed: 0,CustomerID,total spend,No. of purchases,Average value of purchase
count,2033.0,2033.0,2033.0,2033.0
mean,15275.615839,456.911654,155.371864,3.14361
std,1715.56656,413.975357,146.509306,1.133583
min,12347.0,89.85,28.0,0.98
25%,13798.0,204.02,68.0,2.37
50%,15271.0,305.96,105.0,2.93
75%,16768.0,543.38,180.0,3.69
max,18287.0,3852.33,1284.0,9.05


In [185]:
#Summary statistics for the customers assigned to cluster label 1
segment_2_customers.describe()

Unnamed: 0,CustomerID,total spend,No. of purchases,Average value of purchase
count,2065.0,2065.0,2065.0,2065.0
mean,15323.720581,64.15172,21.308959,3.260305
std,1712.60672,39.778217,12.773308,1.520408
min,12348.0,4.05,2.0,0.96
25%,13859.0,31.05,11.0,2.12
50%,15325.0,58.35,20.0,2.95
75%,16778.0,92.51,30.0,4.02
max,18282.0,203.45,67.0,8.99
