# Unsupervised learning - Clustering

K-means clustering assigns each data point, described by all of its variables, to one of a specified number of clusters. Clustering aims to keep the cluster centroids (means) as far apart from each other as possible, while also minimising the distance of each member from its own centroid (the within-cluster variance).

### Import libraries

In [None]:
# import libraries
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

### Read CSV file

In [None]:
# Load the aggregated dataset; the first row of the file holds the column names.
df = pd.read_csv('SQL_aggregated_data.csv', header=0)

### Explore data

In [None]:
# Preview the first three rows of the raw data.
df.head(n=3)

In [None]:
# Report the size of the dataset as (number of rows, number of columns).
df.shape

### Pull out only those retailers from the future period

In [None]:
# Keep only the rows from the future period. It is still unclear how trial and
# future data from the same retailer should be combined (sum or mean?), so the
# trial rows are left out here.
df = df[df['period'] == 'future']

# Collapse to one row per retailer (CustomAttribute1). According to the original
# note each retailer appears only once in the future period, so the mean leaves
# the values unchanged and mainly moves the retailer into the index.
# numeric_only=True skips text columns such as 'period'; without it pandas >= 2.0
# raises a TypeError instead of silently dropping them as older versions did.
df = df.groupby('CustomAttribute1').mean(numeric_only=True)

In [None]:
# 'Profit' is not used in the analysis and 'total_conversionvalue' duplicates
# the revenue column, so both are removed before clustering.
unused_columns = ['Profit', 'total_conversionvalue']
df = df.drop(columns=unused_columns)

In [None]:
# Summary statistics for every remaining feature.
summary_stats = df.describe()
summary_stats

## Rescale features to the range [0, 1] before clustering

In [None]:
# Min-max rescale every feature for the k-means cluster analysis, keeping the
# original row index and column names on the scaled frame.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df_scaledClst = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

### Check for the optimal number of clusters (1-20) with the Elbow Method

In [None]:
# Fit k-means for k = 1..20 clusters and record, for each k, the inertia: the
# sum of the squared distances of every data point from its closest cluster
# centroid. These values feed the elbow plot in the next cell.
from sklearn.cluster import KMeans

# k-means cannot produce more clusters than there are rows, so cap the upper
# bound at the number of retailers. range() excludes its stop value, hence +1
# to really include k = 20.
max_k = min(20, len(df_scaledClst))
K = range(1, max_k + 1)

Sum_of_squared_distances = []
for k in K:
    # A fixed random_state makes the elbow curve reproducible between runs;
    # n_init is set explicitly so the result does not depend on the
    # scikit-learn version's default.
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    km = km.fit(df_scaledClst)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
# Plot elbow
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

### Assign clusters to non-rescaled data (for visualization)

In [None]:
# Fit k-means with 4 clusters, the value read off the elbow plot: we want the
# smallest number of clusters that still gives a low sum of squared distances.
# random_state pins the centroid initialisation so the cluster labels stay the
# same between runs; n_init is explicit so the result does not depend on the
# scikit-learn version's default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0).fit(df_scaledClst)

# Attach the labels to a copy of the non-rescaled data so that later tables and
# plots show the original units. labels_ holds the assignment of the training
# rows in row order, so no second pass through predict() is needed.
df_Clst = df.copy()
df_Clst['cluster'] = kmeans.labels_

In [None]:
# Preview the first three retailers together with their assigned cluster.
df_Clst.head(n=3)

### Check for clustering criteria

In [None]:
# Average every feature over the retailers belonging to each cluster, giving
# one row of mean values per cluster.
cluster_means = df_Clst.groupby('cluster').mean()
cluster_means

### Number of retailers per cluster

In [None]:
# Count the retailers in each cluster; missing labels, if any, are counted too.
cluster_sizes = df_Clst['cluster'].value_counts(dropna=False)
cluster_sizes

### Check high-revenue cluster retailers

In [None]:
# Cluster labels are arbitrary and may differ on each pass, so a hard-coded
# label cannot be relied on to pick the same group. Select the high-revenue
# cluster by its mean Revenue instead (in the original run this was the
# cluster with only 8 members).
high_revenue_cluster = df_Clst.groupby('cluster')['Revenue'].mean().idxmax()
df_Clst[df_Clst['cluster'] == high_revenue_cluster]

## 2D plot

In [None]:
# Example 2D view: impressions against revenue on log-log axes, with one
# colour per cluster.
varX = 'total_impressions'
varY = 'Revenue'

fig, ax = plt.subplots(figsize=(12, 10))
scatter = ax.scatter(
    df_Clst[varX],
    df_Clst[varY],
    c=df_Clst['cluster'],
    s=60,
    cmap='Set1',
)

# One legend entry per cluster label found in the scatter colours.
handles, labels = scatter.legend_elements()
ax.legend(handles, labels, loc="lower right", title="Classes")

# Logarithmic axes with fixed display limits.
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlim(10, 50000000)
ax.set_ylim(0.01, 1000000)
ax.set_xlabel(varX)
ax.set_ylabel(varY)

## Interactive 3D plot

In [None]:
# Example 3D view: log10(clicks per day) and average CTR against
# log10(revenue), with one colour per cluster.
# The import is kept because older matplotlib versions need it to register the
# '3d' projection; newer versions register it automatically.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
#%matplotlib qt   # this is for an interactive plot, it may or may not work on your python setup
fig = plt.figure(figsize=(12,10))
# Create the 3D axes through the figure. Constructing Axes3D(fig) directly no
# longer attaches the axes to the figure on current matplotlib releases, which
# leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')
varX='clicks_per_day'
varY='avg_Ctr'
varZ='Revenue'
# Log-transform the two heavily skewed variables by hand, so the tick values
# are exponents. Note that np.log10 maps zero values to -inf.
xx = np.log10(df_Clst[varX])
yy = df_Clst[varY]
zz = np.log10(df_Clst[varZ])
ax.scatter(xx, yy, zz, c=df_Clst['cluster'], s=60, cmap='Set1')
# Camera position: 35 degrees elevation, 115 degrees azimuth.
ax.view_init(35, 115)

# Label through the Axes object so the text always lands on the 3D axes
# rather than on whatever pyplot considers the current axes.
ax.set_xlabel('log '+varX)
ax.set_ylabel(varY)
ax.set_zlabel('log '+varZ)

ax.set_ylim(0,7)