In [2]:
# tabular manipulation:
import numpy as np
import pandas as pd
# visualization:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib qt
import seaborn as sns
# sklearn for scaling and clustering:
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
# environment:
from env import host, username, password

In [3]:
def get_db_url(database, host=host, user=username, password=password):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

url = get_db_url("grocery_db")

sql = """
select *
from grocery_customers
"""

df = pd.read_sql(sql, url, index_col="customer_id")
df.head()

Unnamed: 0_level_0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


#### We will need to proceed through several operational steps to achieve utility from DBSCAN:

- Select what variables/features that we wish to examine
- Scale these features (DBSCAN is going to be useful for continuous variables)
- Ensure that our features are in a numpy array for fitting DBSCAN
- Select our epsilon and min_neighbors to fit our clusters
- Use our clusters to label outliers
- Explore our clusters

In [6]:
# select what features we wish to examine
selected_feats = ['Fresh', 'Frozen', 'Delicassen']

In [7]:
# note: choosing continuous variables is going to be significantly more valuable for distance based clustering as points
# in space will not inherently snap to any given set values and density will mean more as a result

In [8]:
# note: we will treat df as if it has already been split into train, validate test and we are examining train

In [9]:
# We will create a scaler:
# Make it
# Fit it
# Use it
minmax = MinMaxScaler()

In [10]:
scaled_features = minmax.fit_transform(df[selected_feats])

In [11]:
# examine our numpy array
scaled_features

array([[0.11294004, 0.0031063 , 0.02784731],
       [0.06289903, 0.02854842, 0.03698373],
       [0.05662161, 0.03911643, 0.16355861],
       ...,
       [0.1295431 , 0.00677142, 0.03888194],
       [0.091727  , 0.01664914, 0.04426366],
       [0.02482434, 0.00065742, 0.00102211]])

In [12]:
# note: the output of the scaler is already a numpy array
# this is suitable for feeding into our DBSCAN model
type(scaled_features)

numpy.ndarray

In [13]:
# glue the scaled information back into df:
scaled_cols = [col + '_scaled' for col in selected_feats]

In [14]:
scaled_df = pd.DataFrame(scaled_features, index=df.index, columns=scaled_cols)

In [15]:
scaled_df

Unnamed: 0_level_0,Fresh_scaled,Frozen_scaled,Delicassen_scaled
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.112940,0.003106,0.027847
1,0.062899,0.028548,0.036984
2,0.056622,0.039116,0.163559
3,0.118254,0.104842,0.037234
4,0.201626,0.063934,0.108093
...,...,...,...
435,0.264829,0.215469,0.045912
436,0.349761,0.073713,0.048874
437,0.129543,0.006771,0.038882
438,0.091727,0.016649,0.044264


In [16]:
df = df.merge(scaled_df, on=df.index)

In [17]:
df.head()

Unnamed: 0,key_0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Fresh_scaled,Frozen_scaled,Delicassen_scaled
0,0,2,3,12669,9656,7561,214,2674,1338,0.11294,0.003106,0.027847
1,1,2,3,7057,9810,9568,1762,3293,1776,0.062899,0.028548,0.036984
2,2,2,3,6353,8808,7684,2405,3516,7844,0.056622,0.039116,0.163559
3,3,1,3,13265,1196,4221,6404,507,1788,0.118254,0.104842,0.037234
4,4,2,3,22615,5410,7198,3915,1777,5185,0.201626,0.063934,0.108093


In [19]:
# Create our DBSCAN model:

In [20]:
# make it, fit it, use it

In [21]:
# make the object
dbsc = DBSCAN(eps = 0.1, min_samples=20)

In [22]:
# fit it
dbsc.fit(scaled_features)

DBSCAN(eps=0.1, min_samples=20)

In [23]:
# use it:
dbsc.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [25]:
df['labels'] = dbsc.labels_

In [33]:
df[df.labels == -1].count()

key_0                18
Channel              18
Region               18
Fresh                18
Milk                 18
Grocery              18
Frozen               18
Detergents_Paper     18
Delicassen           18
Fresh_scaled         18
Frozen_scaled        18
Delicassen_scaled    18
labels               18
dtype: int64

In [26]:
df[selected_feats + ['labels']].head()

Unnamed: 0,Fresh,Frozen,Delicassen,labels
0,12669,214,1338,0
1,7057,1762,1776,0
2,6353,2405,7844,0
3,13265,6404,1788,0
4,22615,3915,5185,0


In [27]:
df.labels.value_counts()

 0    422
-1     18
Name: labels, dtype: int64

In [28]:
# We have our cluster labels now:
# Clusters: 1 (label: 0)
# Outliers: (Label: -1)

In [None]:
# Explore

In [28]:
sns.scatterplot(x='Fresh', y='Frozen', hue='labels', data=df)

<AxesSubplot:xlabel='Fresh', ylabel='Frozen'>

In [29]:
sns.scatterplot(x='Fresh', y='Delicassen', hue='labels', data=df)

<AxesSubplot:xlabel='Fresh', ylabel='Delicassen'>

In [None]:
# let's examine it on a 3D scale

In [30]:
fig = plt.figure(1, figsize=(10,10))
ax = Axes3D(fig)
ax.scatter(df.Fresh,
          df.Frozen,
          df.Delicassen,
          c=df.labels,
          edgecolor='k')

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])

ax.set_xlabel('Fresh')
ax.set_ylabel('Milk')
ax.set_zlabel('Grocery')

Text(0.5, 0, 'Grocery')

In [34]:
# Initial Takeaways:
#  - We have at least ~18 points that are definite outliers
#  - We have some data points that may or may not belong to the main cluster
#  - There appears to be one large cluster, no need to additional clusters
#  - Try and capture additional points that look likely to be part of the cluster but may be ruled out due to hyperparameters