In [172]:
# Numerical arrays and column-wise statistics
import numpy as np

#Using Bokeh 0.12.5 and Python 3.6
import warnings
# Silences every warning raised for the rest of the session, library warnings included
warnings.filterwarnings('ignore')
from bokeh.io import output_notebook, show
from bokeh.models import Select,CustomJS
from bokeh.layouts import row,column
from bokeh.plotting import figure, ColumnDataSource
# Have Bokeh render its plots inline in the notebook output
output_notebook()

In [173]:
#Read the CSV into a NumPy array: first row skipped as header, dtype inferred from the content
data_orig = np.genfromtxt('Data/data_full.csv', delimiter=",", dtype=None, skip_header=1)
data = data_orig
print(data.shape)

#Per-column statistics, reused later to rescale the columns
data_min = data.min(axis=0)         #Min of each column
data_max = data.max(axis=0)         #Max of each column
data_range = np.ptp(data, axis=0)   #Range (max - min) of each column
print(data_range)




(440, 8)
[     1      2 112148  73443  92777  60844  40824  47940]


* Data has 8 columns and 440 rows. The last 6 columns seem comparable, but their ranges differ widely, so a transformation is needed.
* Transforming each column to the range 0-100 (min-max scaling)

In [174]:
#Min-max scale every column to the range 0-100.
#Always computed from data_orig (never from data itself) so that re-running
#this cell cannot rescale values that were already transformed.
data=(data_orig-data_min)/data_range*100

* Column 1 is 'Channel' with 2 distinct values (1,2); Column 2 is 'Region' with 3 distinct values (1,2,3)
* These 2 nominal columns are not used for clustering.
* Columns 3 to 8 (array indices 2,3,4,5,6,7) hold the scaled annual spending amounts for 6 different commodities

There are 2 x 3 = 6 possible (channel, region) pairs, and the data points of each pair might form their own cluster.
Hence, I take k=6 for the KMeans clustering I am about to perform.


In [175]:
#### Approach 1 ####
#K-Means on the six scaled spending columns (array columns 2 to 7)
from sklearn.cluster import KMeans
k = 6

kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=1000, random_state=0)
kmeans.fit(data[:, 2:8])
#One cluster label per row of data
kmeans_labels = np.array(kmeans.labels_)

In [176]:
#Visualise the clusters
#KMeans clustered on all 6 spending columns, but only 2 can be drawn at a time,
#so the chart lets the user pick which two columns go on the axes

#Cluster labels are in the range 0 to k-1
#Maps the string "(label, column number)" to the values of that column
#for the rows assigned to that label
kmeans_label_dict={
    str((label,col)): list(data[kmeans_labels==label,col])
    for label in range(k)
    for col in range(2,8)
}

#Columns shown when the chart is first drawn
col1=2
col2=4

#Contents of the live ColumnDataSource: one x list and one y list per cluster
kmeans_source={}
for label in range(k):
    kmeans_source['c'+str(label)+'x']=kmeans_label_dict[str((label,col1))]
    kmeans_source['c'+str(label)+'y']=kmeans_label_dict[str((label,col2))]

#source_full keeps every (label, column) list; source holds only what is plotted
source_full=ColumnDataSource(data=kmeans_label_dict)
source=ColumnDataSource(data=kmeans_source)

fig1=figure(plot_width=500, plot_height=300,title='K-Means Clustering')

#One scatter renderer per cluster, each reading that cluster's x/y columns
cluster_colors=["#D4E157","#3F51B5","#64B5F6","#26A69A","#FF7043","#455A64"]
cluster0,cluster1,cluster2,cluster3,cluster4,cluster5=[
    fig1.circle(x='c%dx'%n,y='c%dy'%n,size=4, color=shade,
                legend='Cluster %d'%(n+1),source=source)
    for n,shade in enumerate(cluster_colors)
]

#Axis titles and fonts
fig1.xaxis.axis_label = 'Fresh (Annual Sale Scaled)'
fig1.yaxis.axis_label = 'Grocery (Annual Sale Scaled)'
fig1.xaxis.axis_label_text_font='times'
fig1.yaxis.axis_label_text_font='times'
fig1.axis.major_label_text_color = "#4B5C8A"

#Faint dashed grid lines in both directions
for grid in (fig1.xgrid,fig1.ygrid):
    grid.grid_line_alpha = 0.4
    grid.grid_line_dash = [5, 3]

#Clicking a legend entry hides/shows that cluster
fig1.legend.location = "top_right"
fig1.legend.click_policy="hide"

#Dropdowns to choose columns
#Each entry is (column number as a string, name of the commodity)
menu1=[
    ('2','Fresh'),
    ('3','Milk'),
    ('4','Grocery'),
    ('5','Frozen'),
    ('6','Detergents/Paper'),
    ('7','Delicassen'),
]

#Separate list with the same entries for the second dropdown
menu2=list(menu1)

column1_dd=Select(title="Choose X-Axis Column",value="2", options=menu1)
column2_dd=Select(title="Choose Y-Axis Column",value="4", options=menu2)

#Browser-side callback run whenever either dropdown changes: it copies the
#selected columns from source_full into the plotted x/y columns of every
#cluster in source and retitles both axes of fig1.
update_curve = CustomJS(args=dict(source=source, column1_dd=column1_dd,column2_dd=column2_dd,source_full=source_full,fig1=fig1), code="""

    // Declared with var so they stay local to this callback instead of
    // becoming page-wide globals.
    var x=column1_dd.value;
    var y=column2_dd.value;

    // source holds 'c<i>x' / 'c<i>y' per cluster, source_full is keyed
    // '(<i>, <column>)'; loop over however many clusters source contains.
    for (var i=0; ('c'+i+'x') in source.data; i++) {
        source.data['c'+i+'x']=source_full.data['('+i+', '+x+')'];
        source.data['c'+i+'y']=source_full.data['('+i+', '+y+')'];
    }

    // Same names as the dropdown entries, indexed by column number minus 2
    var labels=["Fresh","Milk","Grocery","Frozen","Detergents/Paper","Delicassen"];
    fig1.attributes.below[0].axis_label=labels[parseInt(x,10)-2]+' (Annual Sale Scaled)';
    fig1.attributes.left[0].axis_label=labels[parseInt(y,10)-2]+' (Annual Sale Scaled)';

    source.trigger('change');

""")

column1_dd.js_on_change('value', update_curve)
column2_dd.js_on_change('value', update_curve)



#### K-Means Clustering Results

* You can interact with the K-Means Clustering results below. As only 2 columns can be visualised at a time, choose the X and Y columns from the dropdowns to see how each of them affected the clustering.


In [177]:
#Chart on the left, the two dropdowns stacked to its right
kmeans_controls=column(column1_dd,column2_dd)
show(row(fig1,kmeans_controls))

* Clusters 2,3,4,5 contain the maximum number of data points
* Clusters 1,6 behave as outliers
* Clusters 2,3,4,5 have clear boundaries in many comparisons like Fresh/Detergents
* Fresh and Grocery seem to produce well clustered graphs and hence should have affected the clustering more
* The feature (column) that produces clearer clusters with all the other columns has a greater impact on the clustering

In [178]:
### Approach 2 ###
#DBSCAN on the same six scaled spending columns
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
#Alternative preprocessing, left disabled:
#data=StandardScaler().fit_transform(data_orig[:,2:8])
dbscan = DBSCAN(eps=10, min_samples=5)
dbscan.fit(data[:,2:8])
dbscan_labels = np.array(dbscan.labels_)
#Largest and smallest label that DBSCAN assigned
print(dbscan_labels.max(),dbscan_labels.min())
print(data.shape)


0 -1
(440, 8)


* -1 as DBSCAN label corresponds to an outlier.
* After trying various values of eps and min_samples, I conclude that DBSCAN sees the data as just 1 cluster with a few outliers. With and without the transformation, the results were the same.

In [179]:
#Same kind of chart as for KMeans, but this time there is just 1 cluster
#Points that are part of the cluster are drawn in red, outliers in gray

#Labels run from the smallest to the largest value in dbscan_labels
lowest_label=min(dbscan_labels)
highest_label=max(dbscan_labels)

#Maps the string "(label, column number)" to the values of that column
#for the rows carrying that label
dbscan_label_dict={
    str((label,col)): list(data[dbscan_labels==label,col])
    for label in range(lowest_label,highest_label+1)
    for col in range(2,8)
}

#Columns shown when the chart is first drawn
col1=2
col2=4

#Contents of the live ColumnDataSource: one x list and one y list per label
dbscan_source={}
for label in range(lowest_label,highest_label+1):
    dbscan_source['c'+str(label)+'x']=dbscan_label_dict[str((label,col1))]
    dbscan_source['c'+str(label)+'y']=dbscan_label_dict[str((label,col2))]

#source_full_dbscan keeps every (label, column) list; source_dbscan holds only what is plotted
source_full_dbscan=ColumnDataSource(data=dbscan_label_dict)
source_dbscan=ColumnDataSource(data=dbscan_source)

fig2=figure(plot_width=500, plot_height=300,title='DBSCAN Clustering')

#Outliers (label -1) are drawn first, then the points of cluster 0
for prefix,shade,caption in (('c-1',"#455A64",'Outlier'),
                             ('c0',"#EF5350",'In Cluster')):
    fig2.circle(x=prefix+'x',y=prefix+'y',size=4, color=shade,legend=caption,source=source_dbscan)

#Axis titles and fonts
fig2.xaxis.axis_label = 'Fresh (Annual Sale Scaled)'
fig2.yaxis.axis_label = 'Grocery (Annual Sale Scaled)'
fig2.xaxis.axis_label_text_font='times'
fig2.yaxis.axis_label_text_font='times'
fig2.axis.major_label_text_color = "#4B5C8A"

#Faint dashed grid lines in both directions
for grid in (fig2.xgrid,fig2.ygrid):
    grid.grid_line_alpha = 0.4
    grid.grid_line_dash = [5, 3]

#Clicking a legend entry hides/shows that group of points
fig2.legend.location = "top_right"
fig2.legend.click_policy="hide"

#Dropdowns to choose columns
#Each entry is (column number as a string, name of the commodity)
menu3=[
    ('2','Fresh'),
    ('3','Milk'),
    ('4','Grocery'),
    ('5','Frozen'),
    ('6','Detergents/Paper'),
    ('7','Delicassen'),
]

#Separate list with the same entries for the second dropdown
menu4=list(menu3)

column3_dd=Select(title="Choose X-Axis Column",value="2", options=menu3)
column4_dd=Select(title="Choose Y-Axis Column",value="4", options=menu4)

#Browser-side callback run whenever either dropdown changes: it copies the
#selected columns from source_full_dbscan into the plotted x/y columns of
#the outliers ('c-1...') and the cluster ('c0...') and retitles both axes of fig2.
update_curve_2 = CustomJS(args=dict(source_dbscan=source_dbscan, column3_dd=column3_dd,column4_dd=column4_dd,source_full_dbscan=source_full_dbscan,fig2=fig2), code="""

    // Declared with var so they stay local to this callback instead of
    // becoming page-wide globals.
    var x=column3_dd.value;
    var y=column4_dd.value;

    source_dbscan.data['c-1x']=source_full_dbscan.data['(-1, '+x+')'];
    source_dbscan.data['c0x']=source_full_dbscan.data['(0, '+x+')'];

    source_dbscan.data['c-1y']=source_full_dbscan.data['(-1, '+y+')'];
    source_dbscan.data['c0y']=source_full_dbscan.data['(0, '+y+')'];

    // Same names as the dropdown entries, indexed by column number minus 2
    var labels=["Fresh","Milk","Grocery","Frozen","Detergents/Paper","Delicassen"];
    fig2.attributes.below[0].axis_label=labels[parseInt(x,10)-2]+' (Annual Sale Scaled)';
    fig2.attributes.left[0].axis_label=labels[parseInt(y,10)-2]+' (Annual Sale Scaled)';

    source_dbscan.trigger('change');

""")

column3_dd.js_on_change('value', update_curve_2)
column4_dd.js_on_change('value', update_curve_2)

##### DBSCAN Clustering results
* DBSCAN sees the dataset as just one cluster with few outliers
* Plotted the Dataset with red as a point in cluster, whereas gray is an outlier
* User can choose the columns on X and Y axis

In [180]:
#Chart on the left, the two dropdowns stacked to its right
dbscan_controls=column(column3_dd,column4_dd)
show(row(fig2,dbscan_controls))

* Grocery as X axis, gives clear outlier distinction with all other features as Y axis
* Plotting a feature against itself gives a straight line with gray points at the far end. Hence, if these points are well distinguished, the feature has more effect on the clustering
* Grocery vs Grocery shows a clearly separated red and gray points on a line

In [181]:
#Comparing KMeans and DBSCAN side by side
#DBSCAN gives just 1 cluster and some outliers,
#so KMeans is run with 2 clusters to compare against it
k2=2
col1=2
col2=4

kmeans2 = KMeans(n_clusters=k2,init='k-means++',max_iter=1000,random_state=0)
kmeans2.fit(data[:,2:8])
kmeans_labels2=np.array(kmeans2.labels_)

dbscan2 = DBSCAN(eps=10, min_samples=5)
dbscan2.fit(data[:,2:8])
dbscan_labels2=np.array(dbscan2.labels_)

#KMeans: "(label, column number)" -> values of that column for the rows with that label
kmeans_label_dict2={
    str((label,col)): list(data[kmeans_labels2==label,col])
    for label in range(k2)
    for col in range(2,8)
}

#KMeans: the x and y lists plotted initially, one pair per cluster
kmeans_source2={}
for label in range(k2):
    kmeans_source2['c'+str(label)+'x']=kmeans_label_dict2[str((label,col1))]
    kmeans_source2['c'+str(label)+'y']=kmeans_label_dict2[str((label,col2))]

#DBSCAN: same two structures, with labels running from the smallest to the largest assigned
lowest_label2=min(dbscan_labels2)
highest_label2=max(dbscan_labels2)

dbscan_label_dict2={
    str((label,col)): list(data[dbscan_labels2==label,col])
    for label in range(lowest_label2,highest_label2+1)
    for col in range(2,8)
}

dbscan_source2={}
for label in range(lowest_label2,highest_label2+1):
    dbscan_source2['c'+str(label)+'x']=dbscan_label_dict2[str((label,col1))]
    dbscan_source2['c'+str(label)+'y']=dbscan_label_dict2[str((label,col2))]

#ColumnDataSource for both charts
source_full_dbscan2=ColumnDataSource(data=dbscan_label_dict2)
source_dbscan2=ColumnDataSource(data=dbscan_source2)

source_full_kmeans2=ColumnDataSource(data=kmeans_label_dict2)
source_kmeans2=ColumnDataSource(data=kmeans_source2)


#Plotting points on both charts
#fig4 reuses fig3's ranges, so panning/zooming fig3 moves both charts together
TOOLS="pan,wheel_zoom,reset"
fig3=figure(plot_width=350, plot_height=300,title='K-Means Clustering',tools=TOOLS)
fig4=figure(plot_width=350, plot_height=300,title='DBSCAN',x_range=fig3.x_range, y_range=fig3.y_range,tools="")

#KMeans: the two clusters
for prefix,shade,caption in (('c0',"#1976D2",'Cluster 1'),
                             ('c1',"#455A64",'Cluster 2')):
    fig3.circle(x=prefix+'x',y=prefix+'y',size=4, color=shade,legend=caption,source=source_kmeans2)

#DBSCAN: outliers (label -1) and the single cluster (label 0)
for prefix,shade,caption in (('c-1',"#455A64",'Outlier'),
                             ('c0',"#1976D2",'In Cluster')):
    fig4.circle(x=prefix+'x',y=prefix+'y',size=4, color=shade,legend=caption,source=source_dbscan2)

#Axes, labels, grid and legend: identical styling for both charts
for chart in (fig3,fig4):
    chart.xaxis.axis_label = 'Fresh (Annual Sale Scaled)'
    chart.yaxis.axis_label = 'Grocery (Annual Sale Scaled)'
    chart.xaxis.axis_label_text_font='times'
    chart.yaxis.axis_label_text_font='times'
    chart.axis.major_label_text_color = "#4B5C8A"

    chart.ygrid.grid_line_alpha = 0.4
    chart.ygrid.grid_line_dash = [5, 3]
    chart.xgrid.grid_line_alpha = 0.4
    chart.xgrid.grid_line_dash = [5, 3]

    chart.legend.location = "top_right"
    chart.legend.click_policy="hide"

#Dropdowns to choose columns
#Each entry is (column number as a string, name of the commodity)
menu5=[
    ('2','Fresh'),
    ('3','Milk'),
    ('4','Grocery'),
    ('5','Frozen'),
    ('6','Detergents/Paper'),
    ('7','Delicassen'),
]

#Separate list with the same entries for the second dropdown
menu6=list(menu5)

column5_dd=Select(title="Choose X-Axis Column",value="2", options=menu5)
column6_dd=Select(title="Choose Y-Axis Column",value="4", options=menu6)


#Browser-side callback run whenever either dropdown changes: it copies the
#selected columns from the two "full" sources into the plotted x/y columns
#of both charts and retitles the axes of fig3 and fig4.
update_curve_3 = CustomJS(args=dict(source_full_dbscan2=source_full_dbscan2,source_dbscan2=source_dbscan2,source_full_kmeans2=source_full_kmeans2,source_kmeans2=source_kmeans2, column5_dd=column5_dd,column6_dd=column6_dd,fig3=fig3,fig4=fig4), code="""

    // Declared with var so they stay local to this callback instead of
    // becoming page-wide globals.
    var x=column5_dd.value;
    var y=column6_dd.value;

    // DBSCAN chart: outliers ('c-1...') and the cluster ('c0...')
    source_dbscan2.data['c-1x']=source_full_dbscan2.data['(-1, '+x+')'];
    source_dbscan2.data['c0x']=source_full_dbscan2.data['(0, '+x+')'];

    source_dbscan2.data['c-1y']=source_full_dbscan2.data['(-1, '+y+')'];
    source_dbscan2.data['c0y']=source_full_dbscan2.data['(0, '+y+')'];

    // K-Means chart: loop over however many clusters source_kmeans2 contains
    for (var i=0; ('c'+i+'x') in source_kmeans2.data; i++) {
        source_kmeans2.data['c'+i+'x']=source_full_kmeans2.data['('+i+', '+x+')'];
        source_kmeans2.data['c'+i+'y']=source_full_kmeans2.data['('+i+', '+y+')'];
    }

    // Same names as the dropdown entries, indexed by column number minus 2
    var labels=["Fresh","Milk","Grocery","Frozen","Detergents/Paper","Delicassen"];
    var x_title=labels[parseInt(x,10)-2]+' (Annual Sale Scaled)';
    var y_title=labels[parseInt(y,10)-2]+' (Annual Sale Scaled)';
    fig3.attributes.below[0].axis_label=x_title;
    fig3.attributes.left[0].axis_label=y_title;
    fig4.attributes.below[0].axis_label=x_title;
    fig4.attributes.left[0].axis_label=y_title;

    source_dbscan2.trigger('change');
    source_kmeans2.trigger('change');

""")

column5_dd.js_on_change('value', update_curve_3)
column6_dd.js_on_change('value', update_curve_3)


##### Plotting K-Means with 2 clusters against DBSCAN with 1 Cluster and Outliers
* Using a similar color scheme, the user can choose columns to display on X,Y Axis
* Both plots are linked for the dropdowns and respond together for zooms
* Legend is clickable to hide/show clusters
* We can compare what algorithm was better for outlier detection

In [182]:
#Dropdowns side by side on top, the two linked charts side by side below
comparison_controls=row(column5_dd,column6_dd)
comparison_charts=row(fig3,fig4)
show(column(comparison_controls,comparison_charts))

* Although the dataset was hard to cluster even with normalisation or scaling, there were a lot of takeaways from the plots
* K-Means tries to divide the space into as many parts as specified by k
* Few features have more impact on the clustering result than the others
* DBSCAN considered the plot as one cluster with some outliers

* KMeans with 2 clusters behaves similarly to DBSCAN with one cluster and outliers, but the outliers in the case of DBSCAN are clearer and better.

* Few comparisons like Grocery vs Milk produce similar outliers in both clustering techniques. On the other hand, Fresh Vs Frozen show very different clustering result.

* Results differ as the algorithms weight the features differently, each focusing on some features more than others
