<IMG SRC="https://github.com/jacquesroy/byte-size-data-science/raw/master/images/Banner.png" ALT="BSDS Banner" WIDTH=1195 HEIGHT=200>

# Clustering customers using K-Means

### W002-Understanding kmeans
Execute the next cell if you want to watch the related video from the `Byte Size Data Science` YouTube channel.

In [None]:
from IPython.display import IFrame

# Embed the companion video. The IFrame is the cell's last expression,
# so Jupyter renders it as the cell output.
video_url = "https://www.youtube.com/embed/lc78xfSJeQA?rel=0&amp;controls=0&amp;showinfo=0"
IFrame(src=video_url, width=560, height=315)


## Read the customer data

In [None]:
import sys
import types
import pandas as pd
import io
import os
import requests

# Download the customer churn CSV and load it into a DataFrame.
url = 'https://github.com/jacquesroy/byte-size-data-science/raw/master/data/customer_churn.csv'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to the CSV parser.
response.raise_for_status()
content = response.content
customers_pd = pd.read_csv(io.StringIO(content.decode('utf-8')))
customers_pd.head()

In [None]:
# Number of rows
# len() counts every row. The previous count()[0] only counted the non-null
# values of the first column and relied on positional lookup in a
# label-indexed Series, which pandas has deprecated.
len(customers_pd)

## Using K-Means to group customers
We use K-means to find the centers (centroids) of groupings among the customers.

In [None]:
# Need to prepare the character attributes...
import numpy as np
# Feature names, listed for reference only: the selection below is positional
# and does not read this list (presumably it names the selected columns in
# order -- verify against the CSV header).
cols = [
    "Gender", "Status", "Children", "Est Income", "Car Owner", "Age",
    "LongDistance", "International", "Local", "Dropped", "Paymethod",
    "LocalBilltype", "LongDistanceBilltype", "Usage", "RatePlan",
]

feature_positions = slice(2, 17)  # Columns from Gender on
churn_position = 1                # CHURN column

X = customers_pd.iloc[:, feature_positions].values
y = customers_pd.iloc[:, churn_position].values

### Encoding:
- Categorical: Gender, Status, Car Owner, Paymethod, LocalBilltype, LongDistanceBilltype

Other encodings could be used for some attributes, for example `OneHotEncoder` for Gender and Status.

In [None]:
# Encoding categorical data before split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Positions (within X) of the categorical columns:
# 0 (Gender), 1 (Status), 4 (Car owner), 10 (Payment method),
# 11 (LocalBillType), 12 (LongDistanceBillType)
categorical_positions = (0, 1, 4, 10, 11, 12)

# One encoder per categorical column, so each keeps its own fitted classes.
column_encoders = [LabelEncoder() for _ in categorical_positions]
for position, encoder in zip(categorical_positions, column_encoders):
    # Replace the column's string labels with integer codes, in place.
    X[:, position] = encoder.fit_transform(X[:, position])

# Keep the individual encoder names available for later cells.
(labelencoder_X_0, labelencoder_X_1, labelencoder_X_4,
 labelencoder_X_10, labelencoder_X_11, labelencoder_X_12) = column_encoders

# Since the answer column is "T" or "F", we have to encode it
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

In [None]:
# Feature scaling
# Put every feature on a comparable, standardized scale before clustering.
sc = StandardScaler()
sc.fit(X)
X_scaled = sc.transform(X)

# To inspect the values of one standardized record, evaluate X_scaled[0]

In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm

%matplotlib inline

In [None]:
# K Means Cluster
k = 2
# A fixed random_state and an explicit n_init make the clustering reproducible
# across runs and across scikit-learn versions (the n_init default changed).
model = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans = model.fit(X_scaled)
# Distribution between clusters: vals[i] is the number of rows assigned to cluster i
vals = np.bincount(kmeans.labels_, minlength=k).tolist()
vals

In [None]:
# Find if the clustering represents the churn in any way (of course not!)
# LabelEncoder assigns codes in sorted class order, so with "T"/"F" labels:
# Rows[0]: churn "F" (code 0), Rows[1]: churn "T" (code 1)
# Cols[j]: cluster j
# The shape comes from the fitted objects rather than the global k, so the
# table stays correct even if k has been reassigned by another cell.
n_churn_classes = len(labelencoder_y.classes_)
res = np.zeros((n_churn_classes, kmeans.n_clusters), int)
# Unbuffered add: every (churn code, cluster label) pair increments its cell,
# including repeated pairs.
np.add.at(res, (y, kmeans.labels_), 1)
res

The perfect grouping for churn would have given something like:<br/>
  723, 0<br/>
    0, 1076

## Finding the optimal K
Using the elbow method

In [None]:
from scipy.spatial.distance import cdist

# Elbow method: for each candidate cluster count, record the mean distance
# from every sample to its nearest cluster center.
distortions = []
K = range(2, 15)
# Use a dedicated loop variable: iterating with `k` would silently overwrite
# the global `k` that other cells rely on.
for n_clusters in K:
    # Fixed seed and explicit n_init so the curve is the same on every run.
    kmeanModel = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X_scaled)
    nearest_center_dist = cdist(X_scaled, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_center_dist.mean())

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method for optimal k')
plt.show()

In [None]:
# K Means Cluster
# NOTE(review): k=10 is presumably read off the elbow plot above -- confirm.
k = 10
# A fixed random_state and an explicit n_init make the clustering (and the
# cluster colors in the plots below) reproducible across runs.
model = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans = model.fit(X_scaled)
# Distribution between clusters: vals[i] is the number of rows assigned to cluster i
vals = np.bincount(kmeans.labels_, minlength=k).tolist()
print(vals)

## Extra: projection of the clusters to a 2D graph

In [None]:
from sklearn.decomposition import PCA as sklearnPCA

# PCA (Principal Component Analysis) reduces the number of dimensions;
# here the scaled features are projected onto 2 components for plotting.
pca = sklearnPCA(n_components=2)
projection_2d = pca.fit_transform(X_scaled)
transformed = pd.DataFrame(projection_2d)


In [None]:
# One color per cluster, indexed by cluster label.
colors = [
    'black', 'red', 'gold', 'cyan', 'limegreen', 'blue',
    'orange', 'aqua', 'magenta', 'y', 'gray',
]
c = kmeans.labels_

# Draw each cluster as its own scatter series so it gets a legend entry.
for cluster_id in range(k):
    members = transformed[c == cluster_id]
    plt.scatter(members[0], members[1],
                label='Cluster' + str(cluster_id), c=colors[cluster_id])

plt.title('15-dimension projection (PCA)')
plt.legend()
plt.show()

## More Extras: projection of the clusters to a 3D graph

In [None]:
from sklearn.decomposition import PCA as sklearnPCA
from mpl_toolkits import mplot3d

# Same projection as before, but onto 3 principal components.
pca3d = sklearnPCA(n_components=3)
projection_3d = pca3d.fit_transform(X_scaled)
transformed3d = pd.DataFrame(projection_3d)


In [None]:
# One color per cluster, indexed by cluster label.
colors = [
    'black', 'red', 'gold', 'cyan', 'limegreen', 'blue',
    'orange', 'aqua', 'magenta', 'y', 'gray',
]
c = kmeans.labels_

ax = plt.axes(projection='3d')

# Draw each cluster as its own 3D scatter series so it gets a legend entry.
for cluster_id in range(k):
    members = transformed3d[c == cluster_id]
    ax.scatter3D(members[0], members[1], members[2],
                 label='Cluster' + str(cluster_id), c=colors[cluster_id])

plt.title('15-dimension to 3D projection (PCA)')
plt.legend()
plt.show()

## Try dimension reduction with TSNE
T-SNE: t-Distributed Stochastic Neighbor Embedding

In [None]:
from sklearn.manifold import TSNE
import re

# t-SNE is stochastic; fixing random_state makes the 2D embedding (and the
# plot built from it) reproducible between runs.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = pd.DataFrame(tsne.fit_transform(X_scaled))

In [None]:
# One color per cluster, indexed by cluster label.
colors = [
    'black', 'red', 'gold', 'cyan', 'limegreen', 'blue',
    'orange', 'aqua', 'magenta', 'y', 'gray',
]
c = kmeans.labels_

# Draw each cluster as its own scatter series so it gets a legend entry.
for cluster_id in range(k):
    members = X_tsne[c == cluster_id]
    plt.scatter(members[0], members[1],
                label='Cluster' + str(cluster_id), c=colors[cluster_id])

plt.title('15-dimension projection (TSNE)')
plt.legend()
plt.show()

In [None]:
# t-SNE is stochastic; fixing random_state makes the 3D embedding (and the
# plot built from it) reproducible between runs.
tsne3d = TSNE(n_components=3, random_state=42)
X_tsne3d = pd.DataFrame(tsne3d.fit_transform(X_scaled))

In [None]:
ax = plt.axes(projection='3d')

# Draw each cluster as its own 3D scatter series so it gets a legend entry.
# `c`, `colors` and `k` are taken from the cells above.
for cluster_id in range(k):
    members = X_tsne3d[c == cluster_id]
    ax.scatter3D(members[0], members[1], members[2],
                 label='Cluster' + str(cluster_id), c=colors[cluster_id])

plt.title('15-dimension to 3D projection (TSNE)')
plt.legend()
plt.show()