# Unsupervised Learning | Clustering (K-Means) | Case-study

## Customer / Marketing Segmentation [Clustering (K-Means)]

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

### Loading & Inspecting Data

In [None]:
# Read the marketing-segmentation CSV into a DataFrame, then preview its first rows
mrkt = pd.read_csv('data/marketing-segmentation.csv')
mrkt.head()

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes
mrkt.info()

In [None]:
# Descriptive statistics of the data, rounded to one decimal place for readability
mrkt.describe().round(1)

### Exploring Data

In [None]:
# Scatter the two variables against each other to get a first look at the data
fig, ax = plt.subplots()
ax.scatter(mrkt['Satisfaction'], mrkt['Loyalty'])
ax.set_xlabel('Satisfaction')
ax.set_ylabel('Loyalty')

### Data Preprocessing

In [None]:
# Import a library which can standardize the data easily
from sklearn import preprocessing

# Define the feature matrix in this cell: it runs before any other definition
# of X, so without this line a fresh "Restart & Run All" stops with a NameError
# on the scaling call below.
X = mrkt.copy()

# Standardize each variable (column in X) with respect to itself (zero mean,
# unit variance) so that no variable dominates the distance computation
# purely because of its scale
x_scaled = preprocessing.scale(X)
x_scaled

### Model Training & Prediction

In [None]:
# Import the KMeans module so we can perform k-means clustering with sklearn
from sklearn.cluster import KMeans

In [None]:
# Select both features by creating a copy of the data variable;
# working on a copy leaves `mrkt` itself untouched
X = mrkt.copy()

In [None]:
# n_clusters is K, the number of clusters we are aiming for.
# random_state fixes the centroid initialisation so the clustering is the same
# on every run; n_init is set explicitly because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

In [None]:
# Fit the model on X (the copy of the original data)
kmeans.fit(X)

In [None]:
# Create a copy of the input data so the predicted labels can be attached
# to it without altering X
clusters = X.copy()

In [None]:
# Take note of the predicted clusters.
# The model was already fitted on X in the preceding cell, so read the labels
# it assigned to the training rows instead of training it a second time.
clusters['cluster_pred'] = kmeans.labels_

In [None]:
# Plot Satisfaction against Loyalty, colouring every point by its predicted cluster
fig, ax = plt.subplots()
ax.scatter(
    clusters['Satisfaction'],
    clusters['Loyalty'],
    c=clusters['cluster_pred'],
    cmap='rainbow',
)
ax.set_xlabel('Satisfaction')
ax.set_ylabel('Loyalty')

### Finding the Optimal Number of Clusters (Elbow / Knee Method)

In [None]:
# Create an empty list to collect the within-cluster sum of squares (WCSS)
wcss = []

# Create all possible cluster solutions with a loop
# We have chosen to get solutions from 1 to 9 clusters; you can amend that if you wish
for i in range(1, 10):
    # Cluster solution with i clusters; the fixed random_state (and explicit
    # n_init) make the WCSS curve identical on every run
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    # Fit the STANDARDIZED data
    kmeans.fit(x_scaled)
    # Append the WCSS for the iteration (exposed by the model as inertia_)
    wcss.append(kmeans.inertia_)

# Check the result
wcss

In [None]:
# Plot the number of clusters vs WCSS.
# The x-values are derived from the length of wcss (whose first entry is the
# 1-cluster solution), so the plot stays valid if the upper bound of the loop
# that filled wcss is changed.
n_clusters_tried = range(1, len(wcss) + 1)
plt.plot(n_clusters_tried, wcss)

# Name your axes
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')

In [None]:
# Fiddle with K (the number of clusters).
# random_state makes the result repeatable; n_init is set explicitly because
# its default differs between scikit-learn versions.
kmeans_new = KMeans(n_clusters=5, n_init=10, random_state=42)

# Create a new data frame with the predicted clusters.
# fit_predict() trains the model on the standardized data and returns the
# cluster label of every row in one step, so a separate fit() call beforehand
# would only repeat the same training.
clusters_new = X.copy()
clusters_new['cluster_pred'] = kmeans_new.fit_predict(x_scaled)

In [None]:
# Check if everything seems right: preview the first rows, which now include
# the 'cluster_pred' column
clusters_new.head()

In [None]:
# Final picture: the original Satisfaction / Loyalty values, coloured by the
# cluster label stored in 'cluster_pred'
fig, ax = plt.subplots()
ax.scatter(
    clusters_new['Satisfaction'],
    clusters_new['Loyalty'],
    c=clusters_new['cluster_pred'],
    cmap='rainbow',
)
ax.set_xlabel('Satisfaction')
ax.set_ylabel('Loyalty')

==========

# GOOD LUCK!