# Classification & Clustering

<font color='steelblue'>

<font size = 5>
    <b>Clustering v/s Classification</b><br><br>
    
Compare clustering v/s classification on `iris dataset` <br><br>

</font>
<font size = 4>
<b>Following examples are included in the processing:</b>

- `Read` the Iris flower dataset
- `Explore` the dataset (`multi-class classification`) 
- `Find` optimal number of clusters
- `Predict` the cluster number for all samples
- `Compare` original flower types v/s clusters
   
</font>

</font>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Dataframe from a file

In [None]:
# Load the Iris CSV into a pandas DataFrame.
# The file has no header row, so the column names are supplied explicitly.
iris_data = pd.read_csv(
    "../datasets/iris.csv",
    names=["sepal_l", "sepal_w", "petal_l", "petal_w", "class"],
)

In [None]:
# Preview the first five rows (five is also head()'s default)
iris_data.head(n=5)

In [None]:
# Dimensions of the loaded DataFrame as (n_rows, n_columns)
iris_data.shape

In [None]:
# Inspect the last ten rows of the dataset
iris_data.tail(n=10)

In [None]:
# Distinct values found in the 'class' (flower type) column
iris_data['class'].unique()

In [None]:
# Pick the rows at positions 0, 50 and 100 (all columns)
iris_data.iloc[[0, 50, 100], :]

In [None]:
# Count the samples with 5 < sepal_l < 7 (both bounds exclusive),
# reporting the non-null count of sepal_l, petal_l and class.
iris_data[['sepal_l', 'petal_l', 'class']][
    (iris_data["sepal_l"] > 5) & (iris_data["sepal_l"] < 7)
].count()

In [None]:
# Number of samples for each distinct value of the 'class' column
iris_data['class'].value_counts()

In [None]:
# Number of distinct flower classes in the dataset
iris_data['class'].nunique()

# Basic Statistics

In [None]:
# Summary statistics of the numeric columns, one row per column
iris_data.describe().T

In [None]:
# Summary (count / unique / top / freq) of the object-typed columns
iris_data.describe(include=['object'])

In [None]:
# Pairwise correlation between the numeric columns;
# numeric_only=True leaves out the string-valued 'class' column.
corr = iris_data.corr(numeric_only=True)

In [None]:
import seaborn as sns

sns.set(font_scale=1.4)
f, ax = plt.subplots(figsize=(8, 7))

# Annotated heatmap of the correlation matrix, drawn with square cells
sns.heatmap(
    corr,
    ax=ax,
    cmap="Blues",
    vmax=.9,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
);

## Convert the flower types to numbers

In [None]:
# Integer code for each flower type, numbered in the order listed here.
# NOTE(review): `mapping` is not applied to iris_data in the visible cells.
mapping = {
    flower: code
    for code, flower in enumerate(
        ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    )
}
mapping

## Create the features

In [None]:
# Feature columns = every measurement column of iris_data.
# 'class' is the true label. 'cluster' is added to iris_data further down in
# the notebook, so it is excluded too; otherwise re-running this cell later
# would leak the cluster label into the features. errors='ignore' tolerates
# 'cluster' being absent on a fresh run.
features = iris_data.columns.drop(['class', 'cluster'], errors='ignore')
features

In [None]:
# Feature matrix as a NumPy array: all rows, feature columns only
X = iris_data.loc[:, features].values

## KMeans Clustering<br>

<font size=4 color='grey'>
    
- Find the optimal number of clusters `(using within-cluster sum of squared errors)`
- Plot `Elbow Method` graph using wcss
- Apply KMeans clustering using the optimal number of clusters

    </font>

## WCSS - Within Cluster Sum of Squares<br>
<font size=4 color='grey'>
    
- Within each cluster, take the distance of each point from the centroid, square it, and add the squares up
- WCSS will help in determining the goodness of fit

</font>

In [None]:
from sklearn.cluster import KMeans

# Fit one KMeans model per candidate cluster count (2, 3, 4) and record
# each fitted model's inertia_ as that count's WCSS value.
wcss = []
for k in range(2, 5):
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=2345,
    )
    wcss.append(kmeans.fit(X).inertia_)


In [None]:
# WCSS values collected above, one per candidate number of clusters
wcss

In [None]:
import matplotlib.mlab as mlab
import seaborn as sbs
from matplotlib.ticker import MaxNLocator

# NOTE(review): `mlab` and `sbs` are not referenced in this cell; confirm
# whether these two imports are still needed.

# Elbow plot: WCSS against the candidate number of clusters k.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 5), wcss)  # x values must match the range used to build wcss
ax.set(xlabel='k', ylabel='cost', title='Elbow Method')
ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer ticks only
ax.grid(True)
plt.show()

In [None]:
# Final model: KMeans with 3 clusters, same settings as in the WCSS search
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=2345,
)

In [None]:
# Fit the 3-cluster model on X and get the cluster index assigned to each sample
y_kmeans = kmeans.fit_predict(X)

In [None]:
# Show the cluster index assigned to every sample
y_kmeans

In [None]:
# Scatter plot of sepal length vs. sepal width, one colour per true flower class.
fig, ax = plt.subplots(figsize=(7, 6))

# Rows are selected by their 'class' label rather than by row-number slices:
# `.loc` label slices include BOTH end points, so slices such as `:49`,
# `49:99` and `99:` would draw rows 49 and 99 twice, under two different classes.
class_colors = {
    'Iris-setosa': 'r',
    'Iris-versicolor': 'blue',
    'Iris-virginica': 'green',
}

for flower, color in class_colors.items():
    rows = iris_data[iris_data['class'] == flower]
    ax.scatter(rows['sepal_l'], rows['sepal_w'],
               s=100, c=color, alpha=.5, label=flower)

# more convenient way of setting properties
ax.set(title="Iris Data", xlabel='sepal_l', ylabel="sepal_w")
ax.legend();

## Compare flower types

In [None]:
# Attach each sample's cluster index to the DataFrame as a new 'cluster' column
# (this modifies iris_data in place).
iris_data['cluster'] = y_kmeans

In [None]:
# First five rows, now including the 'cluster' column
iris_data.head(n=5)

In [None]:
# Number of samples assigned to each cluster
iris_data['cluster'].value_counts()

In [None]:
# Colour the samples by cluster on the sepal_l / sepal_w plane so that each
# cluster number can be matched by eye to a flower type.
sns.scatterplot(data=iris_data, x='sepal_l', y='sepal_w', hue='cluster');

In [None]:
# Map each KMeans cluster number to a flower name.
# KMeans numbers its clusters arbitrarily, so a hand-written table can silently
# go stale; instead, name each cluster after the most frequent true class
# among the samples assigned to it.
cluster_by_class = pd.crosstab(y_kmeans, iris_data['class'])
fmap = cluster_by_class.idxmax(axis=1).to_dict()

# Each cluster must receive its own flower name for the comparison to make sense.
assert len(set(fmap.values())) == len(fmap), f"ambiguous cluster mapping: {fmap}"

# Always map from the raw labels in y_kmeans (not from the 'cluster' column),
# so re-running this cell does not turn already-mapped names into NaN.
iris_data['cluster'] = pd.Series(y_kmeans, index=iris_data.index).map(fmap)

In [None]:
# Pairwise feature plots coloured by cluster, with histograms on the diagonal
sns.pairplot(
    data=iris_data,
    hue='cluster',
    diag_kind='hist',
);

In [None]:
# Number of samples per value of the 'cluster' column
iris_data['cluster'].value_counts()

In [None]:
# Rows whose 'cluster' value is not equal to the actual flower 'class'
iris_data.loc[~(iris_data['class'] == iris_data['cluster'])]