In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
#from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the Iris data set and wrap its features and targets in labelled DataFrames.
from sklearn import datasets

dataset = datasets.load_iris()

# Feature matrix: one row per flower, one named column per measurement.
x = pd.DataFrame(
    dataset.data,
    columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
)

# Ground-truth class index of every flower, kept as a one-column frame.
y = pd.DataFrame(dataset.target, columns=['Targets'])

# Summary statistics of the raw (unscaled) features.
x.describe()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [3]:
# Standardise every feature to mean 0 and standard deviation 1.
# preprocessing.scale works column-wise on 2-D input, so one call covers all
# features; the result is re-wrapped so the labels of `x` are preserved and
# `x` itself stays untouched.
clusterdata = pd.DataFrame(
    preprocessing.scale(x),
    index=x.index,
    columns=x.columns,
)

In [None]:
# Preview the first five rows of the standardised features.
clusterdata.head(n=5)

In [None]:
# Preview the first five rows of the original (unscaled) features.
x.head(n=5)

In [None]:
# k-means cluster analysis for k = 1..10 clusters (input to the elbow plot)
from scipy.spatial.distance import cdist

# range() excludes its stop value, so stop at 11 to really include k = 10.
clusters = range(1, 11)
meandist = []

# For each k, fit k-means on the standardised data and record the average
# Euclidean distance from every sample to its nearest cluster centre.
for k in clusters:
    # A fixed random_state makes the curve reproducible across runs; n_init is
    # set explicitly so the number of restarts does not depend on the default
    # of the installed scikit-learn version.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(clusterdata)
    nearest_centre_dist = cdist(
        clusterdata, model.cluster_centers_, 'euclidean'
    ).min(axis=1)
    meandist.append(nearest_centre_dist.mean())

In [None]:
%matplotlib inline
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')

In [None]:
# K-means clustering with 3 clusters on the standardised features.
# random_state is fixed so the cluster numbering and assignments are the same
# on every run; n_init is set explicitly so the number of restarts does not
# depend on the default of the installed scikit-learn version.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
model.fit(clusterdata)

In [None]:
# Cluster index assigned to each sample by the fitted k-means model.
model.labels_

In [None]:
# Side-by-side view: ground-truth classes vs. raw k-means cluster labels,
# both drawn in petal-length / petal-width space.
fig, (ax_truth, ax_kmeans) = plt.subplots(1, 2, figsize=(14, 7))

# One colour per label value (0, 1, 2).
colormap = np.array(['red', 'lime', 'black'])

# Left panel: the original classifications.
ax_truth.scatter(x['Petal_Length'], x['Petal_Width'], c=colormap[y['Targets']], s=40)
ax_truth.set_title('Ground Truth Class')

# Right panel: the model's cluster assignments (cluster numbering is arbitrary).
ax_kmeans.scatter(x['Petal_Length'], x['Petal_Width'], c=colormap[model.labels_], s=40)
ax_kmeans.set_title('K Means Clustering Result')

In [None]:
# The cluster indices produced by k-means are arbitrary: cluster "0" need not
# correspond to class "0" in the ground truth, and the numbering can change
# from one fit to the next. Instead of hard-coding one permutation, derive the
# cluster -> class mapping that maximises the agreement with the ground truth.
from scipy.optimize import linear_sum_assignment

true_labels = y['Targets'].to_numpy()
n_labels = int(max(true_labels.max(), model.labels_.max())) + 1

# overlap[c, t] = number of samples placed in cluster c whose true class is t.
overlap = np.zeros((n_labels, n_labels), dtype=np.int64)
np.add.at(overlap, (model.labels_, true_labels), 1)

# One-to-one assignment of clusters to classes with the largest total overlap.
cluster_ids, class_ids = linear_sum_assignment(overlap, maximize=True)
label_map = np.empty(n_labels, dtype=np.int64)
label_map[cluster_ids] = class_ids

# Cluster labels re-expressed in the ground truth's numbering.
predY = label_map[model.labels_]

In [None]:
# Side-by-side view: ground-truth classes vs. k-means labels after they have
# been renumbered to match the ground truth (predY).
fig, (ax_truth, ax_pred) = plt.subplots(1, 2, figsize=(14, 7))

# One colour per label value (0, 1, 2).
colormap = np.array(['red', 'lime', 'black'])

# Left panel: original classes.
ax_truth.scatter(x['Petal_Length'], x['Petal_Width'], c=colormap[y['Targets']], s=40)
ax_truth.set_title('Ground Truth Class')

# Right panel: predicted clusters with corrected label values.
# Title spelled "K Means" to match the uncorrected plot of the same model.
ax_pred.scatter(x['Petal_Length'], x['Petal_Width'], c=colormap[predY], s=40)
ax_pred.set_title('K Means Clustering Result')

In [None]:
# Performance measures
import sklearn.metrics

# Confusion matrix of ground-truth classes against the renumbered cluster labels.
sklearn.metrics.confusion_matrix(y_true=y, y_pred=predY)

In [None]:
# Fraction of samples whose renumbered cluster label equals the true class.
sklearn.metrics.accuracy_score(y_true=y, y_pred=predY)

In [None]:
# Hierarchical clustering
# Build the linkage matrix with complete-linkage agglomeration on Euclidean
# distances, using SciPy's cluster.hierarchy submodule. Note that this runs on
# the raw feature values in `x`, not on the standardised `clusterdata`.
from scipy.cluster.hierarchy import linkage

row_clusters = linkage(x.to_numpy(), method='complete', metric='euclidean')

In [None]:
from scipy.cluster.hierarchy import dendrogram

In [None]:
# Draw the dendrogram for the linkage matrix; the returned dict is kept in row_dendr.
row_dendr = dendrogram(Z=row_clusters)

In [None]:
#Applying agglomerative clustering via scikit-learn
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Complete-linkage agglomerative clustering into 2 clusters.
# The distance is left at its default (Euclidean): the keyword that selects it
# was renamed from `affinity` to `metric` in newer scikit-learn releases, and
# passing the old name raises a TypeError there. Omitting it gives the same
# Euclidean behaviour on both old and new versions.
ac = AgglomerativeClustering(n_clusters=2, linkage='complete')

In [None]:
# Fit the agglomerative model on the raw feature values and get one cluster label per sample.
labels = ac.fit_predict(X=x.to_numpy())

In [None]:
# Cluster label assigned to each sample by the agglomerative model.
labels