In [2]:
# Importing and loading necessary packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the wine clustering CSV into a DataFrame
csv_path = r'C:\zerve\usecases\data_clustering\dataset\wine-clustering.csv'
data = pd.read_csv(csv_path)
# Preview the first rows of the loaded frame
data.head()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
data.info()

In [None]:
# Histogram of every feature (20 bins each) to inspect the data distribution.
# The seaborn theme call passes font_scale=2 and a 20x20 figure.figsize rc setting.
sns.set(
    style='darkgrid',
    font_scale=2,
    rc={'figure.figsize': (20, 20)},
)
ax = data.hist(bins=20, color='blue')

In [None]:
# Skewness of each column; values far from 0 indicate an asymmetric distribution
data.skew()

In [None]:
# One box plot per column, each on its own (unshared) axes in a 4x4 grid
box_options = dict(
    kind='box',
    subplots=True,
    layout=(4, 4),
    sharex=False,
    sharey=False,
    color='black',
)
data.plot(**box_options)
plt.show()

In [None]:
# Heatmap of pairwise feature correlations, showing only the upper triangle.
plt.figure(figsize=(16, 10))
# Compute the correlation matrix once and reuse it for both the mask and the plot.
corr = data.corr()
# Boolean mask over the lower triangle (diagonal included). It is built from the
# matrix shape rather than from the correlation values, so a coefficient that is
# exactly 0 is still hidden instead of being treated as "not masked".
mask = np.tril(np.ones(corr.shape, dtype=bool))
sns.heatmap(
    corr,
    cmap="coolwarm",
    annot=True,
    fmt='.2f',
    annot_kws={'size': 'x-small'},
    linewidths=0.5,
    square=True,
    mask=mask,
);

In [11]:
# Standardise every column with StandardScaler, keeping the original column names
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data)
data_scaled = pd.DataFrame(scaled_values, columns=data.columns)

# PCA
# Project the standardised data onto 2 principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(data_scaled)
pca_data = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])

# Report the shapes before/after and the variance retained by the 2 components
explained_total = np.sum(pca.explained_variance_ratio_)
print('Shape after PCA: ', pca_data.shape)
print('Original shape: ', data_scaled.shape)
print('Cumulative variance explained by 2 principal components: {:.2%}'.format(explained_total))

In [None]:
# Scatter of the observations in the 2-component PCA space
pc1, pc2 = pca_data.iloc[:, 0], pca_data.iloc[:, 1]
plt.scatter(pc1, pc2)
plt.ylabel('Second Principal Component')
plt.xlabel('First Principal Component')
plt.title('PCA Plot')

In [None]:
# Elbow method: fit KMeans for 1..10 clusters and record the inertia of each fit
k_values = range(1, 11)
cs = [
    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
    .fit(data_scaled)
    .inertia_
    for k in k_values
]

# Inertia against the number of clusters; the bend suggests a suitable n_clusters
plt.figure(figsize=(10, 6), dpi=80)
plt.plot(k_values, cs, color='blue')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()

In [None]:
# Plotting a dendrogram of the standardised data using Ward linkage.
# Uses the notebook-wide `plt` alias instead of a second matplotlib alias, and
# does not keep the dict returned by sch.dendrogram since nothing reads it.
plt.figure(figsize=(12, 5))
sch.dendrogram(sch.linkage(data_scaled, method='ward'))
plt.title('Dendrogram')
plt.ylabel('Euclidean distances')
plt.show()

In [17]:
# Applying KMeans with 3 clusters to the standardised features
kmeans = KMeans(n_clusters = 3, n_init = 15, max_iter = 500, random_state = 42)
clusters = kmeans.fit_predict(data_scaled)

# Creating cluster centers and projecting them into the PCA plane.
# `pca` was fitted on a DataFrame, so the centres are wrapped in a DataFrame with
# the same column names; passing the bare array makes scikit-learn warn that the
# input has no valid feature names.
centroids = kmeans.cluster_centers_
centroids_pca = pca.transform(pd.DataFrame(centroids, columns=data_scaled.columns))

# Plotting PCA in color: points coloured by cluster label, centres as black crosses
plt.figure(figsize=(12,10))
plt.scatter(pca_data.iloc[:,0], pca_data.iloc[:,1], c=clusters, cmap="brg", s=40)
plt.scatter(x=centroids_pca[:,0], y=centroids_pca[:,1], marker="x", s=500, linewidths=3, color="black")
plt.title('KMeans Clustered Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
# Render the figure explicitly so the cell does not display a Text repr
plt.show()

In [20]:
# Apply t-SNE to embed the standardised data in 2 dimensions.
# t-SNE is stochastic; random_state is fixed (same seed as the KMeans cell) so the
# embedding is reproducible across runs.
tsne = TSNE(n_components = 2, random_state = 42)
X_tsne = tsne.fit_transform(data_scaled)
tsne_data = pd.DataFrame(data = X_tsne, columns = ['tsne comp. 1', 'tsne comp. 2'])

# Plotting t-SNE Clusters: points coloured by the KMeans cluster labels
plt.figure(figsize=(12,10))
plt.scatter(tsne_data.iloc[:,0], tsne_data.iloc[:,1], c=clusters, cmap="brg", s=40)
plt.title('t-SNE Clustered Data')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
# Render the figure explicitly so the cell does not display a Text repr
plt.show()