In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_wine
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D  # Import 3D plotting functionality


In [None]:
# Read the CSV into a pandas DataFrame (dataset obtained from Kaggle)
# https://www.kaggle.com/datasets/harrywang/wine-dataset-for-clustering

data = pd.read_csv('wine-clustering.csv')
df = data.copy()  # work on a copy so the freshly loaded frame stays available as `data`
df.head()  # preview the first rows instead of rendering the whole frame

In [None]:
# Print column dtypes and non-null counts.
# Expected: no missing values and only int/float columns -- confirm against the printed summary.
df.info()
df.describe()  # summary statistics; last expression, so it is rendered as the cell output

## K-Means Clustering

K-Means is an unsupervised learning algorithm that aims to group data based on similarity. Here's how it works:

1. **Algorithm Overview**:
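   - Randomly assigns each data point to one of the K clusters (random-partition initialisation; scikit-learn's `init="random"`, used below, instead picks K observations at random as the starting centroids).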
   - Calculates centroids for each cluster.
   - Iteratively:
     - Evaluates each observation, assigning it to the closest cluster based on the Euclidean distance to the centroid.
     - Recalculates each centroid as the mean of the data points currently assigned to its cluster.
     - Repeats until no further reassignments occur.
   - The goal is to minimize the within-cluster variance (sum of squared distances from data points to their cluster centroids).

2. **Strengths**:
   - Simple and efficient.
   - Works well when clusters are roughly spherical and well-separated.

3. **Limitations**:
   - Sensitive to initial centroid placement.
   - Assumes equally sized and spherical clusters.

4. **Applications**:
   - Customer segmentation.
   - Image compression.
   - Anomaly detection.

In [None]:
# Select the numeric feature columns.
# This cell writes its labels back to df['Cluster'] further down, so drop that
# column first (if present): re-running the cell must never feed the previous
# run's labels back in as a feature, whatever dtype they happen to have.
numeric_df = (
    df.drop(columns=['Cluster'], errors='ignore')
      .select_dtypes(include=['float64', 'int64'])
)

# Standardise the features (mean = 0, std = 1)
scaler = StandardScaler()
scaled_df = scaler.fit_transform(numeric_df)

# Choose the number of clusters (K)
k = 2

# Instantiate the K-Means class
# init="random": Initializes cluster centroids randomly.
# n_clusters=k: Specifies the number of clusters.
# n_init=10: Number of times K-Means will be run with different initial centroids (to avoid local minima).
# random_state=1: Ensures reproducibility.
kmeans = KMeans(init="random", n_clusters=k, n_init=10, random_state=1)

# Fit K-Means algorithm to the standardised data
kmeans.fit(scaled_df)

# Store the cluster assignment of each observation alongside the original data
cluster_labels = kmeans.labels_
df['Cluster'] = cluster_labels

# Visualise the results
# We use Principal Component Analysis (PCA) to reduce the dimensionality of the data to 2D.
# The scatter plot shows the data points colored by their assigned clusters.

pca = PCA(n_components=2)
reduced_df = pca.fit_transform(scaled_df)

plt.scatter(reduced_df[:, 0], reduced_df[:, 1], c=cluster_labels, cmap='viridis')
plt.title("K-Means Clusters")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()



In [None]:
# 3D view of the K-Means clusters.
# Reuses `scaled_df` and `cluster_labels` from the K-Means cell, so the plotted
# points are exactly the standardised observations the labels were computed on.
# Re-deriving the features from `df` here would duplicate that cell's work and
# risk including the 'Cluster' column it adds to `df`.

# Project the standardised features onto the first 3 principal components
pca = PCA(n_components=3)
pca_results = pca.fit_transform(scaled_df)

# Set up a 3D plotting environment
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Assign PCA features to their own axes
Xax = pca_results[:, 0]
Yax = pca_results[:, 1]
Zax = pca_results[:, 2]

# Create the 3D scatter plot, coloured by K-Means cluster
ax.scatter(Xax, Yax, Zax, c=cluster_labels, cmap='viridis')
ax.set_title("3D PCA Plot")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")

plt.show()


In [None]:


def evaluate_kmeans(X, k, n_init=10, random_state=42):
    """
    Evaluate K-means clustering with a given K.

    Args:
        X (array-like): Input data (features), one row per sample.
        k (int): Number of clusters. Must satisfy 2 <= k <= n_samples - 1,
            the range over which the silhouette score is defined.
        n_init (int or 'auto'): Number of K-Means runs with different initial
            centroids. Passed explicitly so the result does not depend on the
            installed scikit-learn version's default.
        random_state (int): Seed passed to KMeans for reproducibility.

    Returns:
        float: Silhouette score for the given K.

    Raises:
        ValueError: If k is outside the range 2 .. n_samples - 1.
    """
    # Fail fast with a clear message instead of fitting a model whose
    # silhouette score cannot be computed.
    n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
    if not 2 <= k <= n_samples - 1:
        raise ValueError(
            f"k must be between 2 and n_samples - 1 = {n_samples - 1}, got k={k}"
        )

    kmeans = KMeans(n_clusters=k, n_init=n_init, random_state=random_state)
    labels = kmeans.fit_predict(X)
    return silhouette_score(X, labels)



# Sweep K and record the silhouette score for each value.
# The sweep runs on `scaled_df`, the standardised features the K-Means model
# above was fitted on. On the raw columns, Euclidean distance would be driven
# by whichever feature has the largest numeric range.
silhouette_scores = []

# The silhouette score needs 2 <= k <= n_samples - 1, so cap the sweep accordingly.
max_k = min(100, len(scaled_df))

for k_value in range(2, max_k):
    silhouette_score_k = evaluate_kmeans(scaled_df, k_value)
    silhouette_scores.append({'k': k_value, 'silhouette_score': silhouette_score_k})

# Convert the list to a DataFrame
silhouettescores = pd.DataFrame(silhouette_scores)

# Print the first few rows of the DataFrame
print(silhouettescores.head())

# Find the row with the highest silhouette score
best_k_row = silhouettescores.loc[silhouettescores['silhouette_score'].idxmax()]

# Extract the K value. Selecting a single row from a frame with an int column
# and a float column upcasts the row to float, so cast K back to int.
best_k = int(best_k_row['k'])

print(f"The K value with the highest silhouette score is K={best_k}.")



In [None]:
!pip install nbformat==5.10.4
!pip install nbconvert==7.16.1
