# K-Means Clustering

Load the libraries.

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, adjusted_mutual_info_score, confusion_matrix
from sklearn.metrics import silhouette_score, davies_bouldin_score

Read the data.

In [None]:
# Load the feature-extracted password dataset.
# `on_bad_lines` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0 (passing it there raises a TypeError).
# "warn" still skips malformed rows but reports them, so dropped data stays
# visible instead of disappearing silently.
data = pd.read_csv(
    "data/000webhost_subset_classifed_featureExtracted.csv",
    on_bad_lines="warn",
)

Select the features. 

In [None]:
# Columns left out of the feature matrix. An earlier variant of this cell
# dropped only "password", "strength" and "cracking_time".
excluded_columns = [
    "password",
    "strength",
    "length",
    "uppercase",
    "lowercase",
    "digits",
    "special",
    "consecutive_char_type_count",
    "cracking_time",
]
X = data.drop(columns=excluded_columns)

# The "strength" column is kept separately as the reference label.
y = data["strength"]


Perform undersampling.

In [None]:
# Randomly under-sample with the sampler's default strategy; the fixed seed
# makes the row selection repeatable.
rus = RandomUnderSampler(random_state=42)
(X_resampled, y_resampled) = rus.fit_resample(X=X, y=y)


Standardize the features.

In [None]:
# Learn the per-feature scaling statistics from the resampled data, then
# apply them to that same data.
# NOTE(review): the scaler is fit on all resampled rows, before the
# train/test split performed later in this file, so the held-out rows also
# influence the scaling statistics.
scaler = StandardScaler().fit(X_resampled)
X_resampled_scaled = scaler.transform(X_resampled)


Split the data into testing and training sets. 

In [None]:
# Hold out 20% of the scaled, resampled rows for evaluation; the seed keeps
# the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled_scaled,
    y_resampled,
    random_state=42,
    test_size=0.2,
)

# Analysis of K-means

Choose the optimal number of clusters (k) using the Elbow method. 

In [None]:
# Fit K-Means for k = 1..10 on the training split and record each model's
# inertia (within-cluster sum of squared distances) for the elbow plot.
#
# n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10
# to "auto" (a single run with k-means++ initialisation), and 1.2/1.3 emit a
# FutureWarning when it is left unset. Pinning it keeps the curve the same
# across library versions.
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_train)
    sse.append(kmeans.inertia_)

Plot the SSE for different values of k.

In [None]:
# Elbow plot: SSE against the number of clusters tried (1 through 10).
# The figure is written to disk before being shown.
fig, ax = plt.subplots()
ax.plot(range(1, 11), sse, marker='o')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('SSE')
ax.set_title('Elbow Method')
fig.savefig("Scree Plot", dpi=300, bbox_inches='tight')
plt.show()


# $k = 2$

Choose the optimal k based on the Elbow Method and fit the model.

In [None]:
optimal_k = 2  # Change this value based on the elbow point in the plot

# n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10
# to "auto" (a single run with k-means++ initialisation), so leaving it unset
# makes the fitted clusters depend on the installed library version.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
kmeans.fit(X_train)


Analyze the distribution of zxcvbn classes within each cluster.

In [None]:
# Assign every resampled row to a cluster, then count how many rows of each
# `zxcvbn_class` label fall into each cluster. Cluster/label combinations
# that never occur are shown as 0.
cluster_assignments = kmeans.predict(X_resampled_scaled)
clusters_df = pd.DataFrame(
    {'cluster': cluster_assignments, 'zxcvbn_class': y_resampled}
)
pair_counts = clusters_df.groupby(['cluster', 'zxcvbn_class']).size()
clusters_summary = pair_counts.unstack().fillna(0)
print(clusters_summary)


Visualize the clusters using PCA.

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto two principal components so the clusters
# can be drawn in 2-D. The projection is used for plotting only.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_resampled_scaled)

# One scatter series per cluster, so each gets its own colour and legend entry.
fig, ax = plt.subplots(figsize=(10, 6))
for cluster_id in range(optimal_k):
    in_cluster = cluster_assignments == cluster_id
    ax.scatter(
        X_pca[in_cluster, 0],
        X_pca[in_cluster, 1],
        label=f'Cluster {cluster_id}',
    )
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.legend()
ax.set_title('K-Means Clusters Visualization using PCA')
fig.savefig("Two Clusters", dpi=300, bbox_inches='tight')
plt.show()


Examine the cluster centroids and feature importance.

In [None]:
# Centroid coordinates, one row per cluster. The model was fit on the
# standardized features, so these values are in standardized units.
# Column labels are taken from the feature matrix `X` instead of a hand-typed
# list: the centroid columns follow X's column order, so the labels cannot
# drift out of sync if the feature selection changes.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
print(centroids)


Evaluate clustering quality with the silhouette score and the Davies-Bouldin index.

In [None]:
# Internal quality measures, computed over all resampled rows using the
# cluster assignments from the current model.
sil_score = silhouette_score(
    X=X_resampled_scaled,
    labels=cluster_assignments,
)
db_score = davies_bouldin_score(
    X=X_resampled_scaled,
    labels=cluster_assignments,
)


In [None]:
# Silhouette score first, Davies-Bouldin score second, one per line.
print(sil_score, db_score, sep="\n")


Make predictions.

In [None]:
# Cluster index assigned to each held-out row by the fitted model.
y_test_pred = kmeans.predict(X=X_test)

In [None]:
# Agreement between the reference labels and the cluster assignments on the
# held-out rows, measured three ways.
ari = adjusted_rand_score(labels_true=y_test, labels_pred=y_test_pred)
nmi = normalized_mutual_info_score(labels_true=y_test, labels_pred=y_test_pred)
ami = adjusted_mutual_info_score(labels_true=y_test, labels_pred=y_test_pred)


In [None]:
# Report the three agreement metrics to four decimal places.
# The last label previously read "Mutural"; the spelling is corrected.
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Normalized Mutual Information: {nmi:.4f}")
print(f"Adjusted Mutual Information: {ami:.4f}")


# $k = 4$

In [None]:
optimal_k = 4  # Change this value based on the elbow point in the plot

# n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10
# to "auto" (a single run with k-means++ initialisation), so leaving it unset
# makes the fitted clusters depend on the installed library version.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
kmeans.fit(X_train)

In [None]:
# Assign every resampled row to one of the clusters, then count how many rows
# of each `zxcvbn_class` label fall into each cluster. Cluster/label
# combinations that never occur are shown as 0.
cluster_assignments = kmeans.predict(X_resampled_scaled)
clusters_df = pd.DataFrame(
    {'cluster': cluster_assignments, 'zxcvbn_class': y_resampled}
)
pair_counts = clusters_df.groupby(['cluster', 'zxcvbn_class']).size()
clusters_summary = pair_counts.unstack().fillna(0)
print(clusters_summary)

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto two principal components so the clusters
# can be drawn in 2-D. The projection is used for plotting only.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_resampled_scaled)

# One scatter series per cluster, so each gets its own colour and legend entry.
fig, ax = plt.subplots(figsize=(10, 6))
for cluster_id in range(optimal_k):
    in_cluster = cluster_assignments == cluster_id
    ax.scatter(
        X_pca[in_cluster, 0],
        X_pca[in_cluster, 1],
        label=f'Cluster {cluster_id}',
    )
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.legend()
ax.set_title('K-Means Clusters Visualization using PCA')
fig.savefig("Four Clusters", dpi=300, bbox_inches='tight')
plt.show()


Compute the centroids. 

In [None]:
# Centroid coordinates, one row per cluster. The model was fit on the
# standardized features, so these values are in standardized units.
# Column labels are taken from the feature matrix `X` instead of a hand-typed
# list: the centroid columns follow X's column order, so the labels cannot
# drift out of sync if the feature selection changes.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
print(centroids)


In [None]:
# Internal quality measures, computed over all resampled rows using the
# cluster assignments from the current model.
sil_score = silhouette_score(
    X=X_resampled_scaled,
    labels=cluster_assignments,
)
db_score = davies_bouldin_score(
    X=X_resampled_scaled,
    labels=cluster_assignments,
)

# Silhouette score first, Davies-Bouldin score second, one per line.
print(sil_score, db_score, sep="\n")

Make predictions. 

In [None]:
# Cluster index assigned to each held-out row by the fitted model.
y_test_pred = kmeans.predict(X=X_test)

In [None]:
# Agreement between the reference labels and the cluster assignments on the
# held-out rows, measured three ways.
ari = adjusted_rand_score(labels_true=y_test, labels_pred=y_test_pred)
nmi = normalized_mutual_info_score(labels_true=y_test, labels_pred=y_test_pred)
ami = adjusted_mutual_info_score(labels_true=y_test, labels_pred=y_test_pred)


In [None]:
# Report the three agreement metrics to four decimal places.
# The last label previously read "Mutural"; the spelling is corrected.
print(f"Adjusted Rand Index: {ari:.4f}")
print(f"Normalized Mutual Information: {nmi:.4f}")
print(f"Adjusted Mutual Information: {ami:.4f}")