In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram
import warnings
warnings.filterwarnings('ignore')

# Load the dataset.
# Every field is read as a raw string: with pandas' defaults, passwords such as
# "NA", "null" or "nan" are converted to NaN (and later masked as the text "nan"),
# and numeric-looking passwords may be coerced to numbers (e.g. losing leading zeros).
df = pd.read_csv(
    'data/breachcompilation.csv',
    header=None,
    dtype=str,
    keep_default_na=False,
)
passwords = df[0].values  # Get passwords from first column

# Reduce dataset to a reproducible random sample of (at most) 30k passwords.
# The size is capped at the number of available rows because sampling without
# replacement raises when more items are requested than exist.
np.random.seed(42)
passwords = np.random.choice(passwords, size=min(30000, len(passwords)), replace=False)

In [2]:
def get_char_type(char):
    """Map a single character to a one-digit class code (returned as a string).

    The rules are tried in order and the first one that matches wins:
      '1'  frequent lowercase letter (one of "esaitnruol")
      '2'  any other lowercase character
      '3'  frequent uppercase letter (one of "ESAITNRUOL")
      '4'  any other uppercase character
      '5'  digit 0-9
      '6'  frequent special character (one of "><-?.!/,%@&")
      '7'  any other non-alphanumeric character
      '0'  anything that matched none of the above
    """
    # (code, predicate) pairs; order matters because the classes overlap
    ordered_rules = (
        ('1', lambda c: c in 'esaitnruol'),
        ('2', lambda c: c.islower()),
        ('3', lambda c: c in 'ESAITNRUOL'),
        ('4', lambda c: c.isupper()),
        ('5', lambda c: c in '0123456789'),
        ('6', lambda c: c in '><-?.!/,%@&'),
        ('7', lambda c: not c.isalnum()),
    )
    for code, matches in ordered_rules:
        if matches(char):
            return code
    return '0'

def mask_password(password):
    """Return the class-code string for a password.

    The input is converted with str() first, then every character is replaced
    by the code that get_char_type assigns to it.
    """
    text = str(password)
    codes = map(get_char_type, text)
    return ''.join(codes)

# Build the masked (character-class) form of every sampled password
masked_passwords = np.array(list(map(mask_password, passwords)))

In [3]:
def create_features(masked_pass, max_len=30):
    """Turn masked passwords into a fixed-width numeric feature matrix.

    Parameters
    ----------
    masked_pass : sequence of str
        Masked passwords, i.e. strings made only of digit characters.
    max_len : int, default 30
        Number of feature columns. Longer masks are truncated to this length;
        shorter ones are right-padded with 0. This is a fixed width, not the
        longest password found in the data.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(masked_pass), max_len); cell [i, j] holds the
        integer value of character j of mask i, or 0 where the mask is shorter.
    """
    # Zero-initialised, so positions past the end of a short mask are already padded
    X = np.zeros((len(masked_pass), max_len))

    for i, password in enumerate(masked_pass):
        # Keep at most max_len characters and convert each digit character to int
        digits = [int(char) for char in str(password)[:max_len]]
        if digits:
            X[i, :len(digits)] = digits

    return X

# Encode the masked passwords as a numeric feature matrix
X = create_features(masked_passwords)

# Standardise the features; the fitted scaler is reused later to transform new passwords
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [4]:
# Fit Ward-linkage agglomerative clustering. No cluster count is given
# (n_clusters=None); the tree is cut using distance_threshold instead.
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=8, linkage='ward')
clusters = clustering.fit(X_scaled)

# Centroid of each cluster: mean of the scaled rows that carry its label
cluster_centers = np.array([
    np.mean(X_scaled[clusters.labels_ == label], axis=0)
    for label in range(clustering.n_clusters_)
])

from sklearn.metrics import silhouette_score

# Report the mean silhouette coefficient of the resulting labelling
silhouette_avg = silhouette_score(X_scaled, clusters.labels_)
print(f"Silhouette Score: {silhouette_avg}")

Silhouette Score: 0.5648600699718502


In [5]:
def compute_password_similarity(password, clustering, X_scaled, scaler,
                                max_reasonable_dist=10.0, decay=2.0):
    """
    Compute similarity percentage between input password and breached password clusters.

    The password is masked, encoded and scaled the same way as the training data,
    then compared (Euclidean distance) with the centroid of every cluster. The
    smallest distance d is mapped to 100 * exp(-decay * d / max_reasonable_dist).

    Parameters:
    -----------
    password : str
        Input password to check
    clustering : AgglomerativeClustering
        Fitted clustering model (its labels_ and n_clusters_ are read)
    X_scaled : array
        Scaled feature matrix used for clustering; row i must correspond to
        clustering.labels_[i]
    scaler : StandardScaler
        Fitted scaler used to transform features
    max_reasonable_dist : float, default 10.0
        Distance used to normalise the minimum distance; must be positive.
        May need to be adjusted based on the data.
    decay : float, default 2.0
        Steepness of the exponential decay applied to the normalised distance

    Returns:
    --------
    float
        Similarity percentage (0-100)
    int
        Most similar cluster ID

    Raises:
    -------
    ValueError
        If max_reasonable_dist is not positive
    """
    if max_reasonable_dist <= 0:
        raise ValueError("max_reasonable_dist must be positive")

    # Encode the password like the training data: mask -> features -> scale
    masked_pwd = mask_password(password)
    X_new = create_features([masked_pwd])
    X_new_scaled = scaler.transform(X_new)

    # Centroid of each cluster = mean of the scaled training rows with that label
    labels = clustering.labels_
    centers = np.array([
        np.mean(X_scaled[labels == i], axis=0)
        for i in range(clustering.n_clusters_)
    ])

    # Distance from the single query row to every centroid in one call
    distances = np.linalg.norm(centers - X_new_scaled, axis=1)
    closest_cluster = int(np.argmin(distances))
    min_dist = distances[closest_cluster]

    # Convert distance to similarity percentage via exponential decay
    normalized_dist = min_dist / max_reasonable_dist
    similarity = 100 * np.exp(-decay * normalized_dist)

    # Clip to 100% and return built-in types, as documented (not numpy scalars)
    return float(min(similarity, 100.0)), closest_cluster

In [6]:
# Score one example password against the fitted clusters
test_password = "ikhwan2002"
similarity, cluster = compute_password_similarity(
    password=test_password,
    clustering=clustering,
    X_scaled=X_scaled,
    scaler=scaler,
)

print(f"Password similarity: {similarity:.2f}%")
print(f"Most similar cluster: {cluster}")

Password similarity: 85.38%
Most similar cluster: 175


In [7]:
import joblib

# Show how many clusters the fitted model ended up with
print(clustering.n_clusters_)

# Persist the clustering model, the scaler and the scaled training matrix
# together in one file
joblib.dump(
    dict(clustering=clustering, scaler=scaler, X_scaled=X_scaled),
    'model3.pkl',
)

print("Model saved to model3.pkl")

496
Model saved to model3.pkl


In [8]:
import joblib

# Load the persisted bundle.
# NOTE: joblib.load unpickles arbitrary objects, so only load files from a trusted source.
loaded_model = joblib.load('model3.pkl')
clustering = loaded_model['clustering']
scaler = loaded_model['scaler']
# Restore the scaled training matrix too: compute_password_similarity needs it to
# rebuild the cluster centroids, and the bundle stores it under the 'X_scaled' key.
X_scaled = loaded_model['X_scaled']

# Check the loaded model and scaler
print("Clustering model loaded:", clustering)
print("Scaler loaded:", scaler)


Clustering model loaded: AgglomerativeClustering(distance_threshold=8, n_clusters=None)
Scaler loaded: StandardScaler()
