# Clustering - Ouray County Parcel Risk
**Author:** Bryce A Young  
**Created:** 2025-01-17 | 
**Modified:** 2025-01-17  

#### Overview
In this notebook, I do two things: 
1. I use unsupervised learning models to cluster home types **without** risk score as a variable. I then draw comparisons between clusters and average risk scores. 
2. I use PCA and t-SNE to reduce the dimensionality of the dataset to derive home archetypes and test the reduced dataset with supervised risk prediction methods.

*NOTE: It's possible that the supervised and unsupervised methods can work together. The supervised method generates a risk score, the clustering then groups homes into archetypes, and finally we can assign archetypes back to individual homes and assess how many of those homes burned in historic fires such as Palisades, Lahaina, Marshall and Camp.*

## Clustering
Clustering can be used to reveal structure between samples of data and assign group membership to similar groups of samples.

In [None]:
################################
# Load the data
################################
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs, make_moons

# Create the synthetic 2-D benchmark datasets used by the clustering cells.
n_samples = 1500
random_state = 170

# X0 / X1: Gaussian blobs with 2 and 5 centers respectively.
X0, _ = make_blobs(n_samples=n_samples, centers=2, n_features=2, random_state=0)
X1, _ = make_blobs(n_samples=n_samples, centers=5, n_features=2, random_state=0)

# X2: blobs pushed through a linear map, which stretches/shears the clusters.
X, y = make_blobs(n_samples=n_samples, random_state=random_state, cluster_std=1.3)
transformation = [[0.6, -0.6], [-0.2, 0.8]]
X2 = np.dot(X, transformation)

# X3: blobs with a different spread (cluster_std) per cluster.
X3, _ = make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state)

# X4: two noisy moons. Seeded like the other datasets so that
# Restart Kernel -> Run All regenerates exactly the same points.
X4, _ = make_moons(n_samples=n_samples, noise=.12, random_state=random_state)

# NOTE: from here on `X` is rebound from the intermediate blob array to the list of datasets.
X = [X0, X1, X2, X3, X4]
# The datasets are X[i], where i ranges from 0 to 4

In [None]:
################################
# Code to plot clusters
################################
def plot_cluster(ax, data, cluster_assignments):
    '''Plot two-dimensional data clusters, one scatter call per cluster label.

    Parameters
    ----------
    ax : matplotlib axis
        Axis to plot on
    data : list or numpy array of size [N x 2]
        Clustered data
    cluster_assignments : list or numpy array [N]
        Cluster assignments for each point in data. The label -1 marks
        samples that are not assigned to any cluster; they are drawn in
        gray and are not counted in the title.

    '''
    # Coerce to arrays so the boolean-mask indexing below also works when the
    # caller passes plain Python lists (as the signature documents).
    data = np.asarray(data)
    cluster_assignments = np.asarray(cluster_assignments)

    clusters = np.unique(cluster_assignments)
    # Number of real clusters, i.e. every distinct label except the noise label -1.
    n_clusters = int(np.count_nonzero(clusters != -1))

    for ca in clusters:
        # Unassigned samples (label -1) are colored gray; real clusters use
        # the axis' default color cycle.
        kwargs = {'color': 'gray'} if ca == -1 else {}
        members = cluster_assignments == ca
        ax.scatter(data[members, 0], data[members, 1], s=5, alpha=0.5, **kwargs)

    # Axis decoration does not depend on the cluster, so apply it once.
    ax.set_xlabel('feature 1')
    ax.set_ylabel('feature 2')
    ax.set_title(f'No. Clusters = {n_clusters}')
    ax.axis('equal')

In [None]:
# One panel per dataset, laid out in a single row
dataset_names = ["X0", "X1", "X2", "X3", "X4"]
fig, axes = plt.subplots(1, 5, figsize=(15, 3))

# Scatter the raw (unclustered) points of every dataset on its own axis
for panel, points, name in zip(axes, X, dataset_names):
    panel.scatter(points[:, 0], points[:, 1], s=10)
    panel.set_title(name)
    panel.axis('equal')  # same scale on both axes so cluster shapes are not distorted

# Tidy the spacing between panels and render
plt.tight_layout()
plt.show()

Now that we have viewed the basic shape of the data, we can create elbow curves for each dataset to determine what $k$ value to set for k-means clustering. (Refer to machine learning assignment 5)

In [None]:
from sklearn.cluster import KMeans
import warnings

# Suppress warnings
# NOTE: this silences *every* warning for the rest of the kernel session,
# not only the ones emitted by the cells below.
warnings.filterwarnings("ignore")

# Candidate numbers of clusters evaluated for every dataset
k_values = range(1, 11)

# Create a 2x3 grid of subplots (five datasets, so the last panel stays unused)
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
fig.suptitle('Elbow Curves for Different Datasets')

# Create an inertia list for each dataset
for i, dataset in enumerate(X):
    sumsqs = []

    # Fit k-means for each candidate k and record the fitted model's inertia_
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=0)
        kmeans.fit(dataset)
        sumsqs.append(kmeans.inertia_)

    # axes.flat walks the grid row by row, so index i lands on panel (i // 3, i % 3)
    ax = axes.flat[i]
    ax.plot(k_values, sumsqs, marker='o')
    ax.set_title(f'Elbow Curve for X{i}')
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel('Sum of Squares')

# Hide panels that have no dataset instead of showing an empty frame
for unused_ax in axes.flat[len(X):]:
    unused_ax.set_visible(False)

# Adjust layout (leaving room at the top for the suptitle) and show the plot
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

Now we can plot the cluster assignments.

In [None]:
#### K MEANS CLUSTERING ####

# List of best k values for each dataset (one entry per dataset in X, same order)
best_kmeans = [2, 4, 3, 3, 2]

# zip() would silently drop datasets if the two lists ever got out of sync
assert len(best_kmeans) == len(X), "best_kmeans needs exactly one k per dataset in X"

# Create a 2x3 grid of subplots (five datasets, so the last panel stays unused)
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
fig.suptitle('KMeans Clustered Data')

# Iterate over datasets and corresponding best k values
for i, (dataset, best_k) in enumerate(zip(X, best_kmeans)):
    # Fit the specified k-means model and get one cluster label per sample
    kmeans = KMeans(n_clusters=best_k, random_state=0)
    cluster_assignments = kmeans.fit_predict(dataset)

    # axes.flat walks the grid row by row, so index i lands on panel (i // 3, i % 3)
    ax = axes.flat[i]
    ax.scatter(dataset[:, 0], dataset[:, 1], c=cluster_assignments, cmap='viridis', s=10)
    ax.set_title(f'X{i} (k={best_k})')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

# Hide panels that have no dataset instead of showing an empty frame
for unused_ax in axes.flat[len(X):]:
    unused_ax.set_visible(False)

# Adjust layout (leaving room at the top for the suptitle) and show the plot
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
#### DBSCAN CLUSTERING ####

from sklearn.cluster import DBSCAN

# Create a 2x3 grid of subplots (five datasets, so the last panel stays unused)
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
fig.suptitle('DBSCAN Clustered Data')

# Iterate over datasets
for i, dataset in enumerate(X):
    # Apply DBSCAN clustering; the same eps / min_samples are used for every dataset
    dbscan = DBSCAN(eps=0.3, min_samples=4)
    cluster_assignments = dbscan.fit_predict(dataset)

    # DBSCAN labels samples it could not assign to any cluster with -1.
    # Draw those in gray so they are not mistaken for one more cluster.
    is_noise = cluster_assignments == -1
    is_clustered = ~is_noise

    # axes.flat walks the grid row by row, so index i lands on panel (i // 3, i % 3)
    ax = axes.flat[i]
    if is_clustered.any():
        ax.scatter(dataset[is_clustered, 0], dataset[is_clustered, 1],
                   c=cluster_assignments[is_clustered], cmap='viridis', s=10)
    if is_noise.any():
        ax.scatter(dataset[is_noise, 0], dataset[is_noise, 1], color='gray', s=10)
    ax.set_title(f'X{i}')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

# Hide panels that have no dataset instead of showing an empty frame
for unused_ax in axes.flat[len(X):]:
    unused_ax.set_visible(False)

# Adjust layout (leaving room at the top for the suptitle) and show the plot
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
#### SPECTRAL CLUSTERING ####

from sklearn.cluster import SpectralClustering as spc

# Create a 2x3 grid of subplots (five datasets, so the last panel stays unused)
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
fig.suptitle('Spectral Clustered Data')

# Iterate over datasets, reusing the per-dataset cluster counts chosen for k-means
for i, (dataset, n) in enumerate(zip(X, best_kmeans)):
    # Fit the specified spectral clustering model.
    # random_state is fixed (as in the k-means cells) so that the labels are
    # the same on every Restart Kernel -> Run All.
    spectral = spc(n_clusters=n, random_state=0)
    cluster_assignments = spectral.fit_predict(dataset)

    # axes.flat walks the grid row by row, so index i lands on panel (i // 3, i % 3)
    ax = axes.flat[i]
    ax.scatter(dataset[:, 0], dataset[:, 1], c=cluster_assignments, cmap='viridis', s=10)
    ax.set_title(f'X{i} (k={n})')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

# Hide panels that have no dataset instead of showing an empty frame
for unused_ax in axes.flat[len(X):]:
    unused_ax.set_visible(False)

# Adjust layout (leaving room at the top for the suptitle) and show the plot
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

## Dimensionality Reduction
Here we use Principal Components Analysis (PCA) and t-distributed stochastic neighbor embedding (t-SNE) to reduce the dimensionality of the dataset. Then we will compare the two techniques and assess which tended to cluster best.

In [None]:
################################
# Load the data
################################
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Load the digits dataset and unroll every image into one flat feature row
digits = datasets.load_digits()
n_sample = digits.target.shape[0]
image_rows, image_cols = digits.images.shape[1:3]
n_feature = image_rows * image_cols

# Row-major reshape of the whole image stack at once; astype returns a fresh
# float64 array of shape (n_sample, n_feature)
X_digits = digits.images.reshape(n_sample, n_feature).astype(np.float64)
y_digits = digits.target

In [None]:
# Report how many rows (samples) and columns (features) the flattened digits matrix has
num_samples = X_digits.shape[0]
num_features = X_digits.shape[1]

for line in (f"Number of samples: {num_samples}",
             f"Number of features: {num_features}"):
    print(line)

In [None]:
# Apply PCA
n_components = 2  # Number of components for PCA
# random_state only matters when scikit-learn picks a randomized/arpack solver;
# fixing it keeps the projection identical across runs and library versions.
pca = PCA(n_components=n_components, random_state=0)
X_pca = pca.fit_transform(X_digits)

# Plot the data in 2D space with labels
plt.figure(figsize=(7.5, 6))

# One scatter call per class present in the labels, so each gets its own
# color and legend entry
for digit in np.unique(y_digits):
    indices = (y_digits == digit)
    plt.scatter(X_pca[indices, 0], X_pca[indices, 1], label=str(digit), s=20)

plt.title('PCA of Digits Dataset (2D)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
# Anchor the legend outside the axes so it does not cover any points
plt.legend(title='Digit', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

Validation: cumulative fraction of variance explained by principal components.

In [None]:
# Fit a PCA that keeps every component (no n_components given)
pca1 = PCA()
pca1.fit(X_digits)

# Running total of the per-component explained-variance ratios
cumulative_var_ratio = np.cumsum(pca1.explained_variance_ratio_)
component_counts = range(1, len(cumulative_var_ratio) + 1)

# Draw the cumulative curve on an explicit axis
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_counts, cumulative_var_ratio, marker='.', linestyle='-')
ax.set_title('Cumulative Fraction of Variance Explained by Principal Components')
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Fraction of Variance Explained')
ax.grid(True)
plt.show()

In [None]:
# Per-component explained-variance ratios of the 2-component PCA fitted earlier
fraction_variance_explained = pca.explained_variance_ratio_
pc1_ratio = fraction_variance_explained[0]
pc2_ratio = fraction_variance_explained[1]

# Whatever the first two components do not capture
unexplained_ratio = 1 - (pc1_ratio + pc2_ratio)

# Report each component's share, then the remainder
print("Fraction of Variance Explained by the First 2 Principal Components:")
print(f"Principal Component 1: {pc1_ratio:.4f}")
print(f"Principal Component 2: {pc2_ratio:.4f}")
print(f"Total Cumulative Variance Unexplained: {unexplained_ratio: .4f}")

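### t-SNE
Embed the same digits data in two dimensions with t-SNE and color the points by their true label, so the result can be compared with the PCA projection.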

In [None]:
# Apply t-SNE
import inspect

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later
# releases drop the old name. Pick whichever keyword the installed version
# accepts so this cell runs on both old and new scikit-learn.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

tsne = TSNE(n_components=2,
            perplexity=25,
            random_state=0,
            **{tsne_iter_kwarg: 500}
           )
X_tsne = tsne.fit_transform(X_digits)

# Plot the data in 2D with associated labels
plt.figure(figsize=(7.5, 6))

# One scatter call per class present in the labels, so each gets its own
# color and legend entry
for digit in np.unique(y_digits):
    indices = (y_digits == digit)
    plt.scatter(X_tsne[indices, 0], X_tsne[indices, 1], label=str(digit), s=15)

plt.title('t-SNE of Digits Dataset (2D) with True Labels')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
# Anchor the legend outside the axes so it does not cover any points
plt.legend(title='Digit', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()