In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial.distance import pdist, squareform, cdist
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from umap import UMAP

# 1. Dataset Preparation

## 1.1 Loading Dataset

In [None]:
# Load the dataset from a path relative to the notebook's working directory.
dataset = pd.read_csv("data/final_dataset.csv")
print("Shape of dataset:", dataset.shape)
# Last expression of the cell: renders the first rows as a table.
dataset.head()

In [None]:
# True if at least one cell of the whole frame is missing.
print("There are NaN values:", dataset.isnull().values.any())

In [None]:
# List the available attributes before deciding which ones to drop.
print(dataset.columns)

## 1.2 Outlier detection

In [None]:
def return_whiskers(df, column, whisker_factor=1.5):
    """Return the lower and upper Tukey fences of ``df[column]``.

    The fences are ``Q1 - whisker_factor * IQR`` and ``Q3 + whisker_factor * IQR``,
    where ``IQR = Q3 - Q1``. Values outside the fences are what a box plot
    conventionally flags as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to analyse.
    column : str
        Name of a numeric column of ``df``.
    whisker_factor : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    tuple
        ``(lower_whisker, upper_whisker)``.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    spread = whisker_factor * (q3 - q1)
    return q1 - spread, q3 + spread

In [None]:
# Box plot of povertyPercentage to inspect its spread and outliers visually.
boxplot = dataset.boxplot(column=['povertyPercentage'])
plt.show()

In [None]:
# Count the rows whose povertyPercentage lies strictly outside the whisker range.
lower_whisker_poverty, upper_whisker_poverty = return_whiskers(dataset, 'povertyPercentage')
poverty_values = dataset['povertyPercentage']
poverty_is_outlier = (poverty_values < lower_whisker_poverty) | (poverty_values > upper_whisker_poverty)
print("Amount of outliers in povertyPercentage:", int(poverty_is_outlier.sum()))

Values for poverty percentages that are outliers are still realistic and close to whiskers. Furthermore, poverty percentage is a key attribute of our analysis, so we decide to keep all the data points.

In [None]:
# Box plot of the average participant age.
boxplot = dataset.boxplot(column = ['avg_age_participants'])
plt.show()

In [None]:
# Box plot of killed_ratio.
boxplot = dataset.boxplot(column = ['killed_ratio'])
plt.show()

Drop outliers for unrealistic values of avg age participants which may negatively influence clustering.

In [None]:
# Whisker range of avg_age_participants; only the upper bound is used below.
lower_whisker_avg_age, upper_whisker_avg_age = return_whiskers(dataset, 'avg_age_participants')

# Keep the rows at or below the upper whisker (rows with a missing age fail the
# comparison and are removed as well). `dataset` is overwritten, so re-running
# this cell recomputes the whisker on the already filtered data.
age_within_upper_whisker = dataset['avg_age_participants'] <= upper_whisker_avg_age
dataset = dataset.loc[age_within_upper_whisker]
print("Shape of dataset after removing outliers:", dataset.shape)

In [None]:
# Side-by-side box plots of the two party vote ratios.
boxplot = dataset.boxplot(column = ['democrats_ratio', 'republicans_ratio'])
plt.show()

Values of 100% or 0% are pretty unrealistic, so we decide to drop all over 95% (or below 5%), considering them as outliers / wrongly observed data. Moreover, we will keep only one of the two ratios, and also drop the winning party attribute. For this reason, we decide to drop strange observations where the party that won had less than 50% of the votes.

In [None]:
# Report how many rows have a vote ratio of exactly 0 or 1.
print("Amount of republicans_ratio outliers:", int(dataset['republicans_ratio'].isin([0, 1]).sum()))
print("Amount of democrats_ratio outliers:", int(dataset['democrats_ratio'].isin([0, 1]).sum()))

# Keep only rows with 0.05 < republicans_ratio < 0.95 (both bounds excluded).
# Note that this removes more rows than the exact 0/1 counts printed above.
dataset = dataset[(dataset['republicans_ratio'] > 0.05) & (dataset['republicans_ratio'] < 0.95)]
print("Shape of dataset after removing outliers:", dataset.shape)

In [None]:
# A "strange" win is a row where the recorded winner is on the wrong side of 50%
# of republicans_ratio. As implied by the filter at the end of this cell,
# party == 1 is paired with a Republican majority and party == 0 with a
# Republican minority.
rep_above_half = dataset['republicans_ratio'] > 0.5
rep_below_half = dataset['republicans_ratio'] < 0.5
party_is_one = dataset['party'] == 1
party_is_zero = dataset['party'] == 0

democrats_strange = int((rep_above_half & party_is_zero).sum())
republicans_strange = int((rep_below_half & party_is_one).sum())

print("Amount of times democrats won and the percentage of votes for that part was <0.5:", democrats_strange)
print("Amount of times republicans won and the percentage of votes for that part was <0.5:", republicans_strange)
print("Amount of strange wins:", democrats_strange + republicans_strange)

# Keep only the consistent rows. Both kinds of strange wins are removed, and so
# is any row with republicans_ratio exactly 0.5 (it matches neither branch).
is_consistent = (rep_above_half & party_is_one) | (rep_below_half & party_is_zero)
dataset = dataset[is_consistent]
print("Shape of dataset after removing outliers:", dataset.shape)


## 1.3 Dropping columns for different algorithms

In [None]:
# Attributes excluded from clustering.
dropped_columns = [
    'min_age_participants',
    'max_age_participants',
    'teen_ratio',
    'totalvotes',
    'year',
    'party',
    'democrats_ratio',
]
print("Attributes to drop:", dropped_columns)
# `columns=` already selects the axis, so no `axis` argument is needed.
dataset_reduced = dataset.drop(columns=dropped_columns)
print("Shape of dataset:", dataset_reduced.shape)

In [None]:
# Subset used by the algorithms that are run on a single state.
selected_state = "Florida"
# Rows whose `state_<name>` indicator column equals True.
dataset_reduced_florida = dataset_reduced[dataset_reduced["state_" + selected_state] == True]

In [None]:
# The state_* indicator columns are removed from both the full frame and the
# Florida subset.
dropped_columns = [name for name in dataset_reduced.columns if name.startswith('state_')]
print("Attributes to drop:", dropped_columns)

dataset_reduced = dataset_reduced.drop(columns=dropped_columns)
dataset_reduced_florida = dataset_reduced_florida.drop(columns=dropped_columns)

print("Shape of dataset:", dataset_reduced.shape)
print("Shape of dataset for florida:", dataset_reduced_florida.shape)

## 1.4 Scaling

In [None]:
# Min-max scale every attribute. `scaler` is kept because it is used later to
# map the K-Means centroids back to the original units.
scaler = MinMaxScaler()
scaled_dataset = scaler.fit_transform(dataset_reduced.values)

# The Florida subset gets its own scaler, fitted on Florida rows only, so its
# scaled values are not comparable with `scaled_dataset`.
scaler_florida= MinMaxScaler()
scaled_dataset_florida = scaler_florida.fit_transform(dataset_reduced_florida.values)

print("Shape of scaled dataset:", scaled_dataset.shape)
print("Shape of scaled dataset for florida:", scaled_dataset_florida.shape)

# 2. K Means

## 2.1 Identification of the best value of k

In [None]:
# Evaluate K-Means for k = 2..max_k with three quality measures.
# A fixed random_state makes the K-Means initialisation and the silhouette
# subsample identical on every run, so the curves are reproducible.
sse_list = []
silhouette_list = []
davies_bouldin_list = []

max_k = 20
for k in tqdm(range(2, max_k + 1)):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_dataset)

    sse_list.append(kmeans.inertia_)
    # The silhouette is computed on a random subsample of at most 10000 points.
    silhouette_list.append(
        silhouette_score(scaled_dataset, kmeans.labels_, sample_size=10000, random_state=42)
    )
    davies_bouldin_list.append(davies_bouldin_score(scaled_dataset, kmeans.labels_))

In [None]:
# One panel per quality measure, all plotted against k (which starts at 2).
fig, ax = plt.subplots(3, 1, figsize=(20, 15))
metric_curves = [
    ('SSE', sse_list),
    ('Silhouette Score', silhouette_list),
    ('Davies Bouldin Score', davies_bouldin_list),
]
for panel, (metric_name, metric_values) in zip(ax, metric_curves):
    k_values = range(2, len(metric_values) + 2)
    panel.plot(k_values, metric_values)
    panel.set_ylabel(metric_name, fontsize=22)
    panel.set_xticks(k_values)

# plt.xlabel acts on the current axes, i.e. the bottom panel.
plt.xlabel('K', fontsize=22)
plt.show()

# NICER PLOTS

## 2.2 Analysis of the centroids and clusters

In [None]:
# Final K-Means model. The seed keeps the cluster numbering stable between
# runs; later cells refer to specific cluster ids, which would otherwise point
# to different clusters after every restart.
n_clusters = 9
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(scaled_dataset)

In [None]:
# Sanity check: one label per row and one centroid per cluster.
print("Shape of scaled_dataset: ", scaled_dataset.shape)
print("Shape of kmeans.labels_: ", kmeans.labels_.shape)
print("Shape of kmeans.cluster_centers_: ", kmeans.cluster_centers_.shape)

In [None]:
# Map the centroids back to the original attribute units for interpretation.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
centers_df = pd.DataFrame(centers, columns=dataset_reduced.columns)
# Show every centroid: there is one row per cluster, so a fixed head(7) would
# hide some of them when n_clusters > 7.
centers_df

In [None]:
# Scaled points of every cluster, and the size of every cluster.
cluster_points = [scaled_dataset[kmeans.labels_ == cluster_id] for cluster_id in range(n_clusters)]
cluster_num_points = [len(points) for points in cluster_points]

cluster_num_points, cluster_points[0].shape

In [None]:
# Despite the names, the values computed here are *mean* squared distances
# (SSE divided by the number of points), which makes clusters of different
# sizes comparable.

# Whole dataset: mean squared distance to the global centroid. This is what a
# 1-cluster K-Means would report as inertia / n, without fitting a model.
global_centroid = scaled_dataset.mean(axis=0)
total_SSE = float(np.sum((scaled_dataset - global_centroid) ** 2) / len(scaled_dataset))

# Per cluster: mean squared distance of the members to their own centroid.
cluster_SSE = [
    float(np.sum((points - center) ** 2) / len(points))
    for points, center in zip(cluster_points, kmeans.cluster_centers_)
]

print("Cluster SSE: ", cluster_SSE)
print("Total SSE: ", total_SSE)
print("SSE of cluster with min SSE: ", min(cluster_SSE))
print("SSE of cluster with max SSE: ", max(cluster_SSE))
print("Mean of SSE: ", np.mean(cluster_SSE))

In [None]:
# Variance of the squared point-to-centroid distances: how unevenly the points
# are spread around their centroid, for the whole dataset and for each cluster.
dataset_centroid = np.mean(scaled_dataset, axis=0)

# TODO: rethink this part — it compares against total_SSE computed in a previous cell.
squared_dist_to_dataset_centroid = np.sum((scaled_dataset - dataset_centroid) ** 2, axis=1)
total_distance_variance = float(np.mean((squared_dist_to_dataset_centroid - total_SSE) ** 2))

cluster_distance_variance = []
for i in range(n_clusters):
    squared_dist = np.sum((cluster_points[i] - kmeans.cluster_centers_[i]) ** 2, axis=1)
    cluster_distance_variance.append(float(np.mean((squared_dist - cluster_SSE[i]) ** 2)))

print("Cluster distance variance: ", cluster_distance_variance)
print("Total distance variance: ", total_distance_variance)
# These are the extremes of the variance itself, not the variance of the
# clusters that have the extreme SSE.
print("Min cluster distance variance: ", min(cluster_distance_variance))
print("Max cluster distance variance: ", max(cluster_distance_variance))
print("Mean of distance variance: ", np.mean(cluster_distance_variance))

Construct a distance matrix among cluster centroids.

In [None]:
# Pairwise distances between centroids (pdist's default metric), expanded to a
# square matrix and annotated with two decimals.
centroid_distance_matrix = squareform(pdist(kmeans.cluster_centers_))
sns.heatmap(centroid_distance_matrix, annot=True, fmt = '.2f', cmap='crest')
plt.show()

Construct matrix displaying correlation of attribute values to belonging to a certain cluster.

In [None]:
# One-hot encode the K-Means labels: row r of the identity matrix is the
# indicator vector of cluster r.
onehot = np.eye(n_clusters)[kmeans.labels_]

# Correlate every attribute with every cluster-membership indicator.
n_features = scaled_dataset.shape[1]
onehot_corr = np.corrcoef(scaled_dataset, onehot, rowvar=False)
# Only the attribute-vs-cluster block of the full correlation matrix is shown.
feature_cluster_corr = onehot_corr[:n_features, n_features:]

cmap = sns.diverging_palette(230, 20, as_cmap=True)

sns.heatmap(feature_cluster_corr, cmap=cmap)

# Label the rows with the attribute names.
plt.yticks(np.arange(n_features) + 0.5, dataset_reduced.columns, rotation=0, fontsize=12)
plt.show()

Compute the pairwise distance matrix of a random sample of the dataset, with the sampled points sorted by cluster label.

In [None]:
# Downsample to at most 1000 points: the pairwise matrix grows quadratically.
# The size is capped by the number of rows (sampling without replacement needs
# at least that many) and the generator is seeded for a reproducible figure.
sample_size = min(1000, scaled_dataset.shape[0])
samples = np.random.default_rng(42).choice(scaled_dataset.shape[0], sample_size, replace=False)
downsampled_dataset = scaled_dataset[samples]
downsampled_labels = kmeans.labels_[samples]

# Sort by cluster label so that members of the same cluster are adjacent.
sorted_indexes = np.argsort(downsampled_labels)
downsampled_dataset = downsampled_dataset[sorted_indexes]
downsampled_labels = downsampled_labels[sorted_indexes]

# Pairwise Euclidean distances (Minkowski with p=2).
pdist_matrix = squareform(pdist(downsampled_dataset, metric='minkowski', p=2))
sns.heatmap(pdist_matrix, cmap='crest')
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# (number of clusters, number of attributes)
kmeans.cluster_centers_.shape

In [None]:
# Embed the sampled points together with the centroids so that both share the
# same 2-D map. The centroids are the last n_clusters rows.
tsne = TSNE(n_components=2, n_jobs=-1, random_state=42)
tsne_dataset = np.concatenate((downsampled_dataset, kmeans.cluster_centers_))
tsne_labels = np.concatenate((downsampled_labels, np.arange(n_clusters)))
tsne_map = tsne.fit_transform(tsne_dataset)

point_map, centroid_map = tsne_map[:-n_clusters], tsne_map[-n_clusters:]
point_labels, centroid_labels = tsne_labels[:-n_clusters], tsne_labels[-n_clusters:]

# Fix the colour scale explicitly. If each scatter autoscaled on its own labels,
# a cluster missing from the sample would shift the colours of the points
# relative to the colours of the centroids.
color_scale = dict(cmap='tab10', vmin=0, vmax=n_clusters - 1)
scatter = plt.scatter(point_map[:, 0], point_map[:, 1], c=point_labels, s=10, **color_scale)
plt.scatter(centroid_map[:, 0], centroid_map[:, 1], c=centroid_labels, s=100,
            marker='*', edgecolors='black', **color_scale)

# One legend entry per cluster present in the sample, in ascending order.
legend_entries = [
    plt.Line2D([0], [0], marker='o', color='w',
               markerfacecolor=scatter.cmap(scatter.norm(label)),
               markersize=8, label=f'Cluster {label}')
    for label in np.unique(point_labels)
]

plt.legend(handles=legend_entries, loc='lower right')
# Hand-picked x range, kept from the original figure.
plt.xlim(-50,100)
plt.show()

## 2.3 Distribution of variables: within clusters vs whole dataset

Analysis of the distribution of Republicans vs. Democrats in the clusters.

In [None]:
# Rows: K-Means cluster id. Columns: whether republicans_ratio is above 0.5.
# kmeans.labels_ is a plain array; the model was fitted on the values of
# dataset_reduced, so the two are in the same row order.
party_xt_pct = pd.crosstab(kmeans.labels_, dataset_reduced['republicans_ratio'] > 0.5)
party_xt_pct

In [None]:
# Grouped bars: for every cluster, the number of rows below/above a 0.5
# republicans_ratio. The bar height is a row count, hence the y label.
party_xt_pct.plot(kind='bar', stacked=False,
                   title='Party per cluster')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.show()

In [None]:

# Rows: K-Means cluster id. Columns: values of the populous_city attribute.
populous_city_xt_pct = pd.crosstab(kmeans.labels_, dataset_reduced['populous_city'])
populous_city_xt_pct

In [None]:
# Grouped bars: for every cluster, the number of rows per populous_city value.
# The bar height is a row count, so the y axis is labelled accordingly.
populous_city_xt_pct.plot(kind='bar', stacked=False,
                   title='Populous city per cluster')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.show()

Let's explore some continuous variables, such as the arrested ratio, and look at clusters positively and negatively correlated to that variable. We can see that the distributions are very different amongst the two clusters, and they represent different trends which are present in the full distribution.

In [None]:
# Compare the arrested_ratio distribution inside two selected clusters with the
# distribution over the whole dataset. The cluster ids are hard-coded, so they
# are only meaningful for the K-Means run they were chosen from.
arrested_analysis = [3, 4]
dataset_reduced_with_clusters = dataset_reduced.assign(cluster=kmeans.labels_)
in_selected_clusters = dataset_reduced_with_clusters['cluster'].isin(arrested_analysis)
dataset_reduced_with_clusters_3_4 = dataset_reduced_with_clusters[in_selected_clusters]

# Density per selected cluster.
sns.displot(dataset_reduced_with_clusters_3_4, x="arrested_ratio", kind='kde', hue="cluster")
plt.gcf().set_size_inches(10, 5)

# Density over all rows.
sns.displot(dataset_reduced_with_clusters, x="arrested_ratio", kind='kde')
plt.gcf().set_size_inches(10, 5)
plt.show()


# 3. DBSCAN

## 3.1 Identification of best eps value

In [None]:
# Distance of every Florida point to its k-th nearest neighbour.
k = 3
# k+1 neighbours are requested because the query points are the training
# points themselves, so column 0 is each point's distance to itself.
nbrs = NearestNeighbors(n_neighbors=k+1, algorithm='ball_tree', n_jobs=-1).fit(scaled_dataset_florida)
distances, indices = nbrs.kneighbors(scaled_dataset_florida)
kth_distances = distances[:, k]
distances.shape

In [None]:
# Sorted k-distance curve, with a horizontal reference line as candidate eps.
plt.plot(range(0, len(kth_distances)), sorted(kth_distances))
# NOTE(review): the reference line is at 0.5 while the DBSCAN cell uses eps=0.55
# and min_samples=150 (k is 3 here) — confirm which values are intended.
plt.axhline(y = 0.5, color = 'r')
plt.ylabel('dist from %sth neighbor' % k, fontsize=18)
plt.xlabel('sorted distances', fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.show()

## 3.2 Cluster Analysis

In [None]:
# Run DBSCAN on the scaled Florida subset.
dbscan = DBSCAN(eps=0.55, min_samples=150)
dbscan.fit(scaled_dataset_florida)
# Distinct label values and the number of points carrying each of them.
# NOTE(review): DBSCAN labels noise as -1, so when noise is present `n_clusters`
# also counts the noise group; the following cells rely on this via `i-1`.
labels, cluster_num_points = np.unique(dbscan.labels_, return_counts=True)
n_clusters = len(labels)
cluster_num_points

In [None]:
# Points of every DBSCAN group, in ascending label order (the noise group, -1,
# comes first when it exists). Iterating over the labels that actually occur
# avoids assuming that a noise group is present.
cluster_points = [
    scaled_dataset_florida[dbscan.labels_ == label]
    for label in np.unique(dbscan.labels_)
]

In [None]:
# Quality scores computed on the raw DBSCAN labels.
# NOTE(review): noise points (label -1) are passed as if they were one more
# cluster — confirm that this is intended.
print('Silhouette %s' % silhouette_score(scaled_dataset_florida, dbscan.labels_))
print('Davies-Bouldin %s' % davies_bouldin_score(scaled_dataset_florida, dbscan.labels_))

In [None]:
# One-hot encode the DBSCAN labels. The labels are first mapped to positions
# 0..n_labels-1: using the raw labels as column indices would make the noise
# label -1 wrap around to the last column.
dbscan_label_values, label_positions = np.unique(dbscan.labels_, return_inverse=True)
onehot = np.eye(len(dbscan_label_values))[label_positions]

# Correlate every attribute with every group-membership indicator.
n_features = scaled_dataset_florida.shape[1]
onehot_corr = np.corrcoef(scaled_dataset_florida, onehot, rowvar=False)

cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Columns are labelled with the real DBSCAN labels (-1 is noise).
sns.heatmap(onehot_corr[:n_features, n_features:], cmap=cmap, xticklabels=dbscan_label_values)

# Label the rows with the attribute names.
plt.yticks(np.arange(n_features) + 0.5, dataset_reduced_florida.columns, rotation=0, fontsize=12)
plt.show()

In [None]:
# Downsample to at most 1000 points: the pairwise matrix grows quadratically.
# The size is capped by the number of rows (sampling without replacement needs
# at least that many) and the generator is seeded for a reproducible figure.
sample_size = min(1000, scaled_dataset_florida.shape[0])
samples = np.random.default_rng(42).choice(scaled_dataset_florida.shape[0], sample_size, replace=False)
downsampled_dataset = scaled_dataset_florida[samples]
downsampled_labels = dbscan.labels_[samples]

# Sort by label so that members of the same group are adjacent.
sorted_indexes = np.argsort(downsampled_labels)
downsampled_dataset = downsampled_dataset[sorted_indexes]
downsampled_labels = downsampled_labels[sorted_indexes]

# Pairwise Euclidean distances (Minkowski with p=2).
pdist_matrix = squareform(pdist(downsampled_dataset, metric='minkowski', p=2))
sns.heatmap(pdist_matrix, cmap='crest')
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# 2-D t-SNE embedding of the whole scaled Florida subset.
tsne = TSNE(n_components=2, n_jobs=-1)
tsne_dataset = tsne.fit_transform(scaled_dataset_florida)

In [None]:
# colors[0] (gray) is reserved for noise (label -1); cluster c uses one of the
# remaining colours, cycling if there are more clusters than colours.
colors = ['gray', 'red', 'blue', 'green', 'yellow', 'purple', 'orange', 'cyan', 'brown']
# Iterate over the labels that actually occur, so that no cluster is skipped
# when DBSCAN produced no noise group.
for label in np.unique(dbscan.labels_):
    in_group = dbscan.labels_ == label
    color = colors[0] if label < 0 else colors[1 + label % (len(colors) - 1)]
    plt.scatter(tsne_dataset[in_group, 0], tsne_dataset[in_group, 1], s=3, c=color)
plt.show()

In [None]:
# Fit a UMAP reducer (default parameters) on the scaled Florida subset.
umap_reducer = UMAP().fit(scaled_dataset_florida)


In [None]:
# Project the Florida points, i.e. the data DBSCAN was run on. Projecting the
# full scaled dataset would give an array whose length does not match
# dbscan.labels_, and the boolean masks below would fail.
umap_dataset = umap_reducer.transform(scaled_dataset_florida)
for i in range(n_clusters):
    in_group = dbscan.labels_ == i-1
    plt.scatter(umap_dataset[in_group, 0], umap_dataset[in_group, 1], s=3, c=colors[i])
plt.show()

In [None]:
# Geographic position of the Florida rows, one colour per DBSCAN group.
# Iterating over the labels that actually occur avoids skipping the last
# cluster when DBSCAN produced no noise group.
for label in np.unique(dbscan.labels_):
    in_group = dbscan.labels_ == label
    plt.scatter(dataset_reduced_florida["longitude"][in_group],
                dataset_reduced_florida["latitude"][in_group], s=10)

# Hand-picked window around Florida.
plt.xlim(-88, -78)
plt.ylim(26, 36)
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.show()

## 3.3 Distribution of variables: within clusters vs whole dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.manifold import TSNE
from umap import UMAP
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm

In [None]:
# Reload the raw CSV and keep only the rows of the selected state.
# NOTE(review): `dataset` and `dataset_reduced` from the previous sections are
# overwritten here, and the outlier filters applied there are not repeated.
selected_state = "Florida"
dataset = pd.read_csv("data/final_dataset.csv")
dataset = dataset.loc[dataset["state_" + selected_state] == True]

# Drop the state indicators and the attributes excluded from clustering in a
# single, non in-place call.
dropped_columns = [name for name in dataset.columns if name.startswith('state_')]
excluded_attributes = [
    "min_age_participants",
    "max_age_participants",
    "totalvotes",
    "teen_ratio",
    "povertyPercentage",
    "year",
    "democrats_ratio",
    "party",
]
dataset_reduced = dataset.drop(columns=dropped_columns + excluded_attributes)

dataset_reduced.head()

In [None]:
# Keep the numeric and boolean columns through the public pandas API instead
# of the private DataFrame._get_numeric_data(), which may change without notice.
numeric_dataset = dataset_reduced.select_dtypes(include=["number", "bool"])
numeric_dataset.head()

In [None]:
# (rows, numeric attributes) that will be clustered.
numeric_dataset.values.shape

In [None]:
# Min-max scale the numeric attributes.
# NOTE(review): this rebinds `scaler` and `scaled_dataset` from the K-Means
# section; earlier cells re-run after this one would see the Florida-only data.
scaler = MinMaxScaler()
scaled_dataset = scaler.fit_transform(numeric_dataset.values)
scaled_dataset.shape

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted agglomerative clustering model.

    Builds a linkage matrix with the columns (child a, child b, merge distance,
    number of samples below the merge) from ``model.children_``,
    ``model.distances_`` and ``model.labels_`` and passes it to ``dendrogram``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``children_``, ``distances_`` and ``labels_``.
    **kwargs
        Forwarded unchanged to ``dendrogram``.
    """
    n_leaves = len(model.labels_)
    n_merges = model.children_.shape[0]

    # subtree_sizes[m] is the number of original samples below merge m.
    subtree_sizes = np.zeros(n_merges)
    for merge_idx in range(n_merges):
        # A child id below n_leaves is an original sample (size 1); otherwise
        # it refers to the earlier merge number `child - n_leaves`.
        subtree_sizes[merge_idx] = sum(
            1 if child < n_leaves else subtree_sizes[child - n_leaves]
            for child in model.children_[merge_idx]
        )

    linkage_matrix = np.column_stack(
        (model.children_, model.distances_, subtree_sizes)
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [None]:
# Fit one full merge tree per linkage criterion. distance_threshold=0 together
# with n_clusters=None makes the model compute the complete tree and its merge
# distances, which the dendrogram needs.
linkages = ["single", "complete", "average", "ward"]
hierarchical_results = []
for linkage in linkages:
    full_tree = AgglomerativeClustering(n_clusters=None, distance_threshold=0, linkage=linkage)
    hierarchical_results.append(full_tree.fit(scaled_dataset))

In [None]:
# One dendrogram per linkage on a 2x2 grid, truncated with truncate_mode="level", p=3.
fig, ax = plt.subplots(2, 2, figsize=(15, 15))
for panel, linkage, model in zip(ax.flat, linkages, hierarchical_results):
    panel.set_title(linkage)
    plot_dendrogram(model, truncate_mode="level", p=3, ax=panel, orientation="right")
    panel.set_ylabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [None]:
def calculate_sse(labels, data=None):
    """Return the total within-cluster sum of squared errors (SSE).

    For every distinct label, the centroid of the labelled points is computed
    and the squared Euclidean distances of those points to it are summed.

    Parameters
    ----------
    labels : array-like of shape (n_samples,)
        Cluster label of every row of ``data``. Any set of label values is
        accepted; they do not have to be exactly 0..k-1.
    data : array-like of shape (n_samples, n_features), optional
        Points to evaluate. Defaults to the notebook-level ``scaled_dataset``.

    Returns
    -------
    float
        Sum over all clusters of the squared distances to the cluster centroid.
    """
    if data is None:
        data = scaled_dataset
    data = np.asarray(data)
    labels = np.asarray(labels)

    global_sse = 0.0
    # Iterate over the labels that actually occur, so gaps or negative labels
    # cannot produce empty clusters.
    for label in np.unique(labels):
        cluster_points = data[labels == label]
        centroid = cluster_points.mean(axis=0)
        global_sse += float(np.sum((cluster_points - centroid) ** 2))
    return global_sse

In [None]:
# Evaluate Ward clustering for k = 2..max_k with the same three measures used
# for K-Means. The list names are read by the plotting cell that follows.
sse_list = []
silhoutte_list = []
davies_bouldin_list = []

max_k = 20
for k in tqdm(range(2, max_k + 1)):
    ward_labels = AgglomerativeClustering(n_clusters=k, linkage="ward").fit_predict(scaled_dataset)

    sse_list.append(calculate_sse(ward_labels))
    silhoutte_list.append(silhouette_score(scaled_dataset, ward_labels, sample_size=10000))
    davies_bouldin_list.append(davies_bouldin_score(scaled_dataset, ward_labels))


In [None]:
# One panel per quality measure, all plotted against k (which starts at 2).
fig, (sse_ax, silhouette_ax, davies_bouldin_ax) = plt.subplots(3, 1, figsize=(20, 15))

sse_k_values = range(2, len(sse_list) + 2)
sse_ax.plot(sse_k_values, sse_list)
sse_ax.set_ylabel('SSE', fontsize=22)
# Only the first panel gets one tick per k.
sse_ax.set_xticks(sse_k_values)

silhouette_ax.plot(range(2, len(silhoutte_list) + 2), silhoutte_list)
silhouette_ax.set_ylabel('Silhouette Score', fontsize=22)

davies_bouldin_ax.plot(range(2, len(davies_bouldin_list) + 2), davies_bouldin_list)
davies_bouldin_ax.set_ylabel('Davies Bouldin Score', fontsize=22)
# The bottom panel is also the current axes, so this is the only x label needed.
davies_bouldin_ax.set_xlabel('K', fontsize=22)

plt.show()

In [None]:
# Final Ward model. `n_clusters` is rebound here and reused by the cells below.
n_clusters = 8
ward_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
ward_clustering.fit(scaled_dataset)

In [None]:
# Scaled points of every Ward cluster.
cluster_points = [
    scaled_dataset[ward_clustering.labels_ == cluster_id]
    for cluster_id in range(n_clusters)
]

In [None]:
# Quality scores of the final Ward clustering, computed on all points.
print('Silhouette %s' % silhouette_score(scaled_dataset, ward_clustering.labels_))
print('Davies-Bouldin %s' % davies_bouldin_score(scaled_dataset, ward_clustering.labels_))

In [None]:
# One-hot encode the Ward cluster labels.
onehot = np.zeros((len(ward_clustering.labels_), n_clusters))
onehot[np.arange(len(ward_clustering.labels_)), ward_clustering.labels_] = 1

# Correlate every attribute with every cluster-membership indicator.
n_features = scaled_dataset.shape[1]
onehot_corr = np.corrcoef(scaled_dataset, onehot, rowvar=False)

cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Only the attribute-vs-cluster block of the full correlation matrix is shown.
sns.heatmap(onehot_corr[:n_features, n_features:], cmap=cmap)

# Label the rows with the attribute names.
plt.yticks(np.arange(n_features) + 0.5, numeric_dataset.columns, rotation=0, fontsize=12)
# Render the figure explicitly instead of ending the cell with a stray `[]`.
plt.show()

In [None]:
# Downsample to at most 1000 points: the pairwise matrix grows quadratically.
# The size is capped by the number of rows (sampling without replacement needs
# at least that many) and the generator is seeded for a reproducible figure.
sample_size = min(1000, scaled_dataset.shape[0])
samples = np.random.default_rng(42).choice(scaled_dataset.shape[0], sample_size, replace=False)
downsampled_dataset = scaled_dataset[samples]
downsampled_labels = ward_clustering.labels_[samples]

# Sort by cluster label so that members of the same cluster are adjacent.
sorted_indexes = np.argsort(downsampled_labels)
downsampled_dataset = downsampled_dataset[sorted_indexes]
downsampled_labels = downsampled_labels[sorted_indexes]

# Pairwise Euclidean distances (Minkowski with p=2).
pdist_matrix = squareform(pdist(downsampled_dataset, metric='minkowski', p=2))
sns.heatmap(pdist_matrix, cmap='crest')
plt.xticks([])
plt.yticks([])
# Without this the cell would end by echoing the return value of plt.yticks.
plt.show()

In [None]:
# 2-D t-SNE embedding of the scaled dataset used for hierarchical clustering.
tsne = TSNE(n_components=2, n_jobs=-1)
tsne_dataset = tsne.fit_transform(scaled_dataset)

In [None]:
# One colour per Ward cluster; the list must hold at least n_clusters entries.
colors = ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'cyan', 'brown']
for cluster_id in range(n_clusters):
    in_cluster = ward_clustering.labels_ == cluster_id
    tsne_x = tsne_dataset[:, 0][in_cluster]
    tsne_y = tsne_dataset[:, 1][in_cluster]
    plt.scatter(tsne_x, tsne_y, s=3, c=colors[cluster_id])
plt.show()

In [None]:
# Fit a UMAP reducer (default parameters) on the scaled dataset.
umap_reducer = UMAP().fit(scaled_dataset)


In [None]:
# UMAP projection of the clustered points, coloured by Ward cluster.
umap_dataset = umap_reducer.transform(scaled_dataset)
for cluster_id in range(n_clusters):
    in_cluster = ward_clustering.labels_ == cluster_id
    umap_x = umap_dataset[:, 0][in_cluster]
    umap_y = umap_dataset[:, 1][in_cluster]
    plt.scatter(umap_x, umap_y, s=3, c=colors[cluster_id])
plt.show()

In [None]:
# males_ratio against avg_age_participants, coloured by Ward cluster.
for cluster_id in range(n_clusters):
    in_cluster = ward_clustering.labels_ == cluster_id
    males_ratio = numeric_dataset["males_ratio"][in_cluster]
    avg_age = numeric_dataset["avg_age_participants"][in_cluster]
    plt.scatter(males_ratio, avg_age, c=colors[cluster_id], s=10)
plt.show()

In [None]:
# Rows: Ward cluster id. Columns: whether republicans_ratio is above 0.5.
# The labels are a plain array in the row order of numeric_dataset, the frame
# the clustered values were taken from.
party_xt_pct = pd.crosstab(ward_clustering.labels_, numeric_dataset['republicans_ratio'] > 0.5)
party_xt_pct

In [None]:
# Grouped bars: for every cluster, the number of rows below/above a 0.5
# republicans_ratio. The bar height is a row count, hence the y label.
party_xt_pct.plot(kind='bar', stacked=False,
                   title='Party per cluster')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.show()

In [None]:
# adults_ratio against killed_ratio, one scatter per Ward cluster.
# Ward labels run from 0 to n_clusters-1 (there is no -1 noise label as in
# DBSCAN), so the mask compares against i itself; comparing against i-1 would
# draw an empty group first and never draw the last cluster.
for i in range(n_clusters):
    in_cluster = ward_clustering.labels_ == i
    plt.scatter(numeric_dataset["adults_ratio"][in_cluster], numeric_dataset["killed_ratio"][in_cluster], s=10)
plt.xlabel('adults_ratio')
plt.ylabel('killed_ratio')
plt.show()

In [None]:
# Geographic position of the rows, one scatter per Ward cluster.
# Ward labels run from 0 to n_clusters-1 (there is no -1 noise label as in
# DBSCAN), so the mask compares against i itself; comparing against i-1 would
# draw an empty group first and never draw the last cluster.
for i in range(n_clusters):
    in_cluster = ward_clustering.labels_ == i
    plt.scatter(numeric_dataset["longitude"][in_cluster], numeric_dataset["latitude"][in_cluster], s=10)

# Hand-picked window around Florida.
plt.xlim(-88, -78)
plt.ylim(26, 36)
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.show()