In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from scipy.spatial.distance import pdist, squareform, cdist
import seaborn as sns

# 1. Dataset Preparation

## 1.1 Loading Dataset

In [None]:
# Load the dataset from CSV, report its dimensions and preview the first rows.
csv_path = "data/final_dataset.csv"
dataset = pd.read_csv(csv_path)
print("Shape of dataset:", dataset.shape)
dataset.head()

In [None]:
# True if at least one cell of the dataset is missing.
has_missing_values = dataset.isna().to_numpy().any()
print("There are NaN values:", has_missing_values)

In [None]:
# List every column name of the loaded dataset.
print(dataset.columns)

In [None]:
# Attributes removed before clustering: every column whose name starts with
# 'state_', plus a fixed list of other attributes.
state_columns = [name for name in dataset.columns if name.startswith('state_')]
other_columns = ['min_age_participants', 'max_age_participants', 'teen_ratio', 'totalvotes']
dropped_columns = state_columns + other_columns
print("Attributes to drop:", dropped_columns)

# `columns=` already targets the column axis, so no `axis` argument is needed.
dataset_reduced = dataset.drop(columns=dropped_columns)
print("Shape of dataset:", dataset_reduced.shape)

In [None]:
# Keep only the numeric attributes, which are the ones fed to the scaler and
# to K-Means further down.
# Uses the public `select_dtypes` API instead of the private
# `_get_numeric_data()`; boolean columns are included explicitly because the
# private helper treats them as numeric too.
numeric_dataset = dataset_reduced.select_dtypes(include=['number', 'bool'])
print("Shape of numeric_dataset:", numeric_dataset.shape)
numeric_dataset.head()

## 1.2 Outlier detection

In [None]:
# Box plot of the poverty percentage, used to spot outliers visually.
poverty_columns = ['povertyPercentage']
boxplot = dataset_reduced.boxplot(column=poverty_columns)

In [None]:
# Box plots of the participant-related attributes (average age and count).
participant_columns = ['avg_age_participants', 'n_participants']
boxplot = dataset_reduced.boxplot(column=participant_columns)

In [None]:
# Box plots of all ratio attributes, drawn side by side on one axis.
ratio_columns = [
    'adults_ratio',
    'males_ratio',
    'killed_ratio',
    'injured_ratio',
    'arrested_ratio',
    'votes_ratio',
]
boxplot = dataset_reduced.boxplot(column=ratio_columns)

In [None]:
# Box plot of the 'party' attribute.
party_columns = ['party']
boxplot = dataset_reduced.boxplot(column=party_columns)

In [None]:
# 'totalvotes' is one of the attributes removed when `dataset_reduced` was
# built, so asking `dataset_reduced` for it raises a KeyError. Plot it from
# the original `dataset`, which still holds the column.
boxplot = dataset.boxplot(column=['totalvotes'])

In [None]:
# Box plot of the 'population' attribute.
population_columns = ['population']
boxplot = dataset_reduced.boxplot(column=population_columns)

## 1.3 Scaling

In [None]:
# Min-max scale every numeric attribute so that no attribute dominates the
# Euclidean distances used by K-Means. The fitted `scaler` is kept because it
# is needed later to map centroids back to the original units.
feature_matrix = numeric_dataset.to_numpy()
scaler = MinMaxScaler()
scaler.fit(feature_matrix)
scaled_dataset = scaler.transform(feature_matrix)
scaled_dataset.shape

# 2. K Means

## 2.1 Identification of the best value of k

In [None]:
# Fit K-Means for every k in [2, max_k] and record three quality measures:
#   * SSE (inertia)        - lower is better, look for the elbow
#   * silhouette score     - higher is better
#   * Davies-Bouldin score - lower is better
sse_list = []
silhouette_list = []
davies_bouldin_list = []

max_k = 20
# Fixed seed: both the K-Means initialisation and the silhouette subsample are
# random, so without it the curves (and the k read off them) change per run.
seed = 42
for k in tqdm(range(2, max_k + 1)):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=seed)
    kmeans.fit(scaled_dataset)

    sse_list.append(kmeans.inertia_)
    # The silhouette is estimated on a random subsample of 10000 points to
    # keep the pairwise-distance computation cheap.
    silhouette_list.append(
        silhouette_score(scaled_dataset, kmeans.labels_, sample_size=10000, random_state=seed)
    )
    davies_bouldin_list.append(davies_bouldin_score(scaled_dataset, kmeans.labels_))

In [None]:
# One panel per quality measure, each plotted against the k values tried
# (the lists start at k = 2).
fig, ax = plt.subplots(3, 1, figsize=(20, 15))
metric_curves = [
    ('SSE', sse_list),
    ('Silhouette Score', silhouette_list),
    ('Davies Bouldin Score', davies_bouldin_list),
]
for panel, (metric_name, scores) in zip(ax, metric_curves):
    k_values = range(2, len(scores) + 2)
    panel.plot(k_values, scores)
    panel.set_ylabel(metric_name, fontsize=22)
    panel.set_xticks(k_values)

# Labels the x axis of the current axes only.
plt.xlabel('K', fontsize=22)
plt.show()

# NICER PLOTS

## 2.2 Analysis of the centroids and clusters

In [None]:
# Final clustering with the chosen number of clusters.
# `random_state` is fixed so that cluster ids, sizes and centroids stay the
# same across "Restart & Run All"; otherwise every comment that refers to
# "cluster 0", "cluster 1", ... may point at a different cluster on each run.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(scaled_dataset)

In [None]:
# Sanity check: print the shapes of the clustered data, the per-point labels
# and the centroid matrix.
shape_report = (
    ("Shape of scaled_dataset: ", scaled_dataset),
    ("Shape of kmeans.labels_: ", kmeans.labels_),
    ("Shape of kmeans.cluster_centers_: ", kmeans.cluster_centers_),
)
for caption, array in shape_report:
    print(caption, array.shape)

In [None]:
# Undo the min-max scaling on the centroids so they can be read in the
# original units of each attribute, then label the columns.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
centers_df = pd.DataFrame(data=centers, columns=numeric_dataset.columns)
centers_df.head(n=7)

In [None]:
# Split the scaled points by assigned cluster and count the members of each.
cluster_points = [
    scaled_dataset[kmeans.labels_ == cluster_id] for cluster_id in range(n_clusters)
]
cluster_num_points = [len(points) for points in cluster_points]

cluster_num_points, cluster_points[0].shape

In [None]:
# Baseline: treat the whole dataset as a single cluster. With k = 1 every
# initialisation converges to the same centroid (the dataset mean), so a
# single init is enough.
kmeans_tot = KMeans(n_clusters=1, n_init=1)
kmeans_tot.fit(scaled_dataset)
# NOTE: despite the "SSE" naming, all values in this cell are divided by the
# number of points, i.e. they are *mean* squared distances to the centroid.
# That makes clusters of different sizes comparable.
total_SSE = kmeans_tot.inertia_ / len(scaled_dataset)

cluster_SSE = []
for i in range(n_clusters):
    # Squared Euclidean distance of every member to its centroid, computed in
    # one vectorised step instead of a Python loop over the points.
    squared_distances = np.sum((cluster_points[i] - kmeans.cluster_centers_[i]) ** 2, axis=1)
    cluster_SSE.append(squared_distances.sum() / cluster_num_points[i])

print("Cluster SSE: ", cluster_SSE)
print("Total SSE: ", total_SSE)
print("SSE of cluster with min SSE: ", min(cluster_SSE))
print("SSE of cluster with max SSE: ", max(cluster_SSE))
print("Mean of SSE: ", np.mean(cluster_SSE))

In [None]:
# Variance of the squared point-to-centroid distances, for the whole dataset
# and for each cluster. `total_SSE` / `cluster_SSE` hold the corresponding
# mean squared distances, so each value is mean((d^2 - mean(d^2))^2).
cluster_distance_variance = []
dataset_centroid = np.mean(scaled_dataset, axis=0)

###### TODO: let's rethink this ######
total_squared_distances = np.sum((scaled_dataset - dataset_centroid) ** 2, axis=1)
total_distance_variance = np.mean((total_squared_distances - total_SSE) ** 2)
######################################

for i in range(n_clusters):
    squared_distances = np.sum((cluster_points[i] - kmeans.cluster_centers_[i]) ** 2, axis=1)
    variance = np.sum((squared_distances - cluster_SSE[i]) ** 2)
    cluster_distance_variance.append(variance / cluster_num_points[i])

print("Cluster distance variance: ", cluster_distance_variance)
print("Total distance variance: ", total_distance_variance)
# These are the smallest / largest variances over the clusters, not the
# variance of the cluster that has the smallest / largest SSE.
print("Min cluster distance variance: ", min(cluster_distance_variance))
print("Max cluster distance variance: ", max(cluster_distance_variance))
print("Mean of distance variance: ", np.mean(cluster_distance_variance))

Analysis of the distribution of Republicans vs. Democrats across the clusters.

In [None]:
# Contingency table of absolute counts: one row per cluster label, one column
# per value of the 'party' attribute.
party_xt_pct = pd.crosstab(index=kmeans.labels_, columns=numeric_dataset['party'])
party_xt_pct

In [None]:
# Grouped bar chart of the crosstab: one group of bars per cluster, one bar
# per party value.
party_xt_pct.plot(kind='bar', stacked=False,
                  title='Party per cluster')
plt.xlabel('Cluster')
# The bar heights are counts from the crosstab; 'party' is what the legend
# (bar colours) encodes, not the y axis.
plt.ylabel('Number of records')
plt.show()

Construct a distance matrix among cluster centroids.

In [None]:
# Pairwise distances between the centroids (in the scaled feature space),
# expanded from condensed form into a square matrix and shown as an annotated
# heatmap.
centroid_distances = pdist(kmeans.cluster_centers_)
centroid_distance_matrix = squareform(centroid_distances)
sns.heatmap(data=centroid_distance_matrix, annot=True, fmt='.2f', cmap='crest')

Construct a matrix showing the correlation between each attribute and membership in each cluster.

In [None]:
# Transform the K-Means labels into a one-hot encoding:
# onehot[j, c] is 1 when point j belongs to cluster c, 0 otherwise.
onehot = np.zeros((len(kmeans.labels_), n_clusters))
onehot[np.arange(len(kmeans.labels_)), kmeans.labels_] = 1

# Correlation matrix of [features | one-hot columns]; columns are the variables.
onehot_corr = np.corrcoef(scaled_dataset, onehot, rowvar=False)

# Keep only the feature-vs-cluster block (rows: features, columns: clusters).
n_features = scaled_dataset.shape[1]
feature_cluster_corr = onehot_corr[:n_features, n_features:]

cmap = sns.diverging_palette(230, 20, as_cmap=True)

# `center=0` pins the neutral colour of the diverging palette to zero
# correlation, so the two hues really mean "negative" and "positive".
sns.heatmap(feature_cluster_corr, cmap=cmap, center=0)

# Set ticks on y axis with feature names
plt.yticks(np.arange(n_features) + 0.5, numeric_dataset.columns, rotation=0, fontsize=12)
plt.xlabel('Cluster')
plt.show()

Compute the pairwise distance (dissimilarity) matrix of a random sample of the dataset, with the points sorted by cluster label.

In [None]:
# Downsample the dataset to (at most) 1000 points: the pairwise distance
# matrix grows quadratically with the number of points.
# A seeded generator keeps the sample, and every plot derived from it,
# reproducible; `min` guards against datasets smaller than the sample size.
sample_size = min(1000, scaled_dataset.shape[0])
rng = np.random.default_rng(42)
samples = rng.choice(scaled_dataset.shape[0], sample_size, replace=False)
downsampled_dataset = scaled_dataset[samples]
downsampled_labels = kmeans.labels_[samples]

# Sort based on labels, so that members of the same cluster are contiguous and
# well-separated clusters show up as low-distance blocks on the diagonal.
sorted_indexes = np.argsort(downsampled_labels)
downsampled_dataset = downsampled_dataset[sorted_indexes]
downsampled_labels = downsampled_labels[sorted_indexes]

# Compute the pairwise Euclidean distance matrix (Minkowski with p=2)
pdist_matrix = squareform(pdist(downsampled_dataset, metric='euclidean'))
sns.heatmap(pdist_matrix, fmt = '.2f', cmap='crest')
plt.xticks([])
plt.yticks([])
plt.show()

In [None]:
# Display the shape of the centroid matrix of the fitted K-Means model.
kmeans.cluster_centers_.shape

In [None]:
# Project the sampled points together with the centroids onto 2D with t-SNE.
# The centroids are appended as the last `n_clusters` rows so that they are
# embedded in the same map as the points.
tsne = TSNE(n_components=2, n_jobs=-1, random_state=42)
tsne_dataset = np.concatenate((downsampled_dataset, kmeans.cluster_centers_))
# One label per appended centroid. This must be `n_clusters` long: a
# hard-coded 7 makes the label array longer than the data and breaks the
# slicing below, so `plt.scatter` raises on mismatched sizes.
tsne_labels = np.concatenate((downsampled_labels, np.arange(n_clusters)))
tsne_map = tsne.fit_transform(tsne_dataset)

# Shared colour normalisation: without it each scatter rescales its own label
# range, so a centroid could get a different colour from its own points.
color_args = dict(cmap='tab10', vmin=0, vmax=n_clusters - 1)
plt.scatter(tsne_map[:-n_clusters, 0], tsne_map[:-n_clusters, 1],
            c=tsne_labels[:-n_clusters], s=10, **color_args)
plt.scatter(tsne_map[-n_clusters:, 0], tsne_map[-n_clusters:, 1],
            c=tsne_labels[-n_clusters:], s=100, marker='*', edgecolors='black', **color_args)
plt.show()


## 2.3 Distribution of variables: within clusters vs whole dataset

In [None]:
# Print both shapes: the labels are attached to `numeric_dataset` as a new
# column in the next cell, so the number of labels must equal its row count.
print(kmeans.labels_.shape)
print(numeric_dataset.shape)

In [None]:
# Plot distribution of average age for whole dataset and clusters
numeric_dataset_with_clusters = numeric_dataset.copy()
numeric_dataset_with_clusters['cluster'] = kmeans.labels_

sns.displot(numeric_dataset_with_clusters, x="avg_age_participants", kind='kde', hue="cluster")

sns.displot(numeric_dataset_with_clusters, x="males_ratio", kind='kde', hue="cluster")

# Stretch horizontally
plt.gcf().set_size_inches(20, 5)


In [None]:
# Bivariate histogram of poverty percentage vs. number of participants,
# coloured by cluster; the y axis is cut at 15.
sns.displot(
    data=numeric_dataset_with_clusters,
    x="povertyPercentage",
    y='n_participants',
    hue="cluster",
)
plt.ylim(0, 15)

In [None]:
# Bivariate KDE of poverty percentage vs. number of participants, one set of
# density contours per cluster.
sns.displot(numeric_dataset_with_clusters, x="povertyPercentage", y='n_participants', kind='kde', hue="cluster")
# Restrict the y axis (number of participants) to the range 0-8
plt.ylim(0, 8)

In [None]:
# Print the shape of `numeric_dataset` before plotting its coordinates.
print(numeric_dataset.shape)

In [None]:
# Scatter of the records by longitude/latitude, coloured by cluster label,
# with the centroids (in original units) overlaid as red stars.
point_lon = numeric_dataset['longitude']
point_lat = numeric_dataset['latitude']
plt.scatter(point_lon, point_lat, c=kmeans.labels_, s=20)

centroid_lon = centers_df['longitude']
centroid_lat = centers_df['latitude']
plt.scatter(centroid_lon, centroid_lat, marker='*', c='r', s=150)
plt.show()

# Best Clustering Approach