# Example clustering notebook

Note that we cluster on the original (non-dimension-reduced) data and use PCA only for visualization, including to display the resulting clusters in 2D. This is only a good idea if the distances in the original data are meaningful. I don't know whether they are here; this is just a coding example.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import pyreadr

# Read the RDS file; the data frame we want sits under the key None in the result
rds_contents = pyreadr.read_r('./data/presidential_speech.rds')
presidential_speech = rds_contents[None]

# Peek at the top-left corner (first 6 rows, first 6 columns)
preview = presidential_speech.iloc[:6, :6]
print(preview)

In [None]:

# Plot a histogram of every value in the table, pooled across all rows and columns
plt.hist(presidential_speech.to_numpy().flatten())
# Label the figure so it can be understood without reading the code
plt.title('Distribution of All Values in the Presidential Speech Data')
plt.xlabel('Value')
plt.ylabel('Count')
plt.show()

In [None]:

# Reduce the data to its first two principal components for visualization
pca = PCA(n_components=2)
pca_scores = pca.fit_transform(X=presidential_speech)
# Echo the score array (one row per observation, one column per component) as the cell output
pca_scores

In [None]:

# Wrap the two score columns in a DataFrame, keeping the original row labels as the index
pc_scores_df = pd.DataFrame(
    {'PC1': pca_scores[:, 0], 'PC2': pca_scores[:, 1]},
    index=presidential_speech.index,
)

# Scatter the observations in the PC1/PC2 plane
ax = sns.scatterplot(x='PC1', y='PC2', data=pc_scores_df)
ax.set_title('PCA of Presidential Speech Data')
plt.show()

In [None]:

# Cluster with DBSCAN on the original (non-reduced) data.
# Standardizing the features first is kept below as a disabled alternative:
#scaler = StandardScaler()
#scaled_data = scaler.fit_transform(presidential_speech)
dbscan = DBSCAN(eps=10.5, min_samples=3)
speech_matrix = np.array(presidential_speech)
clusters = dbscan.fit_predict(speech_matrix)

# Store each row's cluster label next to its PC scores
pc_scores_df['cluster'] = clusters

# Show the clustering in the PC1/PC2 plane, colored by cluster label
ax = sns.scatterplot(
    data=pc_scores_df, x='PC1', y='PC2',
    hue='cluster', palette='viridis', legend='full',
)
ax.set_title('DBSCAN Clustering of PCA Results')
plt.show()

In [None]:

# Plot clusters on PC1 vs PC2 plot with president names
# Assuming row names are preserved as index in pc_scores_df
plt.figure(figsize=(12, 8))
sns.scatterplot(data=pc_scores_df, x='PC1', y='PC2', hue='cluster', palette='viridis', legend='full')

# Label every point with its index value. Walking the index and the two score
# columns together avoids integer lookups such as pc_scores_df.PC1[i]: on a
# Series with a non-integer index, pandas treats an integer key as a position
# only through a deprecated fallback.
for name, pc1, pc2 in zip(pc_scores_df.index, pc_scores_df['PC1'], pc_scores_df['PC2']):
    plt.text(pc1, pc2, name, fontsize=9)

plt.title('DBSCAN Clustering with President Names')
plt.show()
