In [51]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE
from tabulate import tabulate
from collections import Counter

# Location of the encoded dataset, relative to this notebook.
DATASET_PATH = '../prepare_dataset/data/dataset_new_cat.csv'

# Read the whole CSV into a DataFrame with pandas' default parsing options.
encoded_df = pd.read_csv(DATASET_PATH)

In [52]:
# Work on a copy so the frame loaded from disk stays untouched.
encoded_data = encoded_df.copy()

# Every column except the 'density' target is used as an input feature.
features = []
for column_name in encoded_data.columns:
    if column_name != 'density':
        features.append(column_name)

# X: feature matrix, y: density label of each row.
y = encoded_data['density']
X = encoded_data[features]

In [53]:
def analyze_dataset(X, y):
    """Summarise a labelled dataset as a grid-formatted text table.

    Parameters
    ----------
    X : array-like exposing ``.shape``
        Feature data; one row per sample.
    y : array-like
        Category label of each sample.

    Returns
    -------
    str
        Table (``tabulate`` with ``tablefmt="grid"``) listing the total number
        of samples, the number of distinct categories, the dimensionality of
        ``X`` and the number of samples in each category. Category rows are
        listed in sorted label order.
    """
    n_samples = len(X)

    # One pass yields both the distinct labels (sorted) and their counts, so
    # the per-category rows come out in a stable order instead of the order
    # in which labels happen to appear first in y.
    categories, counts = np.unique(y, return_counts=True)
    n_categories = len(categories)

    # 1-D input is treated as a single feature.
    dimensionality = X.shape[1] if len(X.shape) > 1 else 1

    headers = ["Metric", "Value"]
    table_data = [
        ["Total Samples", n_samples],
        ["Number of Categories", n_categories],
        ["Dimensionality", dimensionality]
    ]

    for cat, count in zip(categories, counts):
        # int() turns the numpy count into a plain Python integer.
        table_data.append([f"Samples in Category {cat}", int(count)])

    table = tabulate(table_data, headers=headers, tablefmt="grid")
    return table

# Build the summary table for the feature matrix and labels, then show it.
dataset_summary = analyze_dataset(X, y)
print(dataset_summary)

+-----------------------+---------+
| Metric                |   Value |
+=======================+=========+
| Total Samples         |    2121 |
+-----------------------+---------+
| Number of Categories  |       3 |
+-----------------------+---------+
| Dimensionality        |      12 |
+-----------------------+---------+
| Samples in Category 0 |     821 |
+-----------------------+---------+
| Samples in Category 2 |     563 |
+-----------------------+---------+
| Samples in Category 1 |     737 |
+-----------------------+---------+


In [32]:
# Apply PCA: project the feature matrix onto its first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Create a DataFrame with PCA results
pca_df = pd.DataFrame(
    X_pca,
    columns=['PC1', 'PC2']
)

# Add metadata for visualization.
# 'density' holds integer codes (the crosstab printed below has columns 0, 1, 2).
# They are rendered as strings so plotly colours them as discrete categories
# rather than on a continuous scale; .to_numpy() assigns by position, so the
# result does not depend on the two frames sharing an index.
pca_df['Density'] = encoded_data['density'].astype(str).to_numpy()

# Discrete palette keyed by the density codes as they appear in pca_df['Density'].
# NOTE(review): the palette used to be keyed by 'Low'/'Medium'/'High', which never
# matched the integer codes. Which code corresponds to which level is not visible
# in this notebook -- confirm against the encoding step before relabelling.
color_scale = {
    '0': 'orange',
    '1': 'green',
    '2': 'blue',
}

# Create interactive 2D scatter plot
fig = px.scatter(
    pca_df,
    x='PC1',
    y='PC2',
    color='Density',
    color_discrete_map=color_scale,
    # Fix the legend order instead of relying on order of first appearance.
    category_orders={'Density': sorted(color_scale)},
    title='2D PCA Visualization (Density Categories Based on Standard Deviations)',
    labels={
        'PC1': f'First Principal Component (Variance Explained: {pca.explained_variance_ratio_[0]:.2%})',
        'PC2': f'Second Principal Component (Variance Explained: {pca.explained_variance_ratio_[1]:.2%})'
    }
)

# Update layout
fig.update_layout(
    width=800,
    height=800,
    showlegend=True,
    plot_bgcolor='white',
    paper_bgcolor='white'
)

# Update axes
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

# Show the plot
fig.show()

# Print density distribution: each row (campus) is normalised to sum to 1
print("\nDensity distribution by campus:")
density_dist = pd.crosstab(encoded_data['campus'], encoded_data['density'], normalize='index')
print(density_dist)



Density distribution by campus:
density         0         1         2
campus                               
0        0.396277  0.417553  0.186170
1        0.321256  0.500000  0.178744
2        0.375000  0.331250  0.293750
3        0.385390  0.307305  0.307305
4        0.397222  0.272222  0.330556
5        0.442029  0.241546  0.316425


In [33]:
# Apply t-SNE
# The iteration count is left at scikit-learn's default (1000). The keyword for
# it was renamed from n_iter to max_iter in newer releases, so omitting it keeps
# this cell working across versions without changing the result.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)

# Create a DataFrame with the t-SNE embedding. The figure below must plot these
# coordinates; it previously re-plotted the PCA frame under a t-SNE title.
tsne_df = pd.DataFrame(
    X_tsne,
    columns=['TSNE1', 'TSNE2']
)

# Density codes are rendered as strings so plotly treats them as discrete
# categories; .to_numpy() assigns by position rather than by index label.
tsne_df['Density'] = encoded_data['density'].astype(str).to_numpy()

# Create interactive 2D scatter plot
fig = px.scatter(
    tsne_df,
    x='TSNE1',
    y='TSNE2',
    color='Density',
    # Entries of color_scale apply to whichever Density labels they match;
    # unmatched labels fall back to plotly's default discrete colours.
    color_discrete_map=color_scale,
    category_orders={'Density': sorted(tsne_df['Density'].unique())},
    title='t-SNE Visualization (Density Categories Based on Standard Deviations)',
    labels={
        'TSNE1': 'First t-SNE Component',
        'TSNE2': 'Second t-SNE Component'
    }
)

# Update layout
fig.update_layout(
    width=800,
    height=800,
    showlegend=True,
    plot_bgcolor='white',
    paper_bgcolor='white'
)

# Update axes
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

# Show the plot
fig.show()

# Print density distribution: each row (campus) is normalised to sum to 1
print("\nDensity distribution by campus:")
density_dist = pd.crosstab(encoded_data['campus'], encoded_data['density'], normalize='index')
print(density_dist)


Density distribution by campus:
density         0         1         2
campus                               
0        0.396277  0.417553  0.186170
1        0.321256  0.500000  0.178744
2        0.375000  0.331250  0.293750
3        0.385390  0.307305  0.307305
4        0.397222  0.272222  0.330556
5        0.442029  0.241546  0.316425

Statistics by campus:


KeyError: 'normalized_count'

In [34]:
from sklearn.metrics import pairwise_distances
import numpy as np

# Distance between every pair of rows of the t-SNE embedding
# (pairwise_distances with its default metric).
distances = pairwise_distances(X_tsne)

# The density label of each row is used as its cluster assignment.
clusters = encoded_data['density'].values

cluster_intra_distances = {}   # label -> mean distance between its members
all_intra_distances = []       # every within-cluster pair distance, pooled

for cluster in np.unique(clusters):
    member_rows = np.flatnonzero(clusters == cluster)
    n_members = member_rows.size
    if n_members <= 1:
        # A lone point has no within-cluster pairs to average.
        continue

    # Square block of distances restricted to this cluster's members.
    within_block = distances[np.ix_(member_rows, member_rows)]

    # Strict upper triangle: each unordered pair once, diagonal excluded.
    upper_rows, upper_cols = np.triu_indices(n_members, k=1)
    pair_values = within_block[upper_rows, upper_cols]

    cluster_intra_distances[cluster] = np.mean(pair_values)
    all_intra_distances.extend(pair_values)

# Mean over the pooled pairs, so larger clusters contribute more pairs.
overall_mean_intra_cluster_distance = np.mean(all_intra_distances)

for cluster, intra_distance in cluster_intra_distances.items():
    print(f"Intra-Cluster Distance for Cluster {cluster}: {intra_distance:.3f}")
print(f"Overall Mean Intra-Cluster Distance: {overall_mean_intra_cluster_distance:.3f}")


Intra-Cluster Distance for Cluster 0: 54.649
Intra-Cluster Distance for Cluster 1: 55.066
Intra-Cluster Distance for Cluster 2: 54.552
Overall Mean Intra-Cluster Distance: 54.777


In [35]:
cluster_inter_distances = {}   # (label_a, label_b) -> mean cross-cluster distance
all_inter_distances = []       # every cross-cluster pair distance, pooled

# np.unique returns the labels sorted, so pairing each label with the ones that
# follow it visits exactly the pairs with cluster_a < cluster_b.
unique_labels = np.unique(clusters)
for position, cluster_a in enumerate(unique_labels):
    indices_a = np.flatnonzero(clusters == cluster_a)
    for cluster_b in unique_labels[position + 1:]:
        indices_b = np.flatnonzero(clusters == cluster_b)

        # Rectangular block: members of cluster_a against members of cluster_b.
        cross_block = distances[np.ix_(indices_a, indices_b)]

        cluster_inter_distances[(cluster_a, cluster_b)] = np.mean(cross_block)
        all_inter_distances.extend(cross_block.ravel())

# Mean over the pooled cross-cluster pairs.
overall_mean_inter_cluster_distance = np.mean(all_inter_distances)

for (cluster_a, cluster_b), inter_distance in cluster_inter_distances.items():
    print(f"Inter-Cluster Distance between Cluster {cluster_a} and {cluster_b}: {inter_distance:.3f}")
print(f"Overall Mean Inter-Cluster Distance: {overall_mean_inter_cluster_distance:.3f}")


Inter-Cluster Distance between Cluster 0 and 1: 54.892
Inter-Cluster Distance between Cluster 0 and 2: 54.836
Inter-Cluster Distance between Cluster 1 and 2: 54.914
Overall Mean Inter-Cluster Distance: 54.881


In [36]:
from sklearn.metrics import pairwise_distances
import numpy as np

# Pairwise distances in the t-SNE embedding, turned into similarities in (0, 1]:
# a distance of 0 maps to 1 and larger distances approach 0.
distances = pairwise_distances(X_tsne)
similarities = 1 / (1 + distances)
clusters = encoded_data['density'].values

# Row positions of each class, computed once and reused by both loops.
unique_clusters = np.unique(clusters)
members_by_cluster = {
    cluster: np.flatnonzero(clusters == cluster) for cluster in unique_clusters
}

# Results are keyed by label so every value stays attached to the class it was
# computed for. Re-pairing a plain list with np.unique(clusters) afterwards
# mislabels the output as soon as a class with fewer than two members is skipped.
intra_similarity_by_cluster = {}
for cluster, cluster_indices in members_by_cluster.items():
    if len(cluster_indices) > 1:
        cluster_similarities = similarities[np.ix_(cluster_indices, cluster_indices)]
        # Strict upper triangle: each unordered pair once, self-similarity excluded.
        pair_values = cluster_similarities[np.triu_indices(len(cluster_indices), k=1)]
        intra_similarity_by_cluster[cluster] = np.mean(pair_values)

# np.unique returns sorted labels, so pairing each label with the ones after it
# visits exactly the pairs with cluster_a < cluster_b.
inter_similarity_by_pair = {}
for position, cluster_a in enumerate(unique_clusters):
    for cluster_b in unique_clusters[position + 1:]:
        cross_block = similarities[
            np.ix_(members_by_cluster[cluster_a], members_by_cluster[cluster_b])
        ]
        inter_similarity_by_pair[(cluster_a, cluster_b)] = np.mean(cross_block)

# Plain lists are kept under their original names for any later use.
intra_class_similarities = list(intra_similarity_by_cluster.values())
inter_class_similarities = list(inter_similarity_by_pair.values())

# Unweighted means of the per-class / per-pair averages.
overall_intra_class_similarity = np.mean(intra_class_similarities)
overall_inter_class_similarity = np.mean(inter_class_similarities)

for cluster, similarity in intra_similarity_by_cluster.items():
    print(f"Intra-Class Similarity for Cluster {cluster}: {similarity:.3f}")
print(f"Overall Intra-Class Similarity: {overall_intra_class_similarity:.3f}")

for (cluster_a, cluster_b), similarity in inter_similarity_by_pair.items():
    print(f"Inter-Class Similarity between Cluster {cluster_a} and {cluster_b}: {similarity:.3f}")
print(f"Overall Inter-Class Similarity: {overall_inter_class_similarity:.3f}")

Intra-Class Similarity for Cluster 0: 0.032
Intra-Class Similarity for Cluster 1: 0.031
Intra-Class Similarity for Cluster 2: 0.033
Overall Intra-Class Similarity: 0.032
Inter-Class Similarity between Cluster 0 and 1: 0.029
Inter-Class Similarity between Cluster 0 and 2: 0.028
Inter-Class Similarity between Cluster 1 and 2: 0.030
Overall Inter-Class Similarity: 0.029


In [42]:
# Rule of thumb: an average silhouette width above 0.7 is considered "strong",
# above 0.5 "reasonable", and above 0.25 "weak". As the dimensionality of the data
# increases, such high values become hard to reach because of the curse of
# dimensionality: pairwise distances become increasingly similar.
from sklearn.metrics import silhouette_score

# Silhouette score of the t-SNE embedding, grouping rows by their density label.
silhouette_labels = encoded_data['density']
score = silhouette_score(X_tsne, silhouette_labels)
print(f"Silhouette Score: {score:.3f}")

Silhouette Score: -0.015


In [38]:
from sklearn.metrics import calinski_harabasz_score

# Calinski-Harabasz score of the t-SNE embedding, grouping rows by their density label.
calinski_labels = encoded_data['density']
score = calinski_harabasz_score(X_tsne, calinski_labels)
print(f"Calinski-Harabasz Score: {score:.3f}")

Calinski-Harabasz Score: 2.639
