### 📦 Imports & Setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import matplotlib.patches as mpatches
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
from matplotlib.gridspec import GridSpec

### 📁 Load Dataset

In [None]:
# Load the dataset from a path relative to the working directory. Later cells
# slice this frame's columns by position (iloc) and read its "track_genre"
# and "mode" columns by name.
df = pd.read_csv("Data for Modeling/MPBROCKMETAL_KGsubset.csv")

### 🌳 Hierarchical Clustering Function

In [None]:
def hierarchical_clustering(df, iloc_lower, iloc_upper, num_clusters=106):
    """Cluster rows with Ward linkage and draw a correlation clustermap.

    The columns ``df.iloc[:, iloc_lower:iloc_upper]`` are copied, min-max
    scaled, and clustered hierarchically on pairwise Euclidean distances.
    The tree is cut with ``criterion='maxclust'`` using ``num_clusters``.

    Args:
        df: Source DataFrame; it is not modified (a copy of the slice is used).
        iloc_lower: First column position of the feature slice (inclusive).
        iloc_upper: End column position of the feature slice (exclusive).
        num_clusters: Threshold passed to ``fcluster`` for the 'maxclust' cut.

    Returns:
        A copy of the selected (unscaled) columns with an added ``'cluster'``
        column holding the flat cluster id of each row.

    Side effects:
        Draws a seaborn clustermap of the correlation matrix of the returned
        frame. The ``'cluster'`` label column is already present at that
        point, so it takes part in the correlation matrix as well.
    """
    selected = df.iloc[:, iloc_lower:iloc_upper].copy()

    # Scale every column to [0, 1] so no single feature dominates distances.
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(selected)

    pairwise = pdist(scaled_values, metric='euclidean')
    tree = linkage(pairwise, method='ward')
    labels = fcluster(tree, num_clusters, criterion='maxclust')

    selected['cluster'] = labels

    correlations = selected.corr()
    sns.clustermap(correlations, cmap='coolwarm', annot=False)
    return selected

# Cluster on the columns at positions 6..19 (iloc 6:20, end exclusive) with
# the default num_clusters, then print the features plus their cluster ids.
result = hierarchical_clustering(df,6,20 )
print(result)

### 🔬 PCA + Clustering Visualization

In [None]:
def plot_pca_clusters_with_legend(df, iloc_lower, iloc_upper, num_clusters=106):
    """Project the selected features onto two principal components and plot them.

    The columns ``df.iloc[:, iloc_lower:iloc_upper]`` are min-max scaled, then
    (a) clustered with Ward linkage on Euclidean distances and cut with
    ``criterion="maxclust"``, and (b) reduced to two principal components.
    The projection is shown as a scatter plot coloured by cluster id.

    Args:
        df: Source DataFrame; it is not modified.
        iloc_lower: First column position of the feature slice (inclusive).
        iloc_upper: End column position of the feature slice (exclusive).
        num_clusters: Threshold passed to ``fcluster`` for the 'maxclust' cut.

    Returns:
        DataFrame with columns ``"PC1"``, ``"PC2"`` and ``"cluster"``, one row
        per input row. It carries the same index as ``df`` so that it can be
        joined back to the source rows by index even when ``df`` does not use
        the default 0..n-1 index.

    Side effects:
        Creates and shows a matplotlib figure.
    """
    features = df.iloc[:, iloc_lower:iloc_upper]
    features_scaled = MinMaxScaler().fit_transform(features)

    # Euclidean is pdist's default metric; spelled out here for clarity.
    linked = linkage(pdist(features_scaled, metric="euclidean"), method="ward")
    cluster_ids = fcluster(linked, num_clusters, criterion="maxclust")

    pca_transformed = PCA(n_components=2).fit_transform(features_scaled)
    # Keep the caller's row index: the scaler and PCA return bare arrays, and
    # a fresh RangeIndex would misalign later index-based merges.
    pca_df = pd.DataFrame(
        pca_transformed,
        columns=["PC1", "PC2"],
        index=features.index,
    )
    pca_df["cluster"] = cluster_ids

    plt.figure(figsize=(12, 6))
    scatter = plt.scatter(
        pca_df["PC1"],
        pca_df["PC2"],
        c=pca_df["cluster"],
        cmap="tab20",
        alpha=0.6,
    )
    # NOTE(review): legend_elements() is called with its defaults, which
    # choose the legend entries automatically; with many clusters the legend
    # presumably shows a subset of ids rather than one entry per cluster.
    plt.legend(*scatter.legend_elements(), loc="upper right", title="Cluster")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title("PCA Cluster Projection")
    plt.show()
    return pca_df

# Project the same column slice (iloc 6:20) used for clustering above.
pca_df =plot_pca_clusters_with_legend(df,6,20)
# Bare expression: in a notebook cell this displays the frame as output.
pca_df

### 🔗 Merge Clustering and PCA Results

In [None]:
# Join the clustered features with their PCA coordinates on the row index.
# Both inputs carry a "cluster" column, so the merge yields "cluster_x" and
# "cluster_y"; the copy coming from the PCA frame ("cluster_y") is dropped.
# reset_index() then turns the index into a regular column.
df_cluster_PCA = (
    pd.merge(result, pca_df, left_index=True, right_index=True)
    .drop(columns=["cluster_y"])
    .reset_index()
)

# Plain Python lists of the two principal-component coordinates.
pc1 = df_cluster_PCA["PC1"].tolist()
pc2 = df_cluster_PCA["PC2"].tolist()

def get_pca_distance_with_x_sign(x, y, boundary_x=-0.25, boundary_y=0):
    """Return the Euclidean distance to a boundary point, signed by x side.

    Args:
        x: First coordinate of the point (scalar).
        y: Second coordinate of the point (scalar).
        boundary_x: First coordinate of the boundary point.
        boundary_y: Second coordinate of the boundary point.

    Returns:
        The distance from ``(x, y)`` to ``(boundary_x, boundary_y)``. It is
        returned unchanged when ``x`` is strictly greater than ``boundary_x``
        and negated otherwise (including when ``x == boundary_x``).
    """
    offset_x = x - boundary_x
    offset_y = y - boundary_y
    magnitude = np.sqrt(offset_x**2 + offset_y**2)

    if x > boundary_x:
        return magnitude
    return -magnitude

# Signed distance of every (PC1, PC2) point from the default boundary point
# of get_pca_distance_with_x_sign, stored as a new column.
pc_distances = [
    get_pca_distance_with_x_sign(pc1_value, pc2_value)
    for pc1_value, pc2_value in zip(pc1, pc2)
]
df_cluster_PCA["PCA_EuclidianDist(Xsign)"] = pc_distances

def add_first_boundary_group(df, distance_column="PCA_EuclidianDist(Xsign)"):
    """Label each row by the sign of its signed-distance value.

    Adds (or overwrites) the column ``"1th_Boundary"`` on ``df`` in place:
    ``-1`` where the value in ``distance_column`` is below zero, ``1``
    otherwise.

    Args:
        df: DataFrame to annotate; it is mutated.
        distance_column: Name of the column holding the signed distances.

    Returns:
        The same ``df`` object, now carrying the ``"1th_Boundary"`` column.
    """
    def _boundary_side(signed_distance):
        # Strictly negative values fall on the -1 side; everything else is 1.
        if signed_distance < 0:
            return -1
        return 1

    df["1th_Boundary"] = df[distance_column].apply(_boundary_side)
    return df

# Tag every row with its boundary side (this also adds "1th_Boundary" to
# df_cluster_PCA itself), then attach the genre label by index.
df_cluster_PCA_gp = pd.merge(
    df["track_genre"],
    add_first_boundary_group(df_cluster_PCA),
    left_index=True,
    right_index=True,
)

# Split by boundary side: group A is the -1 side, group B the +1 side.
groupA = df_cluster_PCA_gp[df_cluster_PCA_gp["1th_Boundary"].eq(-1)]
groupB = df_cluster_PCA_gp[df_cluster_PCA_gp["1th_Boundary"].eq(1)]

### 📊 Plot Feature Comparisons (Group A vs B)

In [None]:
def plot_feature_comparisons(groupA, groupB, comparisons, groupA_label='Group A', groupB_label='Group B'):
    """Draw one scatter plot per feature pair, overlaying two groups.

    Args:
        groupA: First group; indexed by feature name to get x and y values.
        groupB: Second group, indexed the same way.
        comparisons: Iterable of ``(x_feature, y_feature)`` name pairs; one
            figure is created and shown for each pair.
        groupA_label: Legend label for ``groupA``.
        groupB_label: Legend label for ``groupB``.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    # Group A is drawn first, group B on top of it.
    labelled_groups = ((groupA, groupA_label), (groupB, groupB_label))

    for x_name, y_name in comparisons:
        plt.figure(figsize=(6, 4))
        for group, label in labelled_groups:
            plt.scatter(group[x_name], group[y_name], alpha=0.6, label=label)
        plt.xlabel(x_name)
        plt.ylabel(y_name)
        plt.title(f"{x_name} vs {y_name}")
        plt.legend()
        plt.show()

# (x, y) feature-name pairs to compare; each pair produces one figure.
# This list is reused for the second pass after "mode" is dropped.
comparisons = [("acousticness", "energy"), ( "energy", "loudness"), ("danceability","valence" )]
plot_feature_comparisons(groupA,groupB,comparisons)

### 🧠 PCA Feature Contribution Analysis

In [None]:
def pca_feature_contributions(df, iloc_lower, iloc_upper, n_components=2, sort_by='PC1'):
    """Return the PCA component weights of each feature as a table.

    The columns ``df.iloc[:, iloc_lower:iloc_upper]`` are min-max scaled and
    a PCA with ``n_components`` components is fitted on them.

    Args:
        df: Source DataFrame; it is not modified.
        iloc_lower: First column position of the feature slice (inclusive).
        iloc_upper: End column position of the feature slice (exclusive).
        n_components: Number of principal components to fit and report.
        sort_by: Name of the component column (``'PC1'``, ``'PC2'``, ...)
            used for ordering the rows.

    Returns:
        DataFrame indexed by feature name with one column per component
        (the transposed ``components_`` of the fitted PCA), sorted by the
        signed value in ``sort_by``, largest first.
    """
    selected = df.iloc[:, iloc_lower:iloc_upper]
    scaled_values = MinMaxScaler().fit_transform(selected)

    model = PCA(n_components=n_components)
    model.fit(scaled_values)

    component_names = [f'PC{number}' for number in range(1, n_components + 1)]
    loadings = pd.DataFrame(
        model.components_.T,
        index=selected.columns,
        columns=component_names,
    )
    return loadings.sort_values(by=sort_by, ascending=False)

# Component weights for the columns at iloc 6:20, sorted by PC1 (the
# function's default sort column), then printed.
pca_f = pca_feature_contributions(df,6,20)
print(pca_f)

### 🧹 Drop Binary Feature "mode"

In [None]:
# Remove the "mode" column from df in place. The cells below use iloc 6:19
# instead of 6:20, presumably to account for the removed column.
# NOTE(review): running this cell a second time raises KeyError, because
# "mode" is no longer present in df.
del df["mode"]

### 🌳 Re-run Clustering Without "mode"

In [None]:
# Repeat the clustering on df after "mode" was deleted; the slice end is
# one lower (iloc 6:19) than in the first run.
result2 = hierarchical_clustering(df,6,19 )
print(result2)

### 🔬 Re-plot PCA After Removing "mode"

In [None]:
# Repeat the PCA projection on the same reduced slice (iloc 6:19).
pca_df2 =plot_pca_clusters_with_legend(df,6,19)
# Bare expression: in a notebook cell this displays the frame as output.
pca_df2

### 🔗 Merge and Analyze PCA Groups Again

In [None]:
# Second pass of the merge / boundary / grouping steps, on the results that
# were computed after "mode" was removed.

# Index join of clustered features and PCA coordinates; the duplicate
# cluster column from the PCA frame ("cluster_y") is dropped and the index
# is turned into a regular column.
df_cluster_PCA2 = (
    pd.merge(result2, pca_df2, left_index=True, right_index=True)
    .drop(columns=["cluster_y"])
    .reset_index()
)

pc1_2 = df_cluster_PCA2["PC1"].tolist()
pc2_2 = df_cluster_PCA2["PC2"].tolist()

# Signed distance of each (PC1, PC2) point from the default boundary point.
pc_distances_2 = [
    get_pca_distance_with_x_sign(pc1_value, pc2_value)
    for pc1_value, pc2_value in zip(pc1_2, pc2_2)
]
df_cluster_PCA2["PCA_EuclidianDist(Xsign)"] = pc_distances_2

# Tag rows with their boundary side (also mutates df_cluster_PCA2), then
# attach the genre label by index.
df_cluster_PCA_gp2 = pd.merge(
    df["track_genre"],
    add_first_boundary_group(df_cluster_PCA2),
    left_index=True,
    right_index=True,
)

# Group A is the -1 side of the boundary, group B the +1 side.
groupA2 = df_cluster_PCA_gp2[df_cluster_PCA_gp2["1th_Boundary"].eq(-1)]
groupB2 = df_cluster_PCA_gp2[df_cluster_PCA_gp2["1th_Boundary"].eq(1)]

plot_feature_comparisons(groupA2, groupB2, comparisons)

### 📈 Final Feature Contributions (Without "mode")

In [None]:
# Component weights for the reduced slice (iloc 6:19, after "mode" was
# removed), sorted by PC1; the table is stored in pca_f2.
pca_f2 = pca_feature_contributions(df,6,19)