In [1]:
# ============================
# 1. Environment Setup
# ============================

# Import Libraries
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

# Spark Session Initialization
spark = SparkSession.builder \
    .appName("Cluster Analysis & Visualization - MSD") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .getOrCreate()

print("Spark session successfully initialized.")

# Verify Data Paths
data_paths = {
    "PCA_Clusters": "/kaggle/input/msd-parquet-by-artist/clustering_results/clusters_pca",
    "UMAP_2D_Clusters": "/kaggle/input/msd-parquet-by-artist/clustering_results/clusters_umap2d",
    "UMAP_3D_Clusters": "/kaggle/input/msd-parquet-by-artist/clustering_results/clusters_umap3d",
    "PCA_Embeddings": "/kaggle/input/msd-parquet-by-artist/dimreduce_parquet/pca_parquet",
    "UMAP_2D_Embeddings": "/kaggle/input/msd-parquet-by-artist/dimreduce_parquet/umap_2d.parquet",
    "UMAP_3D_Embeddings": "/kaggle/input/msd-parquet-by-artist/dimreduce_parquet/umap_3d.parquet",
    "Full_Dataset": "/kaggle/input/msd-parquet-by-artist/msd_parquet_by_artist",
    "Sample_CSV": "/kaggle/input/msd-parquet-by-artist/msd_sample.csv",
    "Schema_Log": "/kaggle/input/msd-parquet-by-artist/schema_log.txt"
}

# Check data paths
for key, path in data_paths.items():
    if os.path.exists(path):
        print(f"{key}: ✅ Path Verified")
    else:
        print(f"{key}: ❌ Path Not Found")

print("\nEnvironment setup complete.")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/12 14:42:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session successfully initialized.
PCA_Clusters: ✅ Path Verified
UMAP_2D_Clusters: ✅ Path Verified
UMAP_3D_Clusters: ✅ Path Verified
PCA_Embeddings: ✅ Path Verified
UMAP_2D_Embeddings: ✅ Path Verified
UMAP_3D_Embeddings: ✅ Path Verified
Full_Dataset: ✅ Path Verified
Sample_CSV: ✅ Path Verified
Schema_Log: ✅ Path Verified

Environment setup complete.


In [2]:
# ============================
# 2.1 Data Loading - PCA Clusters and Embeddings
# ============================

# Define paths
pca_clusters_path = data_paths["PCA_Clusters"]
pca_embeddings_path = data_paths["PCA_Embeddings"]

# Load PCA Cluster Labels
print("Loading PCA Cluster Labels...")
pca_clusters_df = spark.read.parquet(pca_clusters_path)
print(f"PCA Clusters - Schema:\n")
pca_clusters_df.printSchema()
print("\nPCA Clusters - Sample Data:")
pca_clusters_df.show(5)

# Load PCA Embeddings
print("\nLoading PCA Embeddings...")
pca_embeddings_df = spark.read.parquet(pca_embeddings_path)
print(f"PCA Embeddings - Schema:\n")
pca_embeddings_df.printSchema()
print("\nPCA Embeddings - Sample Data:")
pca_embeddings_df.show(5)

Loading PCA Cluster Labels...


                                                                                                    

PCA Clusters - Schema:

root
 |-- song_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- pca_features: vector (nullable = true)
 |-- cluster: integer (nullable = true)


PCA Clusters - Sample Data:


                                                                                                    

+------------------+--------------+--------------------+-------+
|           song_id|   artist_name|        pca_features|cluster|
+------------------+--------------+--------------------+-------+
|SOAWNUW12AC46877F8|   edmundo_ros|[-458.60783350067...|      1|
|SOJMRHG12AB018B633|   edmundo_ros|[-401.85952901015...|      4|
|SOMFNPC12A8C13E5FE|       benabar|[-331.02762743953...|      3|
|SOLTTJA12A8C13FC12|       benabar|[-522.84704956715...|      0|
|SOYTPEP12AB0180E7B|the_shangrilas|[-416.46061347865...|      1|
+------------------+--------------+--------------------+-------+
only showing top 5 rows


Loading PCA Embeddings...
PCA Embeddings - Schema:

root
 |-- song_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- pca_features: vector (nullable = true)


PCA Embeddings - Sample Data:
+------------------+------------+--------------------+
|           song_id| artist_name|        pca_features|
+------------------+------------+--------------------+
|SOXFQVY1

In [3]:
# ============================
# 3.1 Data Integrity Check - Row Count and Unique IDs
# ============================

# Row count and unique song_id check
print("PCA Clusters - Row Count:", pca_clusters_df.count())
print("PCA Clusters - Unique song_id Count:", pca_clusters_df.select("song_id").distinct().count())

print("\nPCA Embeddings - Row Count:", pca_embeddings_df.count())
print("PCA Embeddings - Unique song_id Count:", pca_embeddings_df.select("song_id").distinct().count())

# Check for missing song_ids in clusters that exist in embeddings
missing_in_clusters = pca_embeddings_df.select("song_id").subtract(pca_clusters_df.select("song_id"))
missing_count = missing_in_clusters.count()
print(f"\nNumber of song_ids in PCA Embeddings not in PCA Clusters: {missing_count}")

# Check for missing song_ids in embeddings that exist in clusters
missing_in_embeddings = pca_clusters_df.select("song_id").subtract(pca_embeddings_df.select("song_id"))
missing_count_embeddings = missing_in_embeddings.count()
print(f"Number of song_ids in PCA Clusters not in PCA Embeddings: {missing_count_embeddings}")

PCA Clusters - Row Count: 10000


                                                                                                    

PCA Clusters - Unique song_id Count: 10000


                                                                                                    


PCA Embeddings - Row Count: 10000


                                                                                                    

PCA Embeddings - Unique song_id Count: 10000


                                                                                                    


Number of song_ids in PCA Embeddings not in PCA Clusters: 0


                                                                                                    

Number of song_ids in PCA Clusters not in PCA Embeddings: 0


In [4]:
# ============================
# 4.1 Merge PCA Clusters and Embeddings
# ============================

from pyspark.sql.functions import col

# Merge DataFrames on 'song_id'
pca_combined_df = pca_clusters_df.join(pca_embeddings_df, on="song_id", how="inner")

# Verify the merge
print("Merged DataFrame Schema:")
pca_combined_df.printSchema()

# Display sample data
print("\nSample Data from Merged DataFrame:")
pca_combined_df.show(5)

Merged DataFrame Schema:
root
 |-- song_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- pca_features: vector (nullable = true)
 |-- cluster: integer (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- pca_features: vector (nullable = true)


Sample Data from Merged DataFrame:
+------------------+------------+--------------------+-------+------------+--------------------+
|           song_id| artist_name|        pca_features|cluster| artist_name|        pca_features|
+------------------+------------+--------------------+-------+------------+--------------------+
|SOXFQVY12A58A7B456|phil_collins|[-341.59995853429...|      2|phil_collins|[-341.59995853429...|
|SOFSRCP12CF5CFD696|phil_collins|[-281.62082721270...|      3|phil_collins|[-281.62082721270...|
|SOYHHHT12A6D4F7F97|phil_collins|[-334.15859220591...|      3|phil_collins|[-334.15859220591...|
|SOXHMJM12A58A7A33A|phil_collins|[-256.91899597566...|      2|phil_collins|[-256.91899597566...|
|

In [8]:
# ============================
# 4.2 Data Cleanup and Preparation for Plotting (Revised Again)
# ============================

# Explicitly select and rename to avoid ambiguity
pca_vis_df = pca_combined_df.select(
    pca_clusters_df["song_id"],
    pca_clusters_df["cluster"],
    pca_clusters_df["pca_features"].alias("pca_vector")
)

# Convert to Pandas DataFrame for Plotly visualization
pca_vis_pd = pca_vis_df.toPandas()

# Extract PC1 and PC2 from the PCA vector
pca_vis_pd["PC1"] = pca_vis_pd["pca_vector"].apply(lambda x: x[0])
pca_vis_pd["PC2"] = pca_vis_pd["pca_vector"].apply(lambda x: x[1])

# Drop the original PCA vector column to avoid redundancy
pca_vis_pd.drop("pca_vector", axis=1, inplace=True)

# Display the first few rows to confirm structure
print(pca_vis_pd.head())

                                                                                                    

              song_id  cluster         PC1         PC2
0  SOAWNUW12AC46877F8        1 -458.607834  362.166957
1  SOJMRHG12AB018B633        4 -401.859529  238.815386
2  SOMFNPC12A8C13E5FE        3 -331.027627  323.219000
3  SOLTTJA12A8C13FC12        0 -522.847050  382.753914
4  SOYTPEP12AB0180E7B        1 -416.460613  291.190827


In [10]:
# ============================
# 4.3 PCA 2D Scatter Plot Visualization
# ============================

import plotly.express as px

# Create 2D scatter plot
fig = px.scatter(
    pca_vis_pd,
    x="PC1",
    y="PC2",
    color="cluster",
    title="PCA 2D Scatter Plot - Cluster Visualization",
    hover_data=["song_id"],
    color_continuous_scale="Viridis"
)

# Update layout for clarity
fig.update_layout(
    width=900,
    height=600,
    template="plotly_dark",
    title_x=0.5,
    title_font=dict(size=18)
)

# Show plot
fig.show()

## PCA 2D Scatter Plot Analysis

### Cluster Distribution

- The PCA 2D plot reveals **three dominant clusters** with significant overlap, particularly in the central region.
- **Cluster 1 (Yellow)** and **Cluster 0 (Teal)** exhibit the highest density, suggesting a larger number of songs with similar feature sets.
- **Cluster 2 (Purple)** is more dispersed, indicating songs with more varied characteristics.

### Outlier Analysis

- Several isolated points are present in the **bottom left corner** and **top right corner**.
- These could potentially indicate **rare or niche musical styles**, **unique audio profiles**, or **emerging trends**.

### Overlapping Regions

- Noticeable overlap between **Cluster 0 (Teal)** and **Cluster 1 (Yellow)**, suggesting songs with **mixed or ambiguous characteristics**.
- This overlap may indicate that **PCA is not fully capturing non-linear relationships**, hence the shift to **UMAP**.


In [11]:
# ============================
# 5.1 Load UMAP 2D Clusters and Embeddings
# ============================

# Define paths
umap_2d_clusters_path = data_paths["UMAP_2D_Clusters"]
umap_2d_embeddings_path = data_paths["UMAP_2D_Embeddings"]

# Load UMAP 2D Cluster Labels
print("Loading UMAP 2D Cluster Labels...")
umap_2d_clusters_df = spark.read.parquet(umap_2d_clusters_path)
print(f"UMAP 2D Clusters - Schema:\n")
umap_2d_clusters_df.printSchema()
umap_2d_clusters_df.show(5)

# Load UMAP 2D Embeddings
print("\nLoading UMAP 2D Embeddings...")
umap_2d_embeddings_df = spark.read.parquet(umap_2d_embeddings_path)
print(f"UMAP 2D Embeddings - Schema:\n")
umap_2d_embeddings_df.printSchema()
umap_2d_embeddings_df.show(5)

Loading UMAP 2D Cluster Labels...
UMAP 2D Clusters - Schema:

root
 |-- song_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- UMAP_1: float (nullable = true)
 |-- UMAP_2: float (nullable = true)
 |-- cluster: integer (nullable = true)

+------------------+--------------------+---------+---------+-------+
|           song_id|         artist_name|   UMAP_1|   UMAP_2|cluster|
+------------------+--------------------+---------+---------+-------+
|SOCHDXC12A8C13837F|      dungeon_family|3.2482305|4.1154366|      0|
|SOPXLMX12AB01884D1|melanie_and_the_s...|5.0003867|7.2905426|      0|
|SOHLFAO12AB0189832|            dj_nasty|3.2217772|5.2945986|      0|
|SOVFFIH12A81C217E6|               tbone|4.5663486| 7.974386|      0|
|SOJQYBG12AB018B656|perez_prado__his_...|4.1665735| 8.156345|      0|
+------------------+--------------------+---------+---------+-------+
only showing top 5 rows


Loading UMAP 2D Embeddings...
UMAP 2D Embeddings - Schema:

root
 |-- song_id: s

In [13]:
# ============================
# 5.2 Data Merge and Preparation for Plotting (Revised)
# ============================

# Select and rename to avoid ambiguity
umap_2d_vis_df = umap_2d_combined_df.select(
    umap_2d_clusters_df["song_id"],
    umap_2d_clusters_df["cluster"],
    umap_2d_embeddings_df["UMAP_1"].alias("UMAP_1"),
    umap_2d_embeddings_df["UMAP_2"].alias("UMAP_2")
)

# Convert to Pandas DataFrame for Plotly visualization
umap_2d_vis_pd = umap_2d_vis_df.toPandas()

# Display the first few rows to confirm structure
print(umap_2d_vis_pd.head())

              song_id  cluster    UMAP_1    UMAP_2
0  SOXFQVY12A58A7B456        2  0.758184  4.998288
1  SOFSRCP12CF5CFD696        2 -0.265198  3.518531
2  SOYHHHT12A6D4F7F97        2  1.682284  3.265269
3  SOXHMJM12A58A7A33A        2 -1.223929  5.828059
4  SOBJUKG12A58A7DCA8        2 -1.400908  6.287881


In [14]:
# ============================
# 5.3 UMAP 2D Scatter Plot Visualization
# ============================

# Create 2D scatter plot using Plotly
fig = px.scatter(
    umap_2d_vis_pd,
    x="UMAP_1",
    y="UMAP_2",
    color="cluster",
    title="UMAP 2D Scatter Plot - Cluster Visualization",
    hover_data=["song_id"],
    color_continuous_scale="Viridis"
)

# Update layout for clarity
fig.update_layout(
    width=900,
    height=600,
    template="plotly_dark",
    title_x=0.5,
    title_font=dict(size=18)
)

# Show plot
fig.show()

## UMAP 2D Scatter Plot Analysis

### Enhanced Separation

- **UMAP 2D** provides a **clearer separation of clusters** compared to PCA.
- **Cluster 0 (Teal)** is more distinctly defined, occupying a more cohesive region.
- **Cluster 1 (Yellow)** and **Cluster 2 (Purple)** are still somewhat overlapping but are **more distinguishable** than in PCA.

### Cluster Structure

- UMAP is better at **preserving local structures**, evident in the **dense, compact regions** in Cluster 0.
- This compactness suggests that the UMAP algorithm effectively captures songs with **highly similar audio features**.

### Outlier Identification

- **Fewer isolated points** compared to PCA, indicating that UMAP is **more effective at grouping similar data points**.
- The **outliers are more concentrated**, suggesting potential areas of interest for **further analysis**.



In [15]:
# ============================
# 6.1 Load UMAP 3D Clusters and Embeddings
# ============================

# Define paths
umap_3d_clusters_path = data_paths["UMAP_3D_Clusters"]
umap_3d_embeddings_path = data_paths["UMAP_3D_Embeddings"]

# Load UMAP 3D Cluster Labels
print("Loading UMAP 3D Cluster Labels...")
umap_3d_clusters_df = spark.read.parquet(umap_3d_clusters_path)
print(f"UMAP 3D Clusters - Schema:\n")
umap_3d_clusters_df.printSchema()
umap_3d_clusters_df.show(5)

# Load UMAP 3D Embeddings
print("\nLoading UMAP 3D Embeddings...")
umap_3d_embeddings_df = spark.read.parquet(umap_3d_embeddings_path)
print(f"UMAP 3D Embeddings - Schema:\n")
umap_3d_embeddings_df.printSchema()
umap_3d_embeddings_df.show(5)

Loading UMAP 3D Cluster Labels...
UMAP 3D Clusters - Schema:

root
 |-- song_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- UMAP_1: float (nullable = true)
 |-- UMAP_2: float (nullable = true)
 |-- UMAP_3: float (nullable = true)
 |-- cluster_3d: integer (nullable = true)

+------------------+----------------+---------+---------+---------+----------+
|           song_id|     artist_name|   UMAP_1|   UMAP_2|   UMAP_3|cluster_3d|
+------------------+----------------+---------+---------+---------+----------+
|SORLYZU12AB017CE89|  sergio_franchi| 3.733284| 4.137823| 5.463098|         2|
|SONJFLA12AB018BB67|   faiz_ali_faiz|7.1774898| 3.225655| 6.858816|         0|
|SOILPQQ12AB017E82A|   faiz_ali_faiz|6.8982735|2.8283317|6.5005045|         0|
|SOWDLGO12AB0182434|junior_kimbrough|0.7632514|2.6186917| 7.034706|         1|
|SOHPAVE12A8AE47190|junior_kimbrough|5.2476087|2.1003077| 6.909487|         0|
+------------------+----------------+---------+---------+-------

In [16]:
# ============================
# 6.2 Data Merge and Preparation for Plotting
# ============================

# Merge DataFrames on 'song_id'
umap_3d_combined_df = umap_3d_clusters_df.join(umap_3d_embeddings_df, on="song_id", how="inner")

# Select relevant columns and rename for clarity
umap_3d_vis_df = umap_3d_combined_df.select(
    umap_3d_clusters_df["song_id"],
    umap_3d_clusters_df["cluster_3d"].alias("cluster"),
    umap_3d_embeddings_df["UMAP_1"],
    umap_3d_embeddings_df["UMAP_2"],
    umap_3d_embeddings_df["UMAP_3"]
)

# Convert to Pandas DataFrame for Plotly visualization
umap_3d_vis_pd = umap_3d_vis_df.toPandas()

# Display the first few rows to confirm structure
print(umap_3d_vis_pd.head())

              song_id  cluster    UMAP_1    UMAP_2    UMAP_3
0  SOXFQVY12A58A7B456        1  1.167929  4.989672  7.630533
1  SOFSRCP12CF5CFD696        1  0.074257  4.014765  7.827751
2  SOYHHHT12A6D4F7F97        1  1.736577  3.700082  9.062063
3  SOXHMJM12A58A7A33A        1 -0.512746  6.065361  7.428175
4  SOBJUKG12A58A7DCA8        1 -0.610780  6.477085  7.192257


In [17]:
# ============================
# 6.3 UMAP 3D Scatter Plot Visualization
# ============================

# Create 3D scatter plot using Plotly
fig = px.scatter_3d(
    umap_3d_vis_pd,
    x="UMAP_1",
    y="UMAP_2",
    z="UMAP_3",
    color="cluster",
    title="UMAP 3D Scatter Plot - Cluster Visualization",
    hover_data=["song_id"],
    color_continuous_scale="Viridis"
)

# Update layout for clarity
fig.update_layout(
    width=1000,
    height=700,
    template="plotly_dark",
    title_x=0.5,
    title_font=dict(size=18)
)

# Show plot
fig.show()

## 1.3 UMAP 3D Scatter Plot Analysis

### Depth and Density Analysis

- The **third dimension adds depth**, revealing **sub-clusters** and more **granular structures**.
- **Cluster 0 (Teal)** has a significant spread in the third dimension, indicating **variability in audio features**.
- **Clusters 1 (Yellow)** and **2 (Purple)** are more compact, suggesting **more homogeneous audio characteristics**.

### Potential Outliers

- Certain points are more **isolated in 3D**, particularly in the **far left and right regions**.
- These could indicate songs with **rare feature combinations** or **highly distinct musical styles**.


In [19]:
from pymongo import MongoClient

# MongoDB Connection String
MONGO_URI = "mongodb+srv://admin:yourpassword123@bigdatahw.udemiib.mongodb.net/?retryWrites=true&w=majority&appName=BigDataHW"

# Establish MongoDB Connection
try:
    client = MongoClient(MONGO_URI)
    db = client["msd_database"]
    print("MongoDB connection established successfully.")
except Exception as e:
    print(f"MongoDB connection failed: {e}")

MongoDB connection established successfully.


In [20]:
# ============================
# Data Extraction from MongoDB
# ============================

# Collection reference
collection = db["song_data"]

# Define query and projection
query = {}
projection = {
    "song_id": 1,
    "tempo": 1,
    "loudness": 1,
    "song_hotttnesss": 1,
    "_id": 0
}

# Extract data
try:
    print("Extracting data from MongoDB...")
    mongo_data = list(collection.find(query, projection))
    print(f"Data extracted successfully. Total records: {len(mongo_data)}")
except Exception as e:
    print(f"Data extraction failed: {e}")

# Convert to Pandas DataFrame for easier handling
import pandas as pd

features_df = pd.DataFrame(mongo_data)
print("Sample Data:")
print(features_df.head())


Extracting data from MongoDB...
Data extracted successfully. Total records: 10000
Sample Data:
              song_id
0  SOTEDAD12A8AE4735F
1  SOUWZWV12AB0181ACF
2  SOFAJOM12A8C141EE4
3  SOCCBOH12A8C13E947
4  SOBBWJS12AB0182522


In [21]:
# ============================
# Data Extraction - Timbre & Pitch Features
# ============================

# Update projection to include timbre and pitch features
projection = {
    "song_id": 1,
    "timbre_mean": 1,
    "timbre_max": 1,
    "timbre_min": 1,
    "timbre_std": 1,
    "pitch_mean": 1,
    "pitch_max": 1,
    "pitch_min": 1,
    "pitch_std": 1,
    "_id": 0
}

# Extract data
try:
    print("Extracting timbre and pitch data from MongoDB...")
    mongo_data = list(collection.find({}, projection))
    print(f"Data extracted successfully. Total records: {len(mongo_data)}")
except Exception as e:
    print(f"Data extraction failed: {e}")

# Convert to Pandas DataFrame for easier handling
features_df = pd.DataFrame(mongo_data)
print("Sample Data:")
print(features_df.head())

Extracting timbre and pitch data from MongoDB...
Data extracted successfully. Total records: 10000
Sample Data:
              song_id                                        timbre_mean  \
0  SOTEDAD12A8AE4735F  [39.321571350097656, 6.516843795776367, 23.708...   
1  SOUWZWV12AB0181ACF  [43.89027786254883, -89.25192260742188, -22.63...   
2  SOFAJOM12A8C141EE4  [46.81022644042969, 41.30602264404297, 40.8432...   
3  SOCCBOH12A8C13E947  [46.884056091308594, 31.062284469604492, 12.59...   
4  SOBBWJS12AB0182522  [44.009483337402344, -0.6849293112754822, -35....   

                                          timbre_max  \
0  [48.8380012512207, 191.81300354003906, 189.507...   
1  [50.505001068115234, 171.1300048828125, 139.67...   
2  [52.86899948120117, 171.1300048828125, 127.591...   
3  [54.18000030517578, 129.39599609375, 105.94899...   
4  [52.16699981689453, 171.1300048828125, 72.2509...   

                                          timbre_min  \
0  [0.0, -248.25999450683594, -173.738

In [22]:
# ============================
# Data Merging with Cluster Labels
# ============================

# Merge with PCA data
pca_merged_df = pca_combined_df.join(
    spark.createDataFrame(features_df),
    on="song_id",
    how="inner"
)

# Merge with UMAP 2D data
umap_2d_merged_df = umap_2d_combined_df.join(
    spark.createDataFrame(features_df),
    on="song_id",
    how="inner"
)

# Merge with UMAP 3D data
umap_3d_merged_df = umap_3d_combined_df.join(
    spark.createDataFrame(features_df),
    on="song_id",
    how="inner"
)

# Display a sample of each merged DataFrame
print("PCA Merged Data Sample:")
pca_merged_df.show(5)

print("UMAP 2D Merged Data Sample:")
umap_2d_merged_df.show(5)

print("UMAP 3D Merged Data Sample:")
umap_3d_merged_df.show(5)

PCA Merged Data Sample:


25/05/12 15:26:05 WARN TaskSetManager: Stage 48 contains a task of very large size (1849 KiB). The maximum recommended task size is 1000 KiB.
                                                                                                    

+------------------+------------+--------------------+-------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           song_id| artist_name|        pca_features|cluster| artist_name|        pca_features|         timbre_mean|          timbre_max|          timbre_min|          timbre_std|          pitch_mean|           pitch_max|           pitch_min|           pitch_std|
+------------------+------------+--------------------+-------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|SOXFQVY12A58A7B456|phil_collins|[-341.59995853429...|      2|phil_collins|[-341.59995853429...|[41.7702293395996...|[52.0200004577636...|[0.0, -148.018997...|[6.99290275573730...|[0.26835367083549...|[1.0

25/05/12 15:26:07 WARN TaskSetManager: Stage 53 contains a task of very large size (1849 KiB). The maximum recommended task size is 1000 KiB.
                                                                                                    

+------------------+------------+-----------+---------+-------+------------+-----------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           song_id| artist_name|     UMAP_1|   UMAP_2|cluster| artist_name|     UMAP_1|   UMAP_2|         timbre_mean|          timbre_max|          timbre_min|          timbre_std|          pitch_mean|           pitch_max|           pitch_min|           pitch_std|
+------------------+------------+-----------+---------+-------+------------+-----------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|SOXFQVY12A58A7B456|phil_collins|  0.7581841|4.9982877|      2|phil_collins|  0.7581841|4.9982877|[41.7702293395996...|[52.0200004577636...|[0.0, -148.018997...|[6.99290275573730...|[0.26835367083549

25/05/12 15:26:09 WARN TaskSetManager: Stage 58 contains a task of very large size (1849 KiB). The maximum recommended task size is 1000 KiB.
                                                                                                    

+------------------+------------+-----------+---------+---------+----------+------------+-----------+---------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           song_id| artist_name|     UMAP_1|   UMAP_2|   UMAP_3|cluster_3d| artist_name|     UMAP_1|   UMAP_2|   UMAP_3|         timbre_mean|          timbre_max|          timbre_min|          timbre_std|          pitch_mean|           pitch_max|           pitch_min|           pitch_std|
+------------------+------------+-----------+---------+---------+----------+------------+-----------+---------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|SOXFQVY12A58A7B456|phil_collins|   1.167929|4.9896717| 7.630533|         1|phil_collins|   1.167929|4.9896717| 7.630533|[41.77022

In [25]:
# ============================
# Cluster Profiling and Feature Analysis (Revised for UMAP 3D)
# ============================

from pyspark.sql.functions import avg, col

# Rename cluster column for UMAP 3D
umap_3d_merged_df = umap_3d_merged_df.withColumnRenamed("cluster_3d", "cluster")

# PCA Cluster Profiling
pca_profile = pca_merged_df.select(
    col("cluster"),
    col("timbre_mean")[0].alias("timbre_mean"),
    col("pitch_mean")[0].alias("pitch_mean")
).groupBy("cluster").agg(
    avg("timbre_mean").alias("avg_timbre_mean"),
    avg("pitch_mean").alias("avg_pitch_mean")
)

# UMAP 2D Cluster Profiling
umap_2d_profile = umap_2d_merged_df.select(
    col("cluster"),
    col("timbre_mean")[0].alias("timbre_mean"),
    col("pitch_mean")[0].alias("pitch_mean")
).groupBy("cluster").agg(
    avg("timbre_mean").alias("avg_timbre_mean"),
    avg("pitch_mean").alias("avg_pitch_mean")
)

# UMAP 3D Cluster Profiling (with renamed column)
umap_3d_profile = umap_3d_merged_df.select(
    col("cluster"),
    col("timbre_mean")[0].alias("timbre_mean"),
    col("pitch_mean")[0].alias("pitch_mean")
).groupBy("cluster").agg(
    avg("timbre_mean").alias("avg_timbre_mean"),
    avg("pitch_mean").alias("avg_pitch_mean")
)

# Display cluster profiles
print("PCA Cluster Profile:")
pca_profile.show()

print("UMAP 2D Cluster Profile:")
umap_2d_profile.show()

print("UMAP 3D Cluster Profile:")
umap_3d_profile.show()

PCA Cluster Profile:


25/05/12 15:32:45 WARN TaskSetManager: Stage 63 contains a task of very large size (1849 KiB). The maximum recommended task size is 1000 KiB.
25/05/12 15:32:48 WARN TaskSetManager: Stage 70 contains a task of very large size (1849 KiB). The maximum recommended task size is 1000 KiB.


+-------+-----------------+------------------+
|cluster|  avg_timbre_mean|    avg_pitch_mean|
+-------+-----------------+------------------+
|      1|44.37475043051108|0.4307106161481949|
|      3|37.46861965931891|0.3694374855827071|
|      4|45.01171201626634|0.4620673689526593|
|      2|44.16333970353623|0.4539422231448561|
|      0|41.43197799648838|0.4883138532458718|
+-------+-----------------+------------------+

UMAP 2D Cluster Profile:


                                                                                                    

+-------+------------------+------------------+
|cluster|   avg_timbre_mean|    avg_pitch_mean|
+-------+------------------+------------------+
|      1| 42.97977788217972|0.4616494812408213|
|      2|40.906489905551595|0.4132227588515618|
|      0| 44.29018415319466|0.4632275590563804|
+-------+------------------+------------------+

UMAP 3D Cluster Profile:


25/05/12 15:32:49 WARN TaskSetManager: Stage 80 contains a task of very large size (1849 KiB). The maximum recommended task size is 1000 KiB.
                                                                                                    

+-------+-----------------+-------------------+
|cluster|  avg_timbre_mean|     avg_pitch_mean|
+-------+-----------------+-------------------+
|      1|41.28555183643742| 0.4147219720178785|
|      2|43.37807749734689|0.46299076468503775|
|      0|43.36812896790577|0.45555261128992164|
+-------+-----------------+-------------------+



## ✅ Cluster Profile Analysis and Real-World Implications

### 1. PCA Cluster Profile Analysis

| Cluster | Avg Timbre Mean | Avg Pitch Mean | Characteristics                                      |
|---------|------------------|----------------|------------------------------------------------------|
| 0       | 41.43            | 0.49           | High pitch, balanced timbre — Upbeat, energetic songs. |
| 1       | 44.37            | 0.43           | High timbre, moderate pitch — Potentially cinematic or emotional songs. |
| 2       | 44.16            | 0.45           | Balanced timbre and pitch — Neutral, versatile tracks. |
| 3       | 37.47            | 0.37           | Lower timbre and pitch — Likely softer or acoustic music. |
| 4       | 45.01            | 0.46           | High timbre, balanced pitch — Vocally intense or expressive music. |

### 2. UMAP 2D Cluster Profile Analysis

| Cluster | Avg Timbre Mean | Avg Pitch Mean | Characteristics                                         |
|---------|------------------|----------------|---------------------------------------------------------|
| 0       | 44.29            | 0.46           | Highly expressive, vocally dominant tracks.             |
| 1       | 42.97            | 0.46           | Balanced timbre and pitch, general-purpose songs.       |
| 2       | 40.90            | 0.41           | Softer, potentially acoustic or instrumental tracks.    |

### 3. UMAP 3D Cluster Profile Analysis

| Cluster | Avg Timbre Mean | Avg Pitch Mean | Characteristics                                       |
|---------|------------------|----------------|-------------------------------------------------------|
| 0       | 43.36            | 0.45           | Balanced timbre, high pitch — Energetic or upbeat songs. |
| 1       | 41.28            | 0.41           | Softer, mellow, or acoustic tracks.                   |
| 2       | 43.37            | 0.46           | Highly expressive or intense vocal tracks.            |

---

## ✅ Real-World Application: Music Recommendation System

### Recommendation Strategy

Given a song, we can recommend songs from the same cluster that have similar timbre and pitch values.

**Example:**

- If a user is listening to a song in **Cluster 0 (PCA)** (high pitch, balanced timbre), we can recommend other **upbeat or energetic songs** in the same cluster.

### Implementation Logic

- **Input:** `song_id`, `cluster`, `timbre_mean`, `pitch_mean`
- **Recommendation:** Find songs in the **same cluster** with the **closest timbre and pitch values**.
