In [3]:
!pip install pyspark --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


# Dataset link
https://www.kaggle.com/datasets/abdulszz/spotify-most-streamed-songs?select=Spotify+Most+Streamed+Songs.csv


In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset CSV stored under MyDrive can be read by path in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#### Creating Spark Session
import pyspark
from pyspark.sql import SparkSession


In [6]:
# Start (or reuse) the Spark session used by the rest of the notebook.
# It runs locally and uses every available core ('local[*]').
session_builder = SparkSession.builder.appName('Kmeans').master('local[*]')

# Memory settings for the session.
# NOTE(review): with a local master the executor presumably lives inside the
# driver JVM, so 'spark.executor.memory' may have no effect -- confirm.
session_builder = session_builder.config('spark.driver.memory', '1g')
session_builder = session_builder.config('spark.executor.memory', '2g')

spark = session_builder.getOrCreate()

In [7]:
# Load the Spotify CSV from the mounted Drive. The first row supplies the
# column names, and inferSchema asks Spark to infer each column's type.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/drive/MyDrive/DataSet/Big Data data Sets/Spotify.csv")
)

In [8]:
# Preview the first five rows: track identity, release date and Spotify reach.
df.select([
    'track_name', 'artist(s)_name', 'artist_count',
    'released_year', 'released_month', 'released_day',
    'in_spotify_playlists', 'in_spotify_charts',
]).show(5)

+--------------------+----------------+------------+-------------+--------------+------------+--------------------+-----------------+
|          track_name|  artist(s)_name|artist_count|released_year|released_month|released_day|in_spotify_playlists|in_spotify_charts|
+--------------------+----------------+------------+-------------+--------------+------------+--------------------+-----------------+
|Seven (feat. Latt...|Latto, Jung Kook|           2|         2023|             7|          14|                 553|              147|
|                LALA|     Myke Towers|           1|         2023|             3|          23|                1474|               48|
|             vampire|  Olivia Rodrigo|           1|         2023|             6|          30|                1397|              113|
|        Cruel Summer|    Taylor Swift|           1|         2019|             8|          23|                7858|              100|
|      WHERE SHE GOES|       Bad Bunny|           1|         2

In [9]:
# Preview stream counts, presence on the other platforms, tempo and mode.
# 20 is the default row count of show(), written out explicitly here.
df.select([
    'streams',
    'in_apple_playlists', 'in_apple_charts',
    'in_deezer_playlists', 'in_deezer_charts',
    'in_shazam_charts',
    'bpm', 'mode',
]).show(20)

+----------+------------------+---------------+-------------------+----------------+----------------+---+-----+
|   streams|in_apple_playlists|in_apple_charts|in_deezer_playlists|in_deezer_charts|in_shazam_charts|bpm| mode|
+----------+------------------+---------------+-------------------+----------------+----------------+---+-----+
| 141381703|                43|            263|                 45|              10|             826|125|Major|
| 133716286|                48|            126|                 58|              14|             382| 92|Major|
| 140003974|                94|            207|                 91|              14|             949|138|Major|
| 800840817|               116|            207|                125|              12|             548|170|Major|
| 303236322|                84|            133|                 87|              15|             425|144|Minor|
| 183706234|                67|            213|                 88|              17|             946|141

In [10]:
# Preview the first five rows of the percentage-valued audio features,
# together with the cover-art URL column.
df.select([
    'danceability_%', 'valence_%', 'energy_%',
    'acousticness_%', 'instrumentalness_%',
    'liveness_%', 'speechiness_%',
    'cover_url',
]).show(5)

+--------------+---------+--------+--------------+------------------+----------+-------------+--------------------+
|danceability_%|valence_%|energy_%|acousticness_%|instrumentalness_%|liveness_%|speechiness_%|           cover_url|
+--------------+---------+--------+--------------+------------------+----------+-------------+--------------------+
|            80|       89|      83|            31|                 0|         8|            4|           Not Found|
|            71|       61|      74|             7|                 0|        10|            4|https://i.scdn.co...|
|            51|       32|      53|            17|                 0|        31|            6|https://i.scdn.co...|
|            55|       58|      72|            11|                 0|        11|           15|https://i.scdn.co...|
|            65|       23|      80|            14|                63|        11|            6|https://i.scdn.co...|
+--------------+---------+--------+--------------+------------------+---

In [11]:
# Keep only the columns that will be used as clustering features:
# release year, tempo and the percentage-valued audio descriptors.
select_df = df.select([
    'released_year',
    'bpm',
    'danceability_%', 'valence_%', 'energy_%',
    'acousticness_%', 'instrumentalness_%',
    'liveness_%', 'speechiness_%',
])


In [12]:
# NOTE(review): from this cell onward the name `min` refers to Spark's
# aggregate function, not Python's builtin min().
from pyspark.sql.functions import col, min

# Show the earliest release year in the data as a one-row table. The value
# displayed is the constant subtracted from the column in the following cell.
select_df.agg(min(col('released_year'))).show()

+------------------+
|min(released_year)|
+------------------+
|              1930|
+------------------+



In [13]:
# Re-base the release year so that the earliest year (1930) becomes 0.
# Caution: this cell rebinds select_df from itself, so running it a second
# time without re-running the cell that creates select_df subtracts 1930 again.
select_df = select_df.withColumn(
    'released_year',
    select_df['released_year'] - 1930,
)

In [14]:
# Display the schema of the selected feature columns, to check the column
# types Spark inferred before the columns are assembled into a vector.
select_df.schema

StructType([StructField('released_year', IntegerType(), True), StructField('bpm', IntegerType(), True), StructField('danceability_%', IntegerType(), True), StructField('valence_%', IntegerType(), True), StructField('energy_%', IntegerType(), True), StructField('acousticness_%', IntegerType(), True), StructField('instrumentalness_%', IntegerType(), True), StructField('liveness_%', IntegerType(), True), StructField('speechiness_%', IntegerType(), True)])

In [15]:
# Feature scaling.
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Pack every column of select_df into a single vector column, "features".
assembler = VectorAssembler(
    inputCols=select_df.columns,
    outputCol="features",
)
vector_df = assembler.transform(select_df)

# Standardise each dimension (centre on the mean, scale by the standard
# deviation) and write the result to "scaled_feature".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_feature",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(vector_df)
scaled_df = scaler_model.transform(vector_df)

In [16]:
# Clustering: fit K-means on the standardised feature vectors.
from pyspark.ml.clustering import KMeans

# Four clusters; the fixed seed keeps the initialisation repeatable.
kmeans = KMeans(
    k=4,
    seed=1,
    featuresCol="scaled_feature",
    predictionCol="prediction",
)
kmeans_model = kmeans.fit(scaled_df)

# Append each row's assigned cluster id in the "prediction" column.
cluster_df = kmeans_model.transform(scaled_df)

In [17]:
# Display the fitted cluster centres. The model was trained on the
# "scaled_feature" column, so the coordinates are in standardised units and
# follow the column order of select_df (the assembler's inputCols).
kmeans_model.clusterCenters()


[array([ 0.22188327,  0.1847769 ,  0.38052706, -0.0460923 , -0.08174629,
        -0.02777888, -0.17925476, -0.10168455,  2.01720758]),
 array([-0.3749167 ,  0.04181704, -0.8969617 , -0.72752395, -1.01870064,
         0.95515389, -0.04248089, -0.112376  , -0.45567518]),
 array([ 0.11169791, -0.06913108,  0.30614911,  0.35915555,  0.48254957,
        -0.41873823, -0.13903235,  0.08665072, -0.3345288 ]),
 array([-0.21722008,  0.00799447, -0.45224558, -0.81752381, -0.34029491,
         0.15164932,  6.63873612, -0.29868702, -0.47608731])]

In [18]:
# Number of tracks assigned to each cluster.
# 20 is the default row count of show(), written out explicitly here.
(
    cluster_df
    .groupBy('prediction')
    .count()
    .show(20)
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  241|
|         3|   17|
|         2|  546|
|         0|  149|
+----------+-----+

