In [1]:
# DEFINE FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

def get_access_token(client_id:str, client_sc:str):
    import requests
    
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    data = f'grant_type=client_credentials&client_id={client_id}&client_secret={client_sc}'.encode()
    response = requests.post('https://accounts.spotify.com/api/token', headers=headers, data=data).json()
    access_token = response['access_token']

    return access_token

def get_response(access_token:str, endpoint:str, params:dict=None):
    import requests, json

    url = f"https://api.spotify.com/v1/{endpoint}"
    headers = {
        'Authorization': f'Bearer {access_token}',
    }

    if params != None:
        response = requests.get(url=url, params=params, headers=headers)
    else:
        response = requests.get(url=url, headers=headers)
    print(response)
    
    if response.status_code == 200:
        try:
            data = response.json()
            return data
        except json.decoder.JSONDecodeError:
            raise ValueError(f"API Server Error - {endpoint} - Invalid JSON content in response: {response.text}")
    else:
        raise ValueError(f"API Server Error - {endpoint} - Non-200 status code received: {response.status_code}")
    

def post_response(access_token:str, endpoint:str, data:dict=None):
    import requests

    url = f"https://api.spotify.com/v1/{endpoint}"
    headers = {
        'Authorization': f'Bearer {access_token}',
    }

    response = requests.post(url=url, headers=headers)
    print(response)
    
    if response.status_code == 200:
        pass
    else:
        raise ValueError(f"API Server Error - {endpoint} - Non-200 status code received: {response.status_code}")
    
# INFOS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

client_id = "67634b8925dc48f79e59045d9e4d5014"
client_sc = "b6190d9f6d404f6b86e8f0abcd1d3779"
user_id = "k3u4dn9nb7cll8gtzvxb9whvt"    

# START CODE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from math import ceil
import json

### Build Session
spark = SparkSession.builder \
    .appName("pipeline_demo") \
    .getOrCreate()

### Create Access Token
access_token = get_access_token(client_id=client_id, client_sc=client_sc)

### Create Playlist Lists
endpoint = f"users/{user_id}/playlists"
params = {
    "limit": 50,
    "offset": 0
}

playlists = get_response(access_token=access_token, endpoint=endpoint, params=params)
json_string  = json.dumps(playlists)
json_rdd = spark.sparkContext.parallelize([json_string])
df_plinfo = spark.read.json(json_rdd, multiLine=True)

items = df_plinfo \
    .withColumn("items", explode("items")) \
    .select("items.id") \
    .rdd.flatMap(lambda x: x).collect()

### Create Playlist Item Lists
track_list = [] # <---------- "Need To Use"
for id in items:
    endpoint = f"playlists/{id}/tracks"
    playlist_spec = get_response(access_token=access_token, endpoint=endpoint)
    
    json_string  = json.dumps(playlist_spec)
    json_rdd = spark.sparkContext.parallelize([json_string])
    df_playlist_spec = spark.read.json(json_rdd, multiLine=True)
    
    ids = df_playlist_spec \
    .withColumn("items", explode("items")) \
    .select("items.track.id") \
    .rdd.flatMap(lambda x: x).collect()
    
    track_list += ids
    
    total = df_playlist_spec.select("total").first()[0]
    left = int(total)-100
    cnt = ceil(left/100)
    
    for i in range(cnt):
        offset = 100 + 100 * i
        params = {"offset":offset}
        
        playlist_spec = get_response(access_token=access_token, endpoint=endpoint, params=params)
        
        json_string  = json.dumps(playlist_spec)
        json_rdd = spark.sparkContext.parallelize([json_string])
        df_playlist_spec = spark.read.json(json_rdd, multiLine=True)
        
        ids = df_playlist_spec \
        .withColumn("items", explode("items")) \
        .select("items.track.id") \
        .rdd.flatMap(lambda x: x).collect()
        
        track_list += ids      

cnt = ceil(len(track_list)/50)

big_list = []
for j in range(cnt):
    big_list.append(track_list[j*50:(j+1)*50])

# Create Dataframe : main_df
main_df = None
cnt = 0
for small_list in big_list:
    
    print(cnt)
    
    tracks = ""
    for id in small_list:
        tracks += f",{id}"
    tracks = tracks[1:]
    
    endpoint = "tracks"
    params = {"ids":tracks}
    track = get_response(access_token=access_token, endpoint=endpoint, params=params)
    
    json_string  = json.dumps(track)
    json_rdd = spark.sparkContext.parallelize([json_string])
    df_tracks = spark.read.json(json_rdd, multiLine=True)
    
    df_tracks = spark.read.json(json_rdd, multiLine=True) \
        .withColumn("tracks", explode("tracks")) \
        .selectExpr("tracks.id",
                    "tracks.popularity")
    
    endpoint = "audio-features"
    params = {"ids":tracks}
    audio_features = get_response(access_token=access_token, endpoint=endpoint, params=params)
    
    json_string  = json.dumps(audio_features)
    json_rdd = spark.sparkContext.parallelize([json_string])
    df_audio_features = spark.read.json(json_rdd, multiLine=True) \
        .withColumn("audio_features", explode("audio_features")) \
        .selectExpr("audio_features.id",
                    "audio_features.key",
                    "audio_features.mode",
                    "audio_features.time_signature",
                    "audio_features.tempo",
                    "audio_features.acousticness",
                    "audio_features.danceability",
                    "audio_features.energy",
                    "audio_features.instrumentalness",
                    "audio_features.liveness",
                    "audio_features.loudness",
                    "audio_features.speechiness",
                    "audio_features.valence")
    
    result_track_df = df_tracks.join(df_audio_features, "id", "left")
    if cnt == 0:
        main_df = result_track_df
    else:
        main_df = main_df.union(result_track_df)
    cnt += 1

### Load Dataframe : df_dw
dw_tracks = spark.read.parquet("file:///home/kjh/data/Spotify/tracks")
dw_audioFeatures = spark.read.parquet("file:///home/kjh/data/Spotify/tracks_audioFeatures")
df_dw = dw_tracks.join(dw_audioFeatures, "id", "left")

### Union Dataframe : df
df = df_dw.union(main_df)

df = df.dropna()

 ### Scale Dataframe : minmax_scaler
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler

selected_features = ["popularity", "key", "mode", "time_signature", "tempo", "acousticness", "danceability", "energy", "instrumentalness", "liveness", "loudness", "speechiness", "valence"]
assembler = VectorAssembler(inputCols=selected_features, outputCol="features")
df_assembled = assembler.transform(df)

minmax_scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
minmax_model = minmax_scaler.fit(df_assembled)
minmax_scaled_df = minmax_model.transform(df_assembled)

### Split Dataframe : train & test
from pyspark.sql.functions import col

minmax_scaled_train = minmax_scaled_df.filter(col("id").isin(track_list))
minmax_scaled_test = minmax_scaled_df.filter(~col("id").isin(track_list))

### Mege Datas into Group
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col

kmeans = KMeans().setK(4).setSeed(1)
model = kmeans.fit(minmax_scaled_train)

centers = model.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

df_result = model.transform(minmax_scaled_test)
df_result.show()

df_prediction = df_result.select("features", "prediction").show()

df_mean_datas = df_result.groupBy("prediction").agg(
    col("prediction")
)

131072x1 화면 크기가 잘못됐습니다. 문제가 예상됩니다
24/01/06 23:32:55 WARN Utils: Your hostname, KJH-DESKTOP resolves to a loopback address: 127.0.1.1; using 192.168.69.220 instead (on interface eth0)
24/01/06 23:32:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/06 23:32:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/06 23:32:56 WARN RapidsPluginUtils: RAPIDS Accelerator 23.12.0 using cudf 23.12.0.
24/01/06 23:32:56 WARN RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.
24/01/06 23:32:56 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.


<Response [200]>


                                                                                

<Response [200]>
<Response [200]>
0
<Response [200]>
<Response [200]>
1
<Response [200]>
<Response [200]>
2
<Response [200]>
<Response [200]>


24/01/06 23:33:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/01/06 23:33:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

Cluster Centers:
[ 3.18714286e+01  5.48571429e+00  5.57142857e-01  4.00000000e+00
  1.25724286e+02  2.65791143e-01  6.26900000e-01  7.36600000e-01
  8.28374143e-04  1.67651429e-01 -4.41340000e+00  4.72457143e-02
  5.05185714e-01]
[ 7.23214286e+01  5.25000000e+00  5.71428571e-01  4.00000000e+00
  1.36270750e+02  1.20300000e-01  7.49714286e-01  7.80964286e-01
  2.98342857e-04  1.31807143e-01 -4.60378571e+00  7.30321429e-02
  6.91214286e-01]
[ 5.06785714e+01  6.10714286e+00  5.35714286e-01  4.00000000e+00
  9.71753214e+01  2.98810714e-01  6.66071429e-01  7.59928571e-01
  1.20296429e-05  1.87292857e-01 -4.12535714e+00  9.61642857e-02
  5.77292857e-01]
[ 3.54117647e+01  5.47058824e+00  7.64705882e-01  3.82352941e+00
  1.61670471e+02  1.64845059e-01  4.89176471e-01  7.78117647e-01
  9.30452941e-04  1.62670588e-01 -4.40194118e+00  7.25294118e-02
  5.58235294e-01]


                                                                                

+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+--------------------+--------------------+----------+
|                  id|popularity|key|mode|time_signature|  tempo|acousticness|danceability|energy|instrumentalness|liveness|loudness|speechiness|valence|            features|     scaled_features|prediction|
+--------------------+----------+---+----+--------------+-------+------------+------------+------+----------------+--------+--------+-----------+-------+--------------------+--------------------+----------+
|1SJS8NiUV1aI6Gxhx...|         2|  0|   1|             4|115.009|       0.367|       0.803| 0.672|         4.55E-4|   0.229|  -4.486|     0.0348|  0.962|[2.0,0.0,1.0,4.0,...|[0.02,0.0,1.0,0.8...|         0|
|6L2BtMXLBpBCUiivS...|         5|  0|   1|             3| 77.528|     0.00931|       0.542| 0.542|         6.77E-5|   0.146|  -6.951|     0.0233|  0.384|[5.0,0.0,1.0,3.0,..

                                                                                

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[0.0,2.0,1.0,4.0,...|         0|
|[0.0,1.0,1.0,5.0,...|         0|
|[32.0,0.0,0.0,4.0...|         2|
|[2.0,1.0,1.0,3.0,...|         0|
|[21.0,10.0,1.0,4....|         0|
|[0.0,7.0,1.0,4.0,...|         0|
|[0.0,11.0,0.0,4.0...|         0|
|[0.0,5.0,1.0,4.0,...|         0|
|[1.0,0.0,1.0,4.0,...|         0|
|[1.0,5.0,1.0,4.0,...|         3|
|[0.0,6.0,0.0,4.0,...|         0|
|[0.0,9.0,1.0,4.0,...|         2|
|[2.0,7.0,0.0,3.0,...|         2|
|[0.0,1.0,0.0,4.0,...|         0|
|[0.0,10.0,1.0,4.0...|         2|
|[0.0,2.0,1.0,4.0,...|         0|
|[3.0,2.0,0.0,3.0,...|         3|
|[0.0,8.0,1.0,4.0,...|         0|
|[16.0,9.0,1.0,4.0...|         0|
|[6.0,7.0,1.0,4.0,...|         3|
+--------------------+----------+
only showing top 20 rows

