In [30]:
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

In [2]:
# Get the active SparkSession, or start a new one named 'Clustering'.
ss = (
    SparkSession.builder
    .appName('Clustering')
    .getOrCreate()
)

24/12/13 13:37:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [38]:
# Sample data: one (user_id, item_id, rating) tuple per explicit rating.
# Column names are declared first so the tuple layout below is self-explanatory.
columns = ['user_id', 'item_id', 'rating']

data = [
    # user 0
    (0, 0, 4.0),
    (0, 1, 2.0),
    # user 1
    (1, 1, 3.0),
    (1, 2, 1.0),
    # user 2
    (2, 0, 5.0),
    (2, 2, 4.0),
]

In [9]:
# Build a Spark DataFrame from the in-memory rating tuples, naming the columns
# from `columns`; the bare expression below displays the resulting schema.
rating_df = ss.createDataFrame(data, schema = columns)
rating_df

DataFrame[user_id: bigint, item_id: bigint, rating: double]

In [11]:
# Print the rating rows as a text table to check the DataFrame contents.
rating_df.show()

                                                                                

+-------+-------+------+
|user_id|item_id|rating|
+-------+-------+------+
|      0|      0|   4.0|
|      0|      1|   2.0|
|      1|      1|   3.0|
|      1|      2|   1.0|
|      2|      0|   5.0|
|      2|      2|   4.0|
+-------+-------+------+



### 전처리
- 클러스터링을 위한 전처리로, pivot을 사용해 user × item 평점 행렬(matrix)을 만든다. 평점이 없는 칸은 0으로 채운다.

In [21]:
# Pivot the long (user_id, item_id, rating) rows into one row per user with one
# column per item_id, so that users can later be grouped by their ratings.
# User/item pairs with no rating are null after the pivot and are replaced by 0.
user_item_matric = (
    rating_df
    .groupBy('user_id')
    .pivot('item_id')
    .avg('rating')
    .na.fill(0)
)

user_item_matric.show()

                                                                                

+-------+---+---+---+
|user_id|  0|  1|  2|
+-------+---+---+---+
|      0|4.0|2.0|0.0|
|      1|0.0|3.0|1.0|
|      2|5.0|0.0|4.0|
+-------+---+---+---+



### feature 벡터
피처 선택, 전처리 > 모델 학습 > 예측

In [45]:
# Combine the per-item rating columns into a single vector column ('feature').
# The item columns are read from the pivoted matrix instead of being hard-coded
# as ['0', '1', '2'], so the assembler keeps working when the set of item_ids
# in the data changes. Every column except the 'user_id' key is a feature.
item_cols = [c for c in user_item_matric.columns if c != 'user_id']

assembler = VectorAssembler(inputCols = item_cols
                            , outputCol = 'feature')

assembler

VectorAssembler_ff41b2ad74f1

In [29]:
# Run the assembler over the pivoted matrix; the result keeps the original
# columns and gains the 'feature' vector column.
user_features = assembler.transform(user_item_matric)

user_features.show()

                                                                                

+-------+---+---+---+-------------+
|user_id|  0|  1|  2|      feature|
+-------+---+---+---+-------------+
|      0|4.0|2.0|0.0|[4.0,2.0,0.0]|
|      1|0.0|3.0|1.0|[0.0,3.0,1.0]|
|      2|5.0|0.0|4.0|[5.0,0.0,4.0]|
+-------+---+---+---+-------------+



### 모델 생성 > 학습

In [50]:
# Configure the k-means estimator: 2 clusters, a fixed seed, input vectors read
# from 'feature' and predicted labels written to 'cluster'.
kmeans = KMeans(
    featuresCol = 'feature',
    predictionCol = 'cluster',
    k = 2,
    seed = 1,
)

kmeans

KMeans_a049b7718604

In [34]:
# Train: fit the k-means estimator on the assembled user feature vectors.
model = kmeans.fit(user_features)
model

24/12/13 14:09:43 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/13 14:09:43 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

KMeansModel: uid=KMeans_98420ded757c, k=2, distanceMeasure=euclidean, numFeatures=3

In [36]:
# Predict: apply the fitted model, which adds the 'cluster' column to each user row.
clusters = model.transform(user_features)

In [37]:
# Show the cluster assigned to each user.
clusters.show()

                                                                                

+-------+---+---+---+-------------+-------+
|user_id|  0|  1|  2|      feature|cluster|
+-------+---+---+---+-------------+-------+
|      0|4.0|2.0|0.0|[4.0,2.0,0.0]|      0|
|      1|0.0|3.0|1.0|[0.0,3.0,1.0]|      0|
|      2|5.0|0.0|4.0|[5.0,0.0,4.0]|      1|
+-------+---+---+---+-------------+-------+



In [48]:
# Configure a k-means estimator with 3 clusters (fixed seed; vectors read from
# 'feature', labels written to 'cluster').
kmeans = KMeans(
    featuresCol = 'feature',
    predictionCol = 'cluster',
    k = 3,
    seed = 1,
)

# Train on the user feature vectors, then label every user with its cluster.
model = kmeans.fit(user_features)
clusters = model.transform(user_features)

# Per-user clustering result.
clusters.show()

                                                                                

+-------+---+---+---+-------------+-------+
|user_id|  0|  1|  2|      feature|cluster|
+-------+---+---+---+-------------+-------+
|      0|4.0|2.0|0.0|[4.0,2.0,0.0]|      1|
|      1|0.0|3.0|1.0|[0.0,3.0,1.0]|      2|
|      2|5.0|0.0|4.0|[5.0,0.0,4.0]|      0|
+-------+---+---+---+-------------+-------+



### 사용자 그룹화
유사한 취향의 사용자끼리 그룹으로 묶어주는 것

### 아이템 그룹화
아이템 간의 군집화를 통해 사용자에게 추천을 해주는 것

In [51]:
# Stop the SparkSession now that the notebook is finished.
ss.stop()