In [1]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import col,avg,dense_rank
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.ml.linalg import SparseVector
from pyspark.sql.types import DoubleType
spark = SparkSession.builder.appName("MovieTags").getOrCreate()
movie = spark.read.csv("file:///home/kjh/data/ml-latest/movies.csv", header=True, inferSchema=True)
tag=spark.read.csv("file:///home/kjh/data/ml-latest/tags.csv", header=True, inferSchema=True)
rating=spark.read.csv("file:///home/kjh/data/ml-latest/ratings.csv", header=True, inferSchema=True)
join_data = movie.join(tag,on="movieId",how='Inner')
total_data = join_data.join(rating,on=['movieId','userId'],how="Inner")
total_data.show()
print(total_data.count())
mean_by_user = total_data.groupBy(['userId', 'tag']).agg(avg('rating').alias('mean_rating'))
windowSpec = Window.partitionBy('userId').orderBy(mean_by_user['mean_rating'].desc())
mean_by_user_ranked = mean_by_user.withColumn('rank', dense_rank().over(windowSpec))
mean_by_user_ranked.show()
top_tags_by_user = mean_by_user_ranked.filter(col('rank') == 1).select('userId', 'tag', 'mean_rating')
top_tags_by_user.show()
# 최고 순위 확인
max_rank = mean_by_user_ranked.groupBy().max('rank').collect()[0][0]
# 최고 순위인 데이터 가져오기
top_ranked_users = mean_by_user_ranked.filter(mean_by_user_ranked['rank'] == max_rank)
top_ranked_users.show()
mean_by_tag = total_data.groupBy('tag').agg(avg('rating').alias('mean_rating'))
windowSpec = Window.orderBy(mean_by_tag['mean_rating'].desc())
mean_by_tag_ranked = mean_by_tag.withColumn('rank', dense_rank().over(windowSpec))
mean_by_tag_ranked.show()
# mean_by_tag와 그룹화된 rank의 최대값을 얻습니다.
tag_max_rank = mean_by_tag_ranked.groupBy().max('rank').collect()[0][0]
# 최하위 순위인 데이터를 얻습니다.
bottom_ranked_tag = mean_by_tag_ranked.filter(mean_by_tag_ranked['rank'] == tag_max_rank)
bottom_ranked_tag.show()
# 'tag' 컬럼을 수치형으로 변환
indexer = StringIndexer(inputCol='tag', outputCol='tag_index')
indexed = indexer.fit(total_data).transform(total_data)
# OneHotEncoder를 사용하여 수치형 컬럼을 원-핫 인코딩
encoder = OneHotEncoder(inputCol='tag_index', outputCol='tag_encoded')
encoded = encoder.fit(indexed).transform(indexed)
# 첫 번째 값을 추출하는 UDF (User Defined Function) 생성
first_value_udf = udf(lambda x: float(x[0]), DoubleType())
# 'tag_encoded' 컬럼의 첫 번째 값을 'tag_encoded_value' 컬럼에 추가
encoded = encoded.withColumn('tag_encoded_value', first_value_udf('tag_encoded'))
# 코릴레이션 계산을 위한 준비
correlation = encoded.withColumn('rating_double', F.col('rating').cast('double'))
# 코릴레이션 계산
result = correlation.select(F.corr('tag_encoded_value', 'rating_double')).collect()[0][0]
print(result)
spark.stop()

131072x1 화면 크기가 잘못됐습니다. 문제가 예상됩니다
23/12/11 00:31:05 WARN Utils: Your hostname, KJH-DESKTOP resolves to a loopback address: 127.0.1.1; using 192.168.69.220 instead (on interface eth0)
23/12/11 00:31:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/11 00:31:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+-------+------+----------------+--------------------+------------------+----------+------+----------+
|movieId|userId|           title|              genres|               tag| timestamp|rating| timestamp|
+-------+------+----------------+--------------------+------------------+----------+------+----------+
|      1|  2483|Toy Story (1995)|Adventure|Animati...|     lots of heart|1229405305|   4.0|1229405222|
|      1|  9262|Toy Story (1995)|Adventure|Animati...|     Os dois viram|1500229250|   4.5|1527796540|
|      1| 16853|Toy Story (1995)|Adventure|Animati...|          humorous|1445294945|   3.5|1445294927|
|      1| 16853|Toy Story (1995)|Adventure|Animati...|             pixar|1445294920|   3.5|1445294927|
|      1| 31107|Toy Story (1995)|Adventure|Animati...|         animation|1430792017|   3.0|1195850074|
|      1| 31107|Toy Story (1995)|Adventure|Animati...|            family|1430792017|   3.0|1195850074|
|      1| 31107|Toy Story (1995)|Adventure|Animati...|            sci-fi|

                                                                                

1729292


                                                                                

+------+--------------------+-----------+----+
|userId|                 tag|mean_rating|rank|
+------+--------------------+-----------+----+
|    78|         influential|        5.0|   1|
|    78|          space epic|        5.0|   1|
|   137|           emotional|        5.0|   1|
|   137|       new york city|        5.0|   1|
|   137|      Director's Cut|        5.0|   1|
|   137|               music|        5.0|   1|
|   137|   thought-provoking|        5.0|   1|
|   137|       philosophical|        5.0|   1|
|   137|              trippy|        5.0|   1|
|   137|          reflective|        5.0|   1|
|   137|     meaning of life|        5.0|   1|
|   137|          love story|        5.0|   1|
|   137|   Michelle Williams|        5.0|   1|
|   137|        Ryan Gosling|        5.0|   1|
|   137|   thought provoking|        5.0|   1|
|   137|      theatrical cut|        5.0|   1|
|   137|character develop...|        5.0|   1|
|   137|       philip k dick|        5.0|   1|
|   137|   so

                                                                                

+------+--------------------+-----------+
|userId|                 tag|mean_rating|
+------+--------------------+-----------+
|    26|           cult film|        4.5|
|    26|               crime|        4.5|
|    26|   quentin tarantino|        4.5|
|    78|          space epic|        5.0|
|    78|         influential|        5.0|
|   137|         rotoscoping|        5.0|
|   137|          slow paced|        5.0|
|   137|     dialogue driven|        5.0|
|   137|surprisingly touc...|        5.0|
|   137|         good sequel|        5.0|
|   137|           adventure|        5.0|
|   137|               bleak|        5.0|
|   137|  Visually appealing|        5.0|
|   137|actor talks to au...|        5.0|
|   137|                time|        5.0|
|   137|  beautifully filmed|        5.0|
|   137|     Natalie Portman|        5.0|
|   137|             wistful|        5.0|
|   137|         apocalyptic|        5.0|
|   137|           ambitious|        5.0|
+------+--------------------+-----

                                                                                

+------+--------------------+-----------+----+
|userId|                 tag|mean_rating|rank|
+------+--------------------+-----------+----+
|215490|      scientist hero|        0.5|1696|
|215490|          naturalist|        0.5|1696|
|215490|        power cosmic|        0.5|1696|
|215490|   reference to aang|        0.5|1696|
|215490|  fake haunted house|        0.5|1696|
|215490| trapped beneath ice|        0.5|1696|
|215490|      building owner|        0.5|1696|
|215490|  watching tv in bed|        0.5|1696|
|215490|       floor cleaner|        0.5|1696|
|215490|reference to mr. ...|        0.5|1696|
|215490|     creepy landlord|        0.5|1696|
|215490|killed with a nai...|        0.5|1696|
|215490|reference to clar...|        0.5|1696|
|215490|reference to mich...|        0.5|1696|
|215490|      america online|        0.5|1696|
|215490|  nail gun as weapon|        0.5|1696|
|215490|shot with a nail gun|        0.5|1696|
|215490|reference to prid...|        0.5|1696|
|215490|tying

23/12/11 00:32:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 0

+--------------------+-----------+----+
|                 tag|mean_rating|rank|
+--------------------+-----------+----+
|hate love relatio...|        5.0|   1|
|             fuck it|        5.0|   1|
|  adios turd nuggets|        5.0|   1|
|         clubedaluta|        5.0|   1|
|tradition and cus...|        5.0|   1|
|       Thomas Gibson|        5.0|   1|
|           Mark Addy|        5.0|   1|
|        Middle child|        5.0|   1|
|       show and tell|        5.0|   1|
|mutual assured de...|        5.0|   1|
|          disbarment|        5.0|   1|
|Director: Leo McC...|        5.0|   1|
| high blood pressure|        5.0|   1|
|           Sid James|        5.0|   1|
|      fascist regime|        5.0|   1|
|          Atsuya Uki|        5.0|   1|
|Pop culture refer...|        5.0|   1|
|  literary narration|        5.0|   1|
|   funy and touching|        5.0|   1|
|     motion sickness|        5.0|   1|
+--------------------+-----------+----+
only showing top 20 rows



23/12/11 00:32:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 00:32:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/11 0

+--------------------+-----------+----+
|                 tag|mean_rating|rank|
+--------------------+-----------+----+
|         tree hugger|        0.5|5515|
|            web-star|        0.5|5515|
|     wheelchair jump|        0.5|5515|
|         overhearing|        0.5|5515|
|        Killing baby|        0.5|5515|
|         incoherence|        0.5|5515|
|      hell's kitchen|        0.5|5515|
| ben urich character|        0.5|5515|
|animal wearing cl...|        0.5|5515|
|           dribbling|        0.5|5515|
|quentin tarantino...|        0.5|5515|
|              siesta|        0.5|5515|
|       petting a dog|        0.5|5515|
|   Justin timberlake|        0.5|5515|
|elektra natchios ...|        0.5|5515|
|      special prison|        0.5|5515|
|cut in half by a ...|        0.5|5515|
|gruesome (torture...|        0.5|5515|
|why would you was...|        0.5|5515|
|     teen indulgence|        0.5|5515|
+--------------------+-----------+----+
only showing top 20 rows



23/12/11 00:33:02 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
                                                                                

0.02395415506103178
