In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import col,avg,dense_rank
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.ml.linalg import SparseVector
from pyspark.sql.types import DoubleType
spark = SparkSession.builder.appName("MovieTags").getOrCreate()
movie = spark.read.csv("file:///home/kjh/data/ml-latest/movies.csv", header=True, inferSchema=True)
tag=spark.read.csv("file:///home/kjh/data/ml-latest/tags.csv", header=True, inferSchema=True)
rating=spark.read.csv("file:///home/kjh/data/ml-latest/ratings.csv", header=True, inferSchema=True)
join_data = movie.join(tag,on="movieId",how='Inner')
total_data = join_data.join(rating,on=['movieId','userId'],how="Inner")
total_data.show()
print(total_data.count())
mean_by_user = total_data.groupBy(['userId', 'tag']).agg(avg('rating').alias('mean_rating'))
windowSpec = Window.partitionBy('userId').orderBy(mean_by_user['mean_rating'].desc())
mean_by_user_ranked = mean_by_user.withColumn('rank', dense_rank().over(windowSpec))
mean_by_user_ranked.show()
top_tags_by_user = mean_by_user_ranked.filter(col('rank') == 1).select('userId', 'tag', 'mean_rating')
top_tags_by_user.show()
# 최고 순위 확인
max_rank = mean_by_user_ranked.groupBy().max('rank').collect()[0][0]
# 최고 순위인 데이터 가져오기
top_ranked_users = mean_by_user_ranked.filter(mean_by_user_ranked['rank'] == max_rank)
top_ranked_users.show()
mean_by_tag = total_data.groupBy('tag').agg(avg('rating').alias('mean_rating'))
windowSpec = Window.orderBy(mean_by_tag['mean_rating'].desc())
mean_by_tag_ranked = mean_by_tag.withColumn('rank', dense_rank().over(windowSpec))
mean_by_tag_ranked.show()
# mean_by_tag와 그룹화된 rank의 최대값을 얻습니다.
tag_max_rank = mean_by_tag_ranked.groupBy().max('rank').collect()[0][0]
# 최하위 순위인 데이터를 얻습니다.
bottom_ranked_tag = mean_by_tag_ranked.filter(mean_by_tag_ranked['rank'] == tag_max_rank)
bottom_ranked_tag.show()
# 'tag' 컬럼을 수치형으로 변환
indexer = StringIndexer(inputCol='tag', outputCol='tag_index')
indexed = indexer.fit(total_data).transform(total_data)
# OneHotEncoder를 사용하여 수치형 컬럼을 원-핫 인코딩
encoder = OneHotEncoder(inputCol='tag_index', outputCol='tag_encoded')
encoded = encoder.fit(indexed).transform(indexed)
# 첫 번째 값을 추출하는 UDF (User Defined Function) 생성
first_value_udf = udf(lambda x: float(x[0]), DoubleType())
# 'tag_encoded' 컬럼의 첫 번째 값을 'tag_encoded_value' 컬럼에 추가
encoded = encoded.withColumn('tag_encoded_value', first_value_udf('tag_encoded'))
# 코릴레이션 계산을 위한 준비
correlation = encoded.withColumn('rating_double', F.col('rating').cast('double'))
# 코릴레이션 계산
result = correlation.select(F.corr('tag_encoded_value', 'rating_double')).collect()[0][0]
print(result)
spark.stop()

                                                                                

+-------+------+----------------+--------------------+------------------+----------+------+----------+
|movieId|userId|           title|              genres|               tag| timestamp|rating| timestamp|
+-------+------+----------------+--------------------+------------------+----------+------+----------+
|      1|  2483|Toy Story (1995)|Adventure|Animati...|     lots of heart|1229405305|   4.0|1229405222|
|      1|  9262|Toy Story (1995)|Adventure|Animati...|     Os dois viram|1500229250|   4.5|1527796540|
|      1| 16853|Toy Story (1995)|Adventure|Animati...|          humorous|1445294945|   3.5|1445294927|
|      1| 16853|Toy Story (1995)|Adventure|Animati...|             pixar|1445294920|   3.5|1445294927|
|      1| 31107|Toy Story (1995)|Adventure|Animati...|         animation|1430792017|   3.0|1195850074|
|      1| 31107|Toy Story (1995)|Adventure|Animati...|            family|1430792017|   3.0|1195850074|
|      1| 31107|Toy Story (1995)|Adventure|Animati...|            sci-fi|

                                                                                

1729292


                                                                                

+------+--------------------+-----------+----+
|userId|                 tag|mean_rating|rank|
+------+--------------------+-----------+----+
|    78|         influential|        5.0|   1|
|    78|          space epic|        5.0|   1|
|   137|          depressing|        5.0|   1|
|   137|               Paris|        5.0|   1|
|   137|          Jared Leto|        5.0|   1|
|   137|surprisingly touc...|        5.0|   1|
|   137|         rotoscoping|        5.0|   1|
|   137|         good sequel|        5.0|   1|
|   137|          slow paced|        5.0|   1|
|   137|     dialogue driven|        5.0|   1|
|   137|           ambiguous|        5.0|   1|
|   137|         bittersweet|        5.0|   1|
|   137|      great dialogue|        5.0|   1|
|   137|  Scarlett Johansson|        5.0|   1|
|   137|  amazing soundtrack|        5.0|   1|
|   137|            break-up|        5.0|   1|
|   137|        true to life|        5.0|   1|
|   137|  great performances|        5.0|   1|
|   137|artif

                                                                                

+------+--------------------+-----------+
|userId|                 tag|mean_rating|
+------+--------------------+-----------+
|    78|         influential|        5.0|
|    78|          space epic|        5.0|
|   137|          depressing|        5.0|
|   137|               Paris|        5.0|
|   137|          Jared Leto|        5.0|
|   137|surprisingly touc...|        5.0|
|   137|         rotoscoping|        5.0|
|   137|         good sequel|        5.0|
|   137|          slow paced|        5.0|
|   137|     dialogue driven|        5.0|
|   137|           ambiguous|        5.0|
|   137|         bittersweet|        5.0|
|   137|      great dialogue|        5.0|
|   137|  Scarlett Johansson|        5.0|
|   137|  amazing soundtrack|        5.0|
|   137|            break-up|        5.0|
|   137|        true to life|        5.0|
|   137|  great performances|        5.0|
|   137|artificial intell...|        5.0|
|   137|               music|        5.0|
+------+--------------------+-----

2023-12-09 16:48:08,798 ERROR scheduler.TaskSchedulerImpl: Lost executor 0 on 172.24.249.89: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2023-12-09 16:48:08,814 WARN scheduler.TaskSetManager: Lost task 5.0 in stage 57.0 (TID 227) (172.24.249.89 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2023-12-09 16:48:08,815 WARN scheduler.TaskSetManager: Lost task 4.0 in stage 57.0 (TID 226) (172.24.249.89 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2023-12-09 16:48:08,816 WARN scheduler.TaskSetManager: Lost task 7.0 in stage 57.0 (TID 229

+------+--------------------+-----------+----+
|userId|                 tag|mean_rating|rank|
+------+--------------------+-----------+----+
|215490|   ducati motorcycle|        0.5|1696|
|215490|green strapless d...|        0.5|1696|
|215490|emotionally vulne...|        0.5|1696|
|215490| female supervillain|        0.5|1696|
|215490|         diving gear|        0.5|1696|
|215490|  good becoming evil|        0.5|1696|
|215490|       kamino planet|        0.5|1696|
|215490|              m3 lee|        0.5|1696|
|215490|             bad cgi|        0.5|1696|
|215490| kit fisto character|        0.5|1696|
|215490|  shaak ti character|        0.5|1696|
|215490| religious education|        0.5|1696|
|215490|          rain fight|        0.5|1696|
|215490|                   b|        0.5|1696|
|215490|hell's kitchen ma...|        0.5|1696|
|215490|    cheating fiancée|        0.5|1696|
|215490|    five dollar bill|        0.5|1696|
|215490|      capriciousness|        0.5|1696|
|215490|     

2023-12-09 16:49:09,411 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:49:09,419 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:49:21,080 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:49:22,234 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:49:22,758 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:49:33,368 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

+--------------------+-----------+----+
|                 tag|mean_rating|rank|
+--------------------+-----------+----+
| adaptive camouflage|        5.0|   1|
|        Found family|        5.0|   1|
|    marguerite duras|        5.0|   1|
|        Dug Too Deep|        5.0|   1|
|Writer: George Ab...|        5.0|   1|
|chinese railroad ...|        5.0|   1|
|            Kindness|        5.0|   1|
|           colombian|        5.0|   1|
|            ministry|        5.0|   1|
|     children's show|        5.0|   1|
|         breast exam|        5.0|   1|
|            interwar|        5.0|   1|
|          Overlooked|        5.0|   1|
|  Awesome soundtrack|        5.0|   1|
| BEST FILM EVER MADE|        5.0|   1|
|   amazing sountrack|        5.0|   1|
|Writer: Jay Press...|        5.0|   1|
|         Nuclear war|        5.0|   1|
|     Multiple twists|        5.0|   1|
|         punch drunk|        5.0|   1|
+--------------------+-----------+----+
only showing top 20 rows



2023-12-09 16:49:47,054 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:49:48,409 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:49:49,365 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:50:01,640 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:50:01,887 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
2023-12-09 16:50:02,301 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to 

+--------------------+-----------+----+
|                 tag|mean_rating|rank|
+--------------------+-----------+----+
|the first one was...|        0.5|5515|
|machine a metapho...|        0.5|5515|
|  Writer: Ben Zazove|        0.5|5515|
|Writer: Shonali Bose|        0.5|5515|
|         diving gear|        0.5|5515|
|           globalist|        0.5|5515|
|            bittunes|        0.5|5515|
|            the west|        0.5|5515|
|  driving instructor|        0.5|5515|
|Take a great stor...|        0.5|5515|
| Larry the Cable Guy|        0.5|5515|
|               fffff|        0.5|5515|
|       outlaw leader|        0.5|5515|
|Joel Schumacher c...|        0.5|5515|
|            no theme|        0.5|5515|
|count dooku chara...|        0.5|5515|
|    The Matrix-corny|        0.5|5515|
|   unbelievable crap|        0.5|5515|
|This movie will i...|        0.5|5515|
|         californian|        0.5|5515|
+--------------------+-----------+----+
only showing top 20 rows



2023-12-09 16:51:14,910 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 4.7 MiB
2023-12-09 16:51:27,778 ERROR scheduler.TaskSchedulerImpl: Lost executor 1 on 172.24.249.89: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2023-12-09 16:51:27,779 WARN scheduler.TaskSetManager: Lost task 6.0 in stage 154.0 (TID 577) (172.24.249.89 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
2023-12-09 16:51:27,779 WARN scheduler.TaskSetManager: Lost task 5.0 in stage 154.0 (TID 576) (172.24.249.89 executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WA

Py4JJavaError: An error occurred while calling o231.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: ShuffleMapStage 154 (collect at /tmp/ipykernel_73608/1828192008.py:52) has failed the maximum allowable number of times: 4. Most recent failure reason:
org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 38 partition 80
	at org.apache.spark.MapOutputTracker$.validateStatus(MapOutputTracker.scala:1623)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10(MapOutputTracker.scala:1570)
	at org.apache.spark.MapOutputTracker$.$anonfun$convertMapStatuses$10$adapted(MapOutputTracker.scala:1569)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.MapOutputTracker$.convertMapStatuses(MapOutputTracker.scala:1569)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorIdImpl(MapOutputTracker.scala:1234)
	at org.apache.spark.MapOutputTrackerWorker.getMapSizesByExecutorId(MapOutputTracker.scala:1196)
	at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:140)
	at org.apache.spark.shuffle.ShuffleManager.getReader(ShuffleManager.scala:63)
	at org.apache.spark.shuffle.ShuffleManager.getReader$(ShuffleManager.scala:57)
	at org.apache.spark.shuffle.sort.SortShuffleManager.getReader(SortShuffleManager.scala:73)
	at org.apache.spark.sql.execution.ShuffledRowRDD.compute(ShuffledRowRDD.scala:208)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2450)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2399)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2398)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2398)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1839)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2635)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2580)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2569)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
