In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum as spark_sum
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml import Pipeline

# 1. Spark 세션 생성
spark = SparkSession.builder.appName("MobDropValueRegression").getOrCreate()

# 2. 데이터 불러오기
mobs = spark.read.csv('learning_spark_data/Mobs.csv', header=True, inferSchema=True)
food = spark.read.csv('learning_spark_data/Food.csv', header=True, inferSchema=True)
mob_food = spark.read.csv('learning_spark_data/MobFoodDrops.csv', header=True, inferSchema=True)

In [2]:
# 필요없는 데이터 컬럼 drop
mobs = mobs.drop("behaviorTypes", "spawnBehavior", "debutDate", "minecraftVersion", "reproductiveRequirement")
food = food.drop("debutDate", "minecraftVersion")

In [3]:
# mobID → 몹 ID, foodID → 음식 ID로 연결

# mob_food + food 조인 (foodID 기준)
mob_food_value = mob_food.join(food, mob_food["foodID"] == food["ID"], how="left") \
                         .select(mob_food["mobID"], food["hunger"])

# 몹별 총 음식 드롭 가치 합산
mob_drop_value = mob_food_value.groupBy("mobID").agg(
    spark_sum("hunger").alias("totalDropValue").cast('Double')
)

mob_drop_value.show(10)

+-----+---------------------------------------------+
|mobID|CAST(sum(hunger) AS totalDropValue AS DOUBLE)|
+-----+---------------------------------------------+
|   53|                                          3.0|
|   78|                                          8.0|
|   34|                                          3.0|
|   28|                                          2.0|
|   76|                                          8.0|
|   26|                                          2.0|
|   22|                                          3.0|
|   52|                                          4.0|
|    6|                                          2.0|
|   54|                                          3.0|
+-----+---------------------------------------------+
only showing top 10 rows



In [4]:
# NULL 처리 및 캐스팅
mobs = mobs.withColumn("healthPoints", col("healthPoints").cast("double"))
mobs = mobs.withColumn("maxDamage", when(col("maxDamage").isNull(), 0).otherwise(col("maxDamage")).cast("double"))
mobs = mobs.withColumn("difficultyScore", col("healthPoints") + col("maxDamage") * 2)

food = food.withColumn("hunger", col("hunger").cast("double"))  # 드롭 가치 = hunger

In [5]:
from pyspark.sql.functions import col, when, format_number

# mobs.ID ↔ mob_drop_value.mobID 연결
mob_efficiency = mobs.join(mob_drop_value, mobs["ID"] == mob_drop_value["mobID"], how="left")

# NULL 처리: 드롭 없으면 0
mob_efficiency = mob_efficiency.withColumn(
    "totalDropValue",
    when(col("totalDropValue").isNull(), 0).otherwise(col("totalDropValue"))
)

# 가성비 계산: 드롭 가치 / (사냥 난이도 + 1)
mob_efficiency = mob_efficiency.withColumn(
    "efficiencyScore",
    col("totalDropValue") / (col("difficultyScore") + 1)
)

# totalDropValue가 0인 행 제거
mob_efficiency = mob_efficiency.filter(col("totalDropValue") > 0)

# 소수점 둘째자리로 포맷 (출력용)
mob_efficiency = mob_efficiency.withColumn(
    "efficiencyScore", format_number("efficiencyScore", 2)
)

# 출력
mob_efficiency.select(
    "name", "healthPoints", "maxDamage", "difficultyScore", "totalDropValue", "efficiencyScore"
).orderBy(col("efficiencyScore").desc()).show(30, truncate=False)

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `totalDropValue` cannot be resolved. Did you mean one of the following? [`maxDamage`, `healthPoints`, `ID`, `mobID`, `name`].;
'Project [ID#17, name#18, healthPoints#135, maxDamage#140, difficultyScore#145, mobID#81, CAST(sum(hunger) AS totalDropValue AS DOUBLE)#114, CASE WHEN isnull('totalDropValue) THEN 0 ELSE 'totalDropValue END AS totalDropValue#170]
+- Join LeftOuter, (ID#17 = mobID#81)
   :- Project [ID#17, name#18, healthPoints#135, maxDamage#140, (healthPoints#135 + (maxDamage#140 * cast(2 as double))) AS difficultyScore#145]
   :  +- Project [ID#17, name#18, healthPoints#135, cast(CASE WHEN isnull(maxDamage#22) THEN 0 ELSE maxDamage#22 END as double) AS maxDamage#140]
   :     +- Project [ID#17, name#18, cast(healthPoints#21 as double) AS healthPoints#135, maxDamage#22]
   :        +- Project [ID#17, name#18, healthPoints#21, maxDamage#22]
   :           +- Relation [ID#17,name#18,behaviorTypes#19,spawnBehavior#20,healthPoints#21,maxDamage#22,debutDate#23,minecraftVersion#24,reproductiveRequirement#25] csv
   +- Aggregate [mobID#81], [mobID#81, cast(sum(hunger#55) as double) AS CAST(sum(hunger) AS totalDropValue AS DOUBLE)#114]
      +- Project [mobID#81, hunger#55]
         +- Join LeftOuter, (foodID#82 = ID#52)
            :- Relation [mobID#81,foodID#82] csv
            +- Project [ID#52, name#53, type#54, hunger#55]
               +- Relation [ID#52,name#53,type#54,hunger#55,debutDate#56,minecraftVersion#57] csv


In [None]:
from pyspark.sql.functions import col

mob_efficiency = mob_efficiency.withColumn("efficiencyScore", col("efficiencyScore").cast("double"))


In [None]:
# 5. 피처 엔지니어링: 필요 컬럼 선택
features = ["difficultyScore", "totalDropValue"]


In [None]:
stages = []

In [None]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

num_assembler = VectorAssembler(inputCols=["difficultyScore", "totalDropValue"], outputCol= 'feature_vector')
stages += [num_assembler]

stages

In [None]:
train_df, test_df = mob_efficiency.randomSplit([0.8,0.2], seed=360)

In [None]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
fitted_transform = pipeline.fit(train_df)
vtrain_df = fitted_transform.transform(train_df)
vtrain_df.printSchema()

In [None]:
vtrain_df.select('feature_vector', 'efficiencyScore').show(2)

In [None]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=50, solver='normal', 
                 labelCol='efficiencyScore', featuresCol='feature_vector')

In [None]:
model = lr.fit(vtrain_df)

In [None]:
#테스트데이터도 변환
vtest_df = fitted_transform.transform(test_df)
#테스트데이터로 예측
pred = model.transform(vtest_df)

In [None]:
pred.select('efficiencyScore', 'prediction').show()

In [None]:
model.summary.r2, model.summary.rootMeanSquaredError

In [None]:
spark.stop()