# 1. 라이브러리 불러오기

In [3]:
from pyspark.ml.feature import (
    StringIndexer,        # 범주형 → 수치형 인코딩
    OneHotEncoder,        # 원-핫 인코딩
    VectorAssembler,      # 여러 컬럼 → 하나의 feature 벡터
    StandardScaler,       # 표준 정규화
    MinMaxScaler,         # 최소/최대 스케일링
    Bucketizer,           # 연속형 변수 → 구간화
    QuantileDiscretizer,  # 분위수 기반 구간화
    PCA,                  # 주성분 분석
    PolynomialExpansion,  # 다항 특성 생성
    ChiSqSelector         # 카이제곱 기반 피처 선택
)
from pyspark.ml.classification import (
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier,
    NaiveBayes,
    MultilayerPerceptronClassifier
)

from pyspark.ml.regression import (
    LinearRegression,
    DecisionTreeRegressor,
    RandomForestRegressor,
    GBTRegressor
)
from pyspark.ml.clustering import (
    KMeans,
    GaussianMixture,
    BisectingKMeans,
    LDA  # Latent Dirichlet Allocation (토픽 모델링)
)
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
    RegressionEvaluator,
    ClusteringEvaluator
)
from pyspark.ml import Pipeline  # 전체 파이프라인 구성

from pyspark.ml.tuning import (   # 모델 튜닝
    ParamGridBuilder,
    CrossValidator,
    TrainValidationSplit
)
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector  # 벡터 수동 생성
from pyspark.ml.stat import Correlation, ChiSquareTest            # 통계 테스트
from pyspark.sql import SparkSession
import os

# SparkSession

In [4]:
spark = (
    SparkSession.builder
    .appName('minecraft')
    .getOrCreate()
)  # getOrCreate() returns the active session when one already exists
spark  # last expression of the cell, so the notebook renders the session summary

# 2. 데이터 불러오기

In [14]:
csv_info = [
    # (CSV file name, name of the global variable that receives its DataFrame)
    ("Mobs.csv", "mob_data"),
    ("Biomes.csv", "bio_data"),
    ("FaunaGeography.csv", "fau_data"),
    ("Dimensions.csv", "dim_data")
]
cwd = os.getcwd()

for fname, var_name in csv_info:
    # The CSVs are read from ./learning_spark_data relative to the working directory.
    path = os.path.join(cwd, "learning_spark_data", fname)
    # Build a local-filesystem URI, swapping the OS separator for forward slashes.
    file_uri = "file:///" + path.replace(os.sep, "/")
    df = spark.read.csv(file_uri, header=True, inferSchema=True)

    # Create the variable dynamically: bind the DataFrame to the module-level
    # name listed in csv_info (mob_data, bio_data, fau_data, dim_data).
    globals()[var_name] = df


## 데이터 확인

In [15]:
mob_data.show(n=5)  # print the first 5 rows of the mobs table

+---+-----------+-------------+---------------+------------+---------+-------------------+----------------+-----------------------+
| ID|       name|behaviorTypes|  spawnBehavior|healthPoints|maxDamage|          debutDate|minecraftVersion|reproductiveRequirement|
+---+-----------+-------------+---------------+------------+---------+-------------------+----------------+-----------------------+
|  1|sniffer_egg|     Peaceful|Player Summoned|           0|        0|2023-06-07 00:00:00|          1.20.0|                   NULL|
|  2| turtle_egg|     Peaceful|Player Summoned|           0|        0|2018-07-18 00:00:00|          1.13.0|                   NULL|
|  3|        bat|     Peaceful|         Random|           6|     NULL|2012-10-25 00:00:00|           1.4.2|                   NULL|
|  4|      blaze|      Hostile|        On Load|          20|        6|2011-11-18 00:00:00|           1.0.0|                   NULL|
|  5|    chicken|     Peaceful|         Random|           4|     NULL|2011-1

In [16]:
bio_data.show(n=3)  # print the first 3 rows of the biomes table

+---+-------------+-----------+------------+------------------+----------------+-------------+
| ID|         name|dimensionID|treesOrGrass|generateStructures|minecraftVersion|precipitation|
+---+-------------+-----------+------------+------------------+----------------+-------------+
|  1|     badlands|          1|           1|                 1|           1.7.2|         NULL|
|  2|bamboo_jungle|          1|           1|                 1|          1.14.0|         Rain|
|  3|basalt_deltas|          2|           0|                 1|          1.16.0|         NULL|
+---+-------------+-----------+------------+------------------+----------------+-------------+
only showing top 3 rows



In [17]:
fau_data.show(n=3)  # print the first 3 rows of the mob-to-biome mapping table

+-----+-------+
|mobID|biomeID|
+-----+-------+
|    1|     58|
|    2|      4|
|    4|      8|
+-----+-------+
only showing top 3 rows



In [18]:
dim_data.show(n=3)  # print the first 3 rows of the dimensions table

+---+---------+-------------------+----------------+
| ID|     name|          debutDate|minecraftVersion|
+---+---------+-------------------+----------------+
|  1|Overworld|2011-11-18 00:00:00|           1.0.0|
|  2|   Nether|2011-11-18 00:00:00|           1.0.0|
|  3|      End|2011-11-18 00:00:00|           1.0.0|
+---+---------+-------------------+----------------+



# 데이터 병합

In [None]:
# Merge the four tables into one wide DataFrame, starting from the
# mob-to-biome mapping (fau_data) and left-joining the descriptive tables.
#
# Fixes relative to the earlier draft of this cell:
#   * uses the DataFrame names actually created by the loader
#     (fau_data / mob_data / bio_data / dim_data);
#   * uses the real column names from the CSV headers (mobID, biomeID, ID);
#   * joins dimensions through bio_data.dimensionID, because the fauna table
#     only carries mobID and biomeID and has no dimension column of its own.
#
# NOTE(review): `name` and `minecraftVersion` (and `debutDate` for mobs and
# dimensions) exist in several inputs, so final_df still contains duplicated
# column names. Reference them through the source DataFrame
# (e.g. mob_data["name"]) or rename them before selecting by plain string.
final_df = (
    fau_data
    .join(mob_data, fau_data["mobID"] == mob_data["ID"], "left")
    .join(bio_data, fau_data["biomeID"] == bio_data["ID"], "left")
    .join(dim_data, bio_data["dimensionID"] == dim_data["ID"], "left")
    # The surrogate keys are redundant with mobID / biomeID / dimensionID.
    .drop(mob_data["ID"])
    .drop(bio_data["ID"])
    .drop(dim_data["ID"])
)


# 피처 엔지니어링

# 결측치 처리

# 데이터 스플릿

# 파이프라인 생성

## 출현도 인코딩

## 벡터

# 모델 학습 및 평가

# 파라미터 최적화