# 1. 세션 및 라이브러리 불러오기

## Library

In [1]:
from pyspark.ml.feature import (
    StringIndexer,        # 범주형 → 수치형 인코딩
    OneHotEncoder,        # 원-핫 인코딩
    VectorAssembler,      # 여러 컬럼 → 하나의 feature 벡터
    StandardScaler,       # 표준 정규화
    MinMaxScaler,         # 최소/최대 스케일링
    Bucketizer,           # 연속형 변수 → 구간화
    QuantileDiscretizer,  # 분위수 기반 구간화
    PCA,                  # 주성분 분석
    PolynomialExpansion,  # 다항 특성 생성
    ChiSqSelector         # 카이제곱 기반 피처 선택
)
from pyspark.ml.classification import (
    LogisticRegression,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GBTClassifier,
    NaiveBayes,
    MultilayerPerceptronClassifier
)

from pyspark.ml.regression import (
    LinearRegression,
    DecisionTreeRegressor,
    RandomForestRegressor,
    GBTRegressor
)
from pyspark.ml.clustering import (
    KMeans,
    GaussianMixture,
    BisectingKMeans,
    LDA  # Latent Dirichlet Allocation (토픽 모델링)
)
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
    RegressionEvaluator,
    ClusteringEvaluator
)
from pyspark.ml import Pipeline  # 전체 파이프라인 구성

from pyspark.ml.tuning import (   # 모델 튜닝
    ParamGridBuilder,
    CrossValidator,
    TrainValidationSplit
)
from pyspark.ml.linalg import Vectors, DenseVector, SparseVector  # 벡터 수동 생성
from pyspark.ml.stat import Correlation, ChiSquareTest            # 통계 테스트
from pyspark.sql import SparkSession
import os
from pyspark.sql.functions import *

## SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
session_builder = SparkSession.builder.appName('Drivers License Data')
spark = session_builder.getOrCreate()
# Trailing bare expression: shown as the cell's output when run in a notebook.
spark

# 2. 데이터 불러오기

In [3]:
# Load the CSV: the first row supplies the column names and the column types
# are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Driver_License/learning_spark_data/Drivers License Data.csv")
)

## 데이터 확인

In [4]:
# Preview the first five rows of the loaded data.
df.show(n=5)

+------------+------+-----------+-----+--------+-------+-----+-------------+-----------+----------+-------------+------------+----------+-------+-----------+---------+---------+
|Applicant ID|Gender|  Age Group| Race|Training|Signals|Yield|Speed Control|Night Drive|Road Signs|Steer Control|Mirror Usage|Confidence|Parking|Theory Test|Reactions|Qualified|
+------------+------+-----------+-----+--------+-------+-----+-------------+-----------+----------+-------------+------------+----------+-------+-----------+---------+---------+
|     AID0001|  Male|Young Adult|Other|    None|  38.48|30.29|        37.03|      33.53|     39.61|        58.16|       53.42|     35.32|  38.19|      70.68|  Average|       No|
|     AID0002|Female|Young Adult|Black|    None|  51.76|19.13|        63.05|      34.87|     19.56|        16.48|       27.97|     22.91|  24.23|      78.18|  Average|       No|
|     AID0003|  Male| Middle Age|Black|    None|  30.21|48.13|        43.13|      42.43|     60.93|        20.

## 결측치 처리

In [5]:
# Count missing values per column and show them as a single-row table.
#
# NaN only exists for floating-point data, so isnan() is applied to the
# float/double columns only. Calling it on string columns (as a blanket
# check over df.columns would) relies on an implicit string-to-double cast,
# which can fail under ANSI SQL mode and would treat a literal "NaN" string
# as missing.
#
# NOTE: `sum`, `when`, `col` and `isnan` are the pyspark.sql.functions
# versions brought in by the star import, not the Python builtins.
nan_capable_cols = {
    name for name, dtype in df.dtypes if dtype in ("float", "double")
}

missing_counters = []
for c in df.columns:
    is_missing = col(c).isNull()
    if c in nan_capable_cols:
        is_missing = is_missing | isnan(col(c))
    # 1 for a missing cell, 0 otherwise; the sum is the per-column count.
    missing_counters.append(sum(when(is_missing, 1).otherwise(0)).alias(c))

null_counts = df.select(missing_counters)
null_counts.show()
# No missing values were found.

+------------+------+---------+----+--------+-------+-----+-------------+-----------+----------+-------------+------------+----------+-------+-----------+---------+---------+
|Applicant ID|Gender|Age Group|Race|Training|Signals|Yield|Speed Control|Night Drive|Road Signs|Steer Control|Mirror Usage|Confidence|Parking|Theory Test|Reactions|Qualified|
+------------+------+---------+----+--------+-------+-----+-------------+-----------+----------+-------------+------------+----------+-------+-----------+---------+---------+
|           0|     0|        0|   0|       0|      0|    0|            0|          0|         0|            0|           0|         0|      0|          0|        0|        0|
+------------+------+---------+----+--------+-------+-----+-------------+-----------+----------+-------------+------------+----------+-------+-----------+---------+---------+



# 3. 피쳐엔지니어링

1. 아이디 컬럼 제거
2. 결측치 존재 여부 확인 및 처리
3. 범주형 컬럼 인코딩: 성별, 나이그룹, 인종, 훈련 유형, 반응속도, 통과여부
4. 벡터 어셈블러

## 범주형 데이터 인코딩

In [6]:
label_indexer = StringIndexer(inputCol="Qualified", outputCol="Pass")
# 범주형 처리
cat_cols = ["Gender", "Age Group", "Race", "Training", "Reactions"]
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in cat_cols]
encoders = [OneHotEncoder(inputCol=col+"_index", outputCol=col+"_vec") for col in cat_cols]

## 벡터어셈블러 적용

In [7]:
# 수치형 + 범주형 묶기
# Combine the numeric score columns with the one-hot encoded categorical
# columns into a single input list for the assembler.
numeric_cols = [
    "Signals",
    "Yield",
    "Speed Control",
    "Night Drive",
    "Road Signs",
    "Steer Control",
    "Mirror Usage",
    "Confidence",
    "Parking",
    "Theory Test",
]
encoded_cols = [f"{cat_name}_vec" for cat_name in cat_cols]
features = numeric_cols + encoded_cols

assembler = VectorAssembler(inputCols=features, outputCol="features")

# NOTE(review): this scaler is constructed but none of the pipelines visible
# in this notebook include it; the classifiers read "features", not
# "scaled_features". Confirm whether scaling was meant to be applied.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)

# 4. 데이터 스플릿

In [8]:
# 80/20 train/test split with a fixed seed.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=42)

# 5. 모델 학습 및 평가

In [31]:
# Logistic regression on the assembled "features" vector with label "Pass".
lr = LogisticRegression(featuresCol="features", labelCol="Pass")

# Stage order: label indexing -> categorical indexing -> one-hot encoding
# -> vector assembly -> classifier.
pipeline = Pipeline(
    stages=[label_indexer, *indexers, *encoders, assembler, lr]
)

# Fit on the training split, then score the test split.
model = pipeline.fit(train_df)
pred = model.transform(test_df)


In [25]:
# Compare the true label with the predicted label for the first five rows.
preview_cols = pred.select('features', 'Pass', 'prediction')
preview_cols.show(n=5)

+--------------------+----+----------+
|            features|Pass|prediction|
+--------------------+----+----------+
|[30.21,48.13,43.1...| 0.0|       1.0|
|[62.63,45.75,53.0...| 0.0|       0.0|
|(19,[0,1,2,3,4,5,...| 1.0|       1.0|
|[62.92,52.13,57.0...| 0.0|       0.0|
|[20.76,63.14,27.6...| 1.0|       1.0|
+--------------------+----+----------+
only showing top 5 rows



In [11]:
# Area under the ROC curve for the test-set predictions in `pred`.
# metricName is spelled out here; "areaUnderROC" is the evaluator's default.
evaluator = BinaryClassificationEvaluator(
    labelCol="Pass",
    metricName="areaUnderROC",
)
roc_auc = evaluator.evaluate(pred)
print("ROC AUC:", roc_auc)

ROC AUC: 0.876923076923077


In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the predictions in `pred` against the "Pass" label.
# This rebinds `evaluator`, replacing the ROC AUC evaluator.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Pass",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8243


## 여러 모델 비교

In [13]:
# Candidate classifiers to compare; each one is trained through the same
# preprocessing stages and scored with the same evaluator.
models = [
    LogisticRegression(labelCol="Pass", featuresCol="features"),
    RandomForestClassifier(labelCol="Pass", featuresCol="features"),
    GBTClassifier(labelCol="Pass", featuresCol="features")
]

evaluator = BinaryClassificationEvaluator(labelCol="Pass")

# Preprocessing stages shared by every candidate; built once, outside the loop.
shared_stages = [label_indexer] + indexers + encoders + [assembler]

# The loop variable is deliberately NOT called `model`: that name already
# holds the fitted logistic-regression PipelineModel from the earlier cell
# and would otherwise be overwritten with an unfitted estimator.
for classifier in models:
    full_pipeline = Pipeline(stages=shared_stages + [classifier])
    fitted_pipeline = full_pipeline.fit(train_df)
    preds = fitted_pipeline.transform(test_df)
    auc = evaluator.evaluate(preds)
    print(f"{type(classifier).__name__} AUC: {auc:.4f}")


LogisticRegression AUC: 0.8769
RandomForestClassifier AUC: 0.9062
GBTClassifier AUC: 0.8696


In [21]:
# Random forest with 50 trees and a maximum depth of 5.
rf = RandomForestClassifier(
    labelCol="Pass", featuresCol="features", numTrees=50, maxDepth=5
)

# Rebinds `pipeline` to the random-forest variant (same preprocessing stages,
# different final estimator). The pipeline is only constructed here, not fitted.
pipeline = Pipeline(
    stages=[label_indexer, *indexers, *encoders, assembler, rf]
)


In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# `predictions` must be produced before it can be evaluated: fit the pipeline
# currently bound to `pipeline` (the random-forest pipeline built in the
# preceding cell) on the training split and score the test split. Without
# this step a clean top-to-bottom run fails with a NameError.
predictions = pipeline.fit(train_df).transform(test_df)

# Accuracy of the predicted label against the "Pass" label.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Pass",
    predictionCol="prediction",
    metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.8243


# 6. 세션 정지

In [34]:
# Stop the Spark session at the end of the notebook.
spark.stop()