In [None]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the session for this notebook under the given app name.
spark = (
    SparkSession.builder
    .appName("Titanic Classification via ML Pipeline and Model Selection")
    .getOrCreate()
)

In [None]:
# Load the Titanic CSV: the first row is the header and column types are inferred.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/titanic.csv")
)
data.printSchema()
data.show()

In [None]:
# Summary statistics for every column of the raw DataFrame.
data.describe().show()

**데이터 클린업**

*   PassengerID, Name, Ticket, Embarked는 사용하지 않을 예정 (아무 의미가 없음).
*   Cabin도 비어있는 값이 너무 많아서 사용하지 않을 예정
*   Age는 중요한 정보인데 비어있는 레코드들이 많아서 디폴트값을 채워줄 예정
*   모든 필드를 MinMaxScaler로 스케일
*   Gender의 경우 카테고리 정보이기에 숫자로 인코딩 필요

In [None]:
# Keep the label (Survived) plus the six columns that feed the model.
final_data = data.select(
    'Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare'
)
final_data.show()

Age는 평균값으로 채운다

In [None]:
from pyspark.ml.feature import Imputer

# Fill missing Age values with the column mean; the result goes to a new
# AgeImputed column and Age itself is kept.
imputer = Imputer(
    inputCols=['Age'],
    outputCols=['AgeImputed'],
    strategy='mean',
)
imputer_model = imputer.fit(final_data)
final_data = imputer_model.transform(final_data)

# Show the raw and imputed columns side by side.
final_data.select(["Age", "AgeImputed"]).show()

성별 정보 인코딩: StringIndexer는 기본적으로 빈도가 높은 값부터 0, 1, ... 순서로 인덱스를 부여한다. 이 데이터에서는 male -> 0, female -> 1

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the Gender strings as numeric category indices in GenderIndexed.
gender_indexer = StringIndexer(
    inputCol='Gender',
    outputCol='GenderIndexed',
)
gender_indexer_model = gender_indexer.fit(final_data)
final_data = gender_indexer_model.transform(final_data)

# Show the original label next to its index.
final_data.select(["Gender", "GenderIndexed"]).show()

## 피쳐 벡터 만들기

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the six numeric inputs into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Pclass', 'SibSp', 'Parch', 'Fare',
        'AgeImputed', 'GenderIndexed',
    ],
    outputCol='features',
)
data_vec = assembler.transform(final_data)

data_vec.show()

Age와 Fare처럼 값의 범위가 큰 필드를 스케일하는 것이 주요 목표 (MinMaxScaler는 features 벡터 전체에 적용된다)

In [None]:
from pyspark.ml.feature import MinMaxScaler

# Despite the variable name, the scaler takes the whole 'features' vector as
# input, not just the Age element.
age_scaler = MinMaxScaler(
    inputCol="features",
    outputCol="features_scaled",
)
age_scaler_model = age_scaler.fit(data_vec)
data_vec = age_scaler_model.transform(data_vec)

# Show the vectors before and after scaling.
data_vec.select(["features", "features_scaled"]).show()

훈련용과 테스트용 데이터를 나누고 binary classification 모델을 하나 만든다


In [None]:
from pyspark.ml.classification import LogisticRegression

# A fixed seed keeps this 70/30 split, and the metrics computed from it,
# reproducible across kernel restarts.
# NOTE: the imputer, indexer and scaler above were fitted on the full dataset
# before this split, so the held-out rows already influenced preprocessing.
train, test = data_vec.randomSplit([0.7, 0.3], seed=42)

# Binary classifier on the scaled feature vector, predicting the Survived label.
algo = LogisticRegression(featuresCol="features_scaled", labelCol="Survived")
model = algo.fit(train)

### 모델 성능 측정

In [None]:
# Score the held-out split, then count rows per actual label.
predictions = model.transform(test)
predictions.groupBy('Survived').count().collect()

In [None]:
# Count rows per predicted label, for comparison with the actual-label counts.
predictions.groupBy('prediction').count().collect()

In [None]:
# Actual label, predicted label and class probabilities side by side.
predictions.select('Survived', 'prediction', 'probability').show()

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve, measured against the Survived label.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol='Survived',
)
evaluator.evaluate(predictions)

In [None]:
import matplotlib.pyplot as plt

# model.summary describes the fit on the training split, so this is the
# training ROC curve, not the curve for `test`.
# Collect both columns in a single Spark job and unpack the rows into flat
# lists of floats before handing them to matplotlib.
roc_points = model.summary.roc.select('FPR', 'TPR').collect()
fpr = [point['FPR'] for point in roc_points]
tpr = [point['TPR'] for point in roc_points]

plt.figure(figsize=(5, 5))
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve (training data)')
plt.show()

## ML Pipeline 만들기

In [None]:
from pyspark.ml.feature import Imputer, StringIndexer, VectorAssembler, MinMaxScaler

# Stage 1 - Gender: map the category strings to numeric indices.
stringIndexer = StringIndexer(inputCol="Gender", outputCol="GenderIndexed")

# Stage 2 - Age: fill missing values with the column mean.
imputer = Imputer(
    inputCols=['Age'],
    outputCols=['AgeImputed'],
    strategy='mean',
)

# Stage 3 - assemble the model inputs into one vector column.
inputCols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'AgeImputed', 'GenderIndexed']
assembler = VectorAssembler(outputCol='features', inputCols=inputCols)

# Stage 4 - min-max scale the assembled vector.
minmax_scaler = MinMaxScaler(outputCol='features_scaled', inputCol='features')

# NOTE: stringIndexer, inputCols, assembler, minmax_scaler and stages are all
# rebound by a later cell that uses GenderMeanImputer in place of this Imputer.
stages = [stringIndexer, imputer, assembler, minmax_scaler]

In [None]:
from pyspark.ml import Transformer
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.functions import avg, col, when, isnan
from pyspark.sql import DataFrame

class GenderMeanImputer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
    """Fill missing values of ``target_col`` with the per-group mean of that column.

    Rows are grouped by ``gender_col``. Where ``target_col`` is NULL or NaN the
    group's mean is written to ``output_col``; otherwise the original value is
    copied. ``target_col`` itself is left unchanged.

    The group means are computed from the DataFrame passed to ``transform`` at
    call time rather than learned from a separate training set, so every
    dataset is imputed with its own statistics.

    NOTE(review): the three column names are stored as plain attributes rather
    than ``Param`` objects; presumably they are not written out by
    ``DefaultParamsWritable`` when the stage is saved -- confirm before relying
    on save/load.

    Args:
        gender_col: Name of the grouping column.
        target_col: Name of the numeric column whose missing values are filled.
        output_col: Name of the column that receives the imputed values.
    """

    def __init__(self, gender_col="Gender", target_col="Age", output_col="AgeImputed"):
        super(GenderMeanImputer, self).__init__()
        self.gender_col = gender_col
        self.target_col = target_col
        self.output_col = output_col

    def _transform(self, df: DataFrame) -> DataFrame:
        """Return ``df`` with ``output_col`` added; missing targets get the group mean."""
        target = col(self.target_col)
        is_missing = target.isNull() | isnan(target)

        # Private helper column name, chosen so it cannot clash with an input
        # column that happens to be called "AvgAge".
        mean_col = "__gender_mean_imputer_avg__"

        # Average only the usable values: avg() skips NULLs but not NaN, so a
        # single NaN would otherwise turn the whole group's mean into NaN.
        group_means = df.groupBy(self.gender_col).agg(
            avg(when(~is_missing, target)).alias(mean_col)
        )

        # Left join so every input row is kept, even when no mean matches it.
        joined = df.join(group_means, on=self.gender_col, how="left")

        # Use the group mean only where the original value is missing.
        imputed = joined.withColumn(
            self.output_col,
            when(is_missing, col(mean_col)).otherwise(target),
        )

        # The helper column is an implementation detail; remove it again.
        return imputed.drop(mean_col)


In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler

# Stage 1 - Gender: map the category strings to numeric indices.
stringIndexer = StringIndexer(outputCol="GenderIndexed", inputCol="Gender")

# Stage 2 - Age: custom transformer that fills missing ages with the mean age
# of the row's Gender group.
gender_mean_imputer = GenderMeanImputer(
    gender_col="Gender",
    target_col="Age",
    output_col="AgeImputed",
)

# Stage 3 - assemble the model inputs into one vector column.
inputCols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'AgeImputed', 'GenderIndexed']
assembler = VectorAssembler(outputCol='features', inputCols=inputCols)

# Stage 4 - min-max scale the assembled vector.
minmax_scaler = MinMaxScaler(outputCol='features_scaled', inputCol='features')

# Preprocessing stages in execution order; a classifier is appended later.
stages = [stringIndexer, gender_mean_imputer, assembler, minmax_scaler]

In [None]:
from pyspark.ml.classification import LogisticRegression

# Append the classifier as the last stage; `stages` itself is left untouched
# so it can be reused with another classifier.
algo = LogisticRegression(labelCol='Survived', featuresCol='features_scaled')
lr_stages = [*stages, algo]

In [None]:
# Display the stage list: the preprocessing stages followed by the classifier.
lr_stages

In [None]:
from pyspark.ml import Pipeline

# Bundle preprocessing and the logistic-regression stage into one estimator.
pipeline = Pipeline(stages=lr_stages)

In [None]:
# Start again from the raw columns; the pipeline does all preprocessing itself.
df = data.select(
    'Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare'
)
df.show()

In [None]:
# A fixed seed keeps this 70/30 split the same on every run, so metrics
# computed from it are comparable between runs.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Fit the pipeline on the training split, then score the held-out split.
# `evaluator` is the AUC evaluator created in an earlier cell.
lr_model = pipeline.fit(train)
lr_cv_predictions = lr_model.transform(test)  # no cross-validation has run yet at this point
evaluator.evaluate(dataset=lr_cv_predictions)

### ML Tuning

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# AUC evaluator used to score each parameter combination during tuning.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol='Survived',
)

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Candidate iteration limits for the logistic-regression stage.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(algo.maxIter, [1, 5, 10])
paramGrid = grid_builder.build()

# 5-fold cross-validation over the whole pipeline, scored by `evaluator`.
cv = CrossValidator(
    numFolds=5,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)

In [None]:
# Cross-validate on the training split, then score the held-out split with
# the resulting model.
cvModel = cv.fit(train)
lr_cv_predictions = cvModel.transform(test)
evaluator.evaluate(dataset=lr_cv_predictions)

In [None]:
# Use the column's real name, 'Survived': the lowercase spelling only resolves
# while Spark's column-name matching is case-insensitive.
lr_cv_predictions.select("prediction", "Survived").show()

In [None]:
import pandas as pd

# One record per parameter combination: the average cross-validation metric
# plus the parameter values, keyed by parameter name.
metric_name = cvModel.getEvaluator().getMetricName()  # same for every row, so look it up once
params = [{p.name: v for p, v in m.items()} for m in cvModel.getEstimatorParamMaps()]

# The records are a list of dicts, which the DataFrame constructor accepts
# directly; from_dict is intended for dict input.
pd.DataFrame(
    [{metric_name: metric, **ps} for ps, metric in zip(params, cvModel.avgMetrics)]
)

### GBT Classifier

In [None]:
from pyspark.ml.classification import GBTClassifier

# Same preprocessing stages as before, with a gradient-boosted tree classifier
# appended as the last stage.
gbt = GBTClassifier(labelCol='Survived', featuresCol='features_scaled')
gbt_stages = [*stages, gbt]

In [None]:
from pyspark.ml import Pipeline

# Rebinds `pipeline`: from here on it holds the GBT stages, not the LR ones.
pipeline = Pipeline(stages=gbt_stages)

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# 3 depths x 2 bin counts x 2 iteration counts = 12 combinations to evaluate.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(gbt.maxDepth, [2, 4, 6])
grid_builder.addGrid(gbt.maxBins, [20, 60])
grid_builder.addGrid(gbt.maxIter, [10, 20])
paramGrid = grid_builder.build()

# 5-fold cross-validation over the GBT pipeline, scored by `evaluator`.
cv = CrossValidator(
    numFolds=5,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)


In [None]:
# Cross-validate the GBT pipeline on the training split and score the held-out
# split. `cvModel` and `lr_cv_predictions` are reused from the logistic-
# regression section and now hold the GBT results.
cvModel = cv.fit(train)
lr_cv_predictions = cvModel.transform(test)
evaluator.evaluate(dataset=lr_cv_predictions)