In [None]:
# credits: https://towardsdatascience.com/pyspark-and-xgboost-integration-tested-on-the-kaggle-titanic-dataset-4e75a568bdb

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.types import *
from pyspark.sql.functions import col
import numpy as np
from pyspark.sql.functions import col
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.types import ArrayType
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import os
import shutil
from evaluate import *
from random_forest import *
from utils import init_spark
from preprocess import get_positive_samples, \
                       get_negative_samples, \
                       get_dataset_df
# Launcher arguments for PySpark: put the two XGBoost4J jars on the `--jars`
# list and start the `pyspark-shell`.
# NOTE(review): presumably this has to be set before the Spark session is
# created for the jars to be picked up — confirm against init_spark().
xgboost_submit_args = '--jars data/xgboost4j-spark-0.72.jar,data/xgboost4j-0.72.jar pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = xgboost_submit_args

In [None]:
# Obtain the Spark entry point from the project helper in `utils`.
# NOTE(review): presumably a SparkSession (it exposes `.sparkContext`) — confirm.
spark = init_spark()
# Register the zipped sparkxgb Python wrapper with the Spark context.
spark.sparkContext.addPyFile("data/sparkxgb.zip")
# NOTE(review): this import presumably only resolves once the zip above has been
# added, which is why it lives here rather than in the import cell — keep the order.
from sparkxgb import XGBoostEstimator

In [None]:
# Load the dataset.
# Each class is down-sampled to `sample_ratio` of its rows. The sampling is now
# seeded: previously only randomSplit had a seed, so the rows feeding the split
# changed on every run and the fixed split seed bought no reproducibility.
sample_ratio = 0.1
random_seed = 0
neg_samples = (get_negative_samples(spark)
               .sample(fraction=sample_ratio, seed=random_seed)
               .na.fill(0))
pos_samples = (get_positive_samples(spark)
               .sample(fraction=sample_ratio, seed=random_seed)
               .na.fill(0))
# Nulls are filled with 0 once more after the two classes are combined.
df = get_dataset_df(spark, pos_samples, neg_samples).na.fill(0)
# 80/20 train/test split, seeded with the same value as the sampling.
trainDF, testDF = df.randomSplit([0.8, 0.2], seed=random_seed)

In [None]:
# XGBoost estimator reading the "features" / "label" columns and writing its
# output to "prediction".
xgboost = XGBoostEstimator(featuresCol="features",
                           labelCol="label",
                           predictionCol="prediction")
# The pipeline consists of the XGBoost stage only.
pipeline = Pipeline(stages=[xgboost])

In [None]:
# Fit the pipeline on the training split; the resulting `model` is what the
# later cells use for saving and for scoring the test split.
model = pipeline.fit(trainDF)

In [None]:
# Move the working directory up one level. Written as plain Python instead of
# the bare `cd ..` automagic so the cell does not depend on IPython's automagic
# setting and still works when the notebook is exported as a script.
# NOTE: not idempotent — every re-run of this cell climbs one more level, which
# changes what the relative "data/..." paths used elsewhere resolve to.
os.chdir("..")

In [None]:
def save_model(model_to_save=None, path="data/xgboost.model"):
    """Persist a fitted model to ``path``, replacing whatever is already there.

    Args:
        model_to_save: Object exposing ``save(path)``. When omitted, the
            notebook-level ``model`` variable is used, which keeps the original
            zero-argument call ``save_model()`` working.
        path: Destination of the saved model. Defaults to the location the
            notebook has always used.

    Raises:
        NameError: If ``model_to_save`` is omitted and no global ``model`` has
            been defined yet (i.e. the fitting cell was not run).
    """
    if model_to_save is None:
        # Fall back to the pipeline model fitted earlier in the notebook.
        model_to_save = model
    # Clear any previous save first so a re-run does not trip over stale output.
    if os.path.isdir(path):
        shutil.rmtree(path)
    elif os.path.exists(path):
        # A plain file left at the target would otherwise block the save.
        os.remove(path)
    model_to_save.save(path)

In [None]:
# Score the held-out split and expose the "probabilities" column under the
# additional name "rawPrediction".
prediction = (
    model.transform(testDF)
         .withColumn("rawPrediction", col("probabilities"))
)

In [None]:
# plot graph
# TODO: the evaluation/plotting snippet below is disabled. It used to be parked
# inside bare string literals, which are evaluated as expressions (the last one
# was echoed as this cell's output); it is kept here as real comments instead.
# NOTE(review): `result_df` is not defined in any visible cell — supply it
# before re-enabling the last two lines.
# area_under_PR, f1_score = evaluate_binary_classifier(prediction)
# pd_df = compute_precision_recall_graph(result_df, 20)
# pd_df.plot()

In [None]:
# Hyper-parameter grid: 4 tree depths (10, 20, 30, 40) x 6 learning rates
# evenly spaced over [0.2, 0.6]. The values are handed over as plain Python
# lists of int / float rather than numpy scalars.
paramGrid = (ParamGridBuilder()
             .addGrid(xgboost.max_depth, list(range(10, 50, 10)))
             .addGrid(xgboost.eta, np.linspace(0.2, 0.6, 6).tolist())
             .build())

# Candidates are ranked by area under the precision-recall curve, with the
# scores read from the "probabilities" column.
evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="probabilities",
                                          metricName="areaUnderPR")

# 3-fold cross-validation over the whole pipeline; the fold assignment is
# seeded so repeated runs compare the candidates on the same folds.
cv = (CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(3)
      .setSeed(0))

cvModel = cv.fit(trainDF)

# `cvModel.bestModel` is the fitted pipeline of the winning candidate. The
# previous code used Scala syntax here (`asInstanceOf[...]`, `stages(2)`), which
# fails at runtime in Python and indexed a stage this one-stage pipeline does
# not have. In Python `stages` is a list; the XGBoost model is its last entry.
bestModel = cvModel.bestModel.stages[-1]

# Show the parameters of the winning XGBoost model as the cell's output.
bestModel.extractParamMap()

In [None]:
# save_model()