In [1]:
# credits: https://towardsdatascience.com/pyspark-and-xgboost-integration-tested-on-the-kaggle-titanic-dataset-4e75a568bdb

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.types import *
from pyspark.sql.functions import col
import numpy as np
from pyspark.ml.feature import StringIndexer, VectorAssembler
import os
from utils import init_spark
from preprocess import get_positive_samples, \
                       get_negative_samples, \
                       get_dataset_df
# Hand both XGBoost4J jars to spark-submit via the environment. This is set
# before init_spark() is called in the next cell.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--jars',
    ','.join(['data/xgboost4j-spark-0.72.jar', 'data/xgboost4j-0.72.jar']),
    'pyspark-shell',
])

In [3]:
# Create the Spark session through the project helper.
spark = init_spark()
# Register the zipped Python wrapper with the Spark context. This must happen
# before the import below — presumably it is what makes `sparkxgb` importable.
spark.sparkContext.addPyFile("data/sparkxgb.zip")
from sparkxgb import XGBoostEstimator

In [4]:
# Disabled: the snippet below would turn Spark logging off. It is kept as a
# bare string literal rather than as comments, so the cell evaluates to that
# string and echoes it as its output instead of running it.
#logger = spark._jvm.org.apache.log4j.Logger
"""sc = spark.sparkContext
sc.setLogLevel("OFF")"""

'sc = spark.sparkContext\nsc.setLogLevel("OFF")'

In [35]:
# load dataset
# Down-sample the negative and positive sets to roughly 1% each and replace
# missing values with 0. The sampling seed is fixed (same value as the
# train/test split) so that re-running the notebook over the same input does
# not silently train and evaluate on a different subset.
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 0
neg_samples = (
    get_negative_samples(spark)
    .sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
    .na.fill(0)
)
pos_samples = (
    get_positive_samples(spark)
    .sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED)
    .na.fill(0)
)


In [36]:
# Build the modelling DataFrame from the two sample sets (presumably merged and
# labelled by get_dataset_df — confirm in preprocess), then zero-fill any
# remaining nulls.
combined_df = get_dataset_df(spark, pos_samples, neg_samples)
df = combined_df.na.fill(0)

# Peek at one assembled feature vector as a sanity check.
df.select("features").first()

Row(features=DenseVector([8.0, 0.0, 65.1359, 1.0, 3.5555, 81.1241, 11.5592, 11.9497, 37.537, 101.0173, 0.0, 0.0, 15.1874, -0.866, 0.5, -0.0506, -0.9987, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]))

In [41]:

# List the column names of the assembled dataset.
df.columns

['sample_id', 'street_id', 'date', 'hour', 'features', 'label']

In [12]:
# Disabled (kept as a bare string literal, so nothing here runs): would collect
# every column of df except "label" and "date" into feature_list.
"""feature_list = [e for e in list(df.columns) if e != "label" and e!="date"]
df.columns
feature_list"""

Row(features=DenseVector([2.0, 0.0, 46.4936, 1.0, 13.6173, 86.6271, 5.9996, 10.2858, 26.8006, 101.5087, nan, nan, 12.2525, -0.0, -1.0, -0.9541, -0.2994, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))

In [8]:
# create transformers if needed

In [9]:
# Create one vector column to feed XGBoost.
# Disabled (kept as a bare string literal, so nothing here runs): would
# assemble the columns named in feature_list into a single "features" column.
# NOTE(review): df already exposes a "features" column, so this step looks
# obsolete — confirm before deleting.
"""vectorAssembler = VectorAssembler()\
  .setInputCols(feature_list)\
  .setOutputCol("features")"""

In [37]:
# Create the XGBoost estimator, wiring it to the dataset's feature/label
# columns and naming the column that will receive its predictions.
xgb_column_params = {
    "featuresCol": "features",
    "labelCol": "label",
    "predictionCol": "prediction",
}
xgboost = XGBoostEstimator(**xgb_column_params)

In [38]:
# Build the ML pipeline; the XGBoost estimator is its only stage.
pipeline = Pipeline(stages=[xgboost])

In [39]:
# Randomly split the rows 70/30 into train and test sets; the seed is fixed so
# the split can be repeated.
split_weights = [0.7, 0.3]
trainDF, testDF = df.randomSplit(split_weights, seed=0)

In [None]:
cd ..

In [40]:
# Fit the pipeline on the training split.
model = pipeline.fit(trainDF)

# Persist the fitted pipeline under the working directory. Going through
# write().overwrite() replaces a model left behind by a previous run; a plain
# save() raises when the target path already exists, which would break a
# second Restart & Run All.
workdir = "./"
model.write().overwrite().save(workdir + 'data/xgboost.model')

In [45]:
# evaluate
# Score the held-out split with the fitted pipeline. `col` is already imported
# in the notebook's import cell and is not used here, so it is not re-imported.
result_df = model.transform(testDF)

In [46]:
# Preview the scored rows, including the model's probabilities and prediction
# columns.
result_df.show()

+------------+-------------+----------+----+--------------------+-----+--------------------+----------+
|   sample_id|    street_id|      date|hour|            features|label|       probabilities|prediction|
+------------+-------------+----------+----+--------------------+-----+--------------------+----------+
| 17179869260|1228360647074|2012-07-18|  14|[14.0,1.0,14.2569...|  1.0|[0.72101521492004...|       0.0|
| 42949673263|1391569404034|2015-02-27|   5|[5.0,0.0,77.82135...|  1.0|[0.73417460918426...|       0.0|
| 42949673432|1683627180381|2015-07-10|  16|[16.0,1.0,66.3288...|  1.0|[0.73479843139648...|       0.0|
| 51539607942|1700807049520|2013-05-10|  23|[23.0,0.0,413.056...|  1.0|[0.72878766059875...|       0.0|
| 60129542539| 687194767514|2015-01-08|   8|[8.0,0.0,308.9331...|  1.0|[0.71544170379638...|       0.0|
|206158430499| 214748364884|2013-10-15|   7|[7.0,0.0,9.669973...|  1.0|[0.69635999202728...|       0.0|
|214748365287|1374389534816|2016-03-28|  21|[21.0,1.0,55.4941...

In [47]:
from evaluate import evaluate_binary_classifier

# Evaluate model
# Expose the model's "probabilities" vector under the name "rawPrediction"
# before handing the frame to the project's evaluation helper (presumably the
# column name that helper expects — confirm in evaluate).
probabilities_col = col("probabilities")
result_df = result_df.withColumn("rawPrediction", probabilities_col)

metrics = evaluate_binary_classifier(result_df)
area_under_PR, f1_score = metrics


Area Under PR = 0.08375637638647783
F1 score = 0.9558377880077036
