In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sc=SparkContext()
spark = SparkSession(sparkContext=sc)

# import data

In [2]:
cuse = spark.read.csv("../../data/cuse_binary.csv",header=True,inferSchema=True)
cuse.show(5)

+---+---------+---------+---+
|age|education|wantsMore|  y|
+---+---------+---------+---+
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
|<25|      low|      yes|  0|
+---+---------+---------+---+
only showing top 5 rows



# Processing categorical columns
* StringIndexer all categorical columns
* OneHotEncoder: all categorical index columns
* VectorAssembler: all features columns into one vector column

## Categorical columns

In [3]:
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml import Pipeline

categorical_columns = cuse.columns[0:3]

## Build StringIndexer Stages

In [5]:
stringindexer_stages = [StringIndexer(inputCol=c,outputCol="strindexed_"+c)for c in categorical_columns]

stringindexer_stages += [StringIndexer(inputCol="y",outputCol="label")]

## Build OneHotEncoder stages

In [6]:
onehotencoder_stages = [OneHotEncoder(inputCol="strindexed_"+c,outputCol="onehot_"+c)for c in categorical_columns]


## Build VectorAssembler stage

In [7]:
feature_columns = ['onehot_'+c for c in categorical_columns]
vectorassembler_stage = VectorAssembler(inputCols=feature_columns,outputCol="features")

## Build pipeline model

In [8]:
all_stages = stringindexer_stages+onehotencoder_stages+[vectorassembler_stage]

pipeline = Pipeline(stages=all_stages)

## Fit pipeline model

In [10]:
pipeline_model = pipeline.fit(cuse)

## Transform data

In [12]:
final_columns = feature_columns + ['features',"label"]

cuse_df = pipeline_model.transform(cuse).\
          select(final_columns)

cuse_df.show(5)

+-------------+----------------+----------------+-------------------+-----+
|   onehot_age|onehot_education|onehot_wantsMore|           features|label|
+-------------+----------------+----------------+-------------------+-----+
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
|(3,[2],[1.0])|       (1,[],[])|   (1,[0],[1.0])|(5,[2,4],[1.0,1.0])|  0.0|
+-------------+----------------+----------------+-------------------+-----+
only showing top 5 rows



## Split data into training and test dataset

In [14]:
training, test = cuse_df.randomSplit([0.8,0.2],seed=1234)

# Build cross-validation model
## Estimatior

In [16]:
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier

In [20]:
dt = DecisionTreeClassifier(featuresCol="features",labelCol='label')

## Parameter Grid

In [21]:
from pyspark.ml.tuning import ParamGridBuilder
param_grid = ParamGridBuilder().\
             addGrid(dt.maxDepth,[2,3,4,5]).\
             build()

## Evaluator

In [22]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [23]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',metricName="areaUnderROC")

## Build cross-validation model

In [25]:
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=dt,estimatorParamMaps=param_grid,evaluator=evaluator,numFolds=4)

## Fit cross-validation mode

In [26]:
cv_model = cv.fit(cuse_df)

## Prediction

In [27]:
show_columns = ['features', 'label', 'prediction', 'rawPrediction', 'probability']

## Prediction on training data

In [28]:
pred_training_cv = cv_model.transform(training)
pred_training_cv.select(show_columns).show(5,truncate=False)

+---------+-----+----------+-------------+----------------------------------------+
|features |label|prediction|rawPrediction|probability                             |
+---------+-----+----------+-------------+----------------------------------------+
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
+---------+-----+----------+-------------+----------------------------------------+
only showing top 5 rows



## Prediction on test data

In [29]:
pred_test_cv = cv_model.transform(test)
pred_test_cv.select(show_columns).show(5,truncate=False)

+---------+-----+----------+-------------+----------------------------------------+
|features |label|prediction|rawPrediction|probability                             |
+---------+-----+----------+-------------+----------------------------------------+
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
|(5,[],[])|0.0  |1.0       |[203.0,237.0]|[0.46136363636363636,0.5386363636363637]|
+---------+-----+----------+-------------+----------------------------------------+
only showing top 5 rows



## Confusion matrix
Pyspark doesn’t have a function to calculate the confusion matrix automatically, but we can still easily get a confusion matrix with a combination use of several methods from the RDD class.

In [30]:
label_and_pred = cv_model.transform(cuse_df).select("label","prediction")
label_and_pred.rdd.zipWithIndex().countByKey()

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 897,
             Row(label=0.0, prediction=1.0): 203,
             Row(label=1.0, prediction=0.0): 270,
             Row(label=1.0, prediction=1.0): 237})

## Parameters from the best model

In [31]:
print("The best maxDepth is:",cv_model.bestModel._java_obj.getMaxDepth())

The best maxDepth is: 3
