# Machine Learning with Spark

In [1]:
from pydataset import data
import pyspark
import pyspark.ml
from pyspark.sql.functions import *

spark = pyspark.sql.SparkSession.builder.getOrCreate()

df = spark.createDataFrame(data('tips'))

train, test = df.randomSplit([0.8, 0.2], seed=123)

In [2]:
def shape(df: pyspark.sql.DataFrame):
    return df.count(), len(df.columns)

In [3]:
shape(train)

(190, 7)

In [4]:
shape(test)

(54, 7)

## Regression

We'll first demonstrate a regression problem: predicting the tip amount.

In [5]:
train.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     12.69| 2.0|  Male|    No|Sat|Dinner|   2|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 5 rows



`pyspark.ml.feature.RFormula`

- `tip ~ total_bill`: predict tip based on total bill
- `tip ~ total_bill + size`: predict tip based on total bill and size
- `tip ~ .`: predict tip based on all the other features in the dataset

`features` and `labels` columns are the shape/name required for `pyspark.ml`

In [10]:
# To find a reference for more details on rformulas
# pyspark.ml.feature.RFormula?

In [11]:
# nb: spark's rformula does encoding
rf = pyspark.ml.feature.RFormula(formula="tip ~ total_bill + size").fit(train)
rf

rf.transform(train).show(5)

+----------+----+------+------+---+------+----+-----------+-----+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|
+----------+----+------+------+---+------+----+-----------+-----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| [8.77,2.0]|  2.0|
|     12.69| 2.0|  Male|    No|Sat|Dinner|   2|[12.69,2.0]|  2.0|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|[13.37,2.0]|  2.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|[14.78,2.0]| 3.23|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|[14.83,2.0]| 3.02|
+----------+----+------+------+---+------+----+-----------+-----+
only showing top 5 rows



In [12]:
train_input = rf.transform(train).select('features', 'label')
train_input.show(5)

+-----------+-----+
|   features|label|
+-----------+-----+
| [8.77,2.0]|  2.0|
|[12.69,2.0]|  2.0|
|[13.37,2.0]|  2.0|
|[14.78,2.0]| 3.23|
|[14.83,2.0]| 3.02|
+-----------+-----+
only showing top 5 rows



Create, fit, and use the model.

**Note**: unlike `sklearn`, each step produces a new object!

In [13]:
lr = pyspark.ml.regression.LinearRegression()
lr

LinearRegression_a967cbedd673

In [15]:
# print(lr.explainParams())

In [17]:
## This Will Not Work!!!!
lr = pyspark.ml.regression.LinearRegression()
lr.fit(train_input)
lr.transform(train_input)

AttributeError: 'LinearRegression' object has no attribute 'transform'

In [16]:
lr_fit = lr.fit(train_input)
lr_fit.transform(train_input).show(5)

+-----------+-----+------------------+
|   features|label|        prediction|
+-----------+-----+------------------+
| [8.77,2.0]|  2.0|1.9717337092055747|
|[12.69,2.0]|  2.0|  2.28547899247938|
|[13.37,2.0]|  2.0| 2.339904194679938|
|[14.78,2.0]| 3.23|2.4527564521840364|
|[14.83,2.0]| 3.02| 2.456758305287019|
+-----------+-----+------------------+
only showing top 5 rows



Training results:

In [18]:
lr_fit.summary

<pyspark.ml.regression.LinearRegressionTrainingSummary at 0x11d920c18>

In [19]:
lr_fit.summary.r2, lr_fit.summary.rootMeanSquaredError

(0.44244602311951, 0.9975913515122248)

In [20]:
[x for x in dir(lr_fit.summary) if not x.startswith('_')]

['coefficientStandardErrors',
 'degreesOfFreedom',
 'devianceResiduals',
 'explainedVariance',
 'featuresCol',
 'labelCol',
 'meanAbsoluteError',
 'meanSquaredError',
 'numInstances',
 'objectiveHistory',
 'pValues',
 'predictionCol',
 'predictions',
 'r2',
 'r2adj',
 'residuals',
 'rootMeanSquaredError',
 'tValues',
 'totalIterations']

How do we do on the test data?

In [21]:
test_input = rf.transform(test)
lr_fit.transform(test_input).show(4)

+----------+----+------+------+---+------+----+-----------+-----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|        prediction|
+----------+----+------+------+---+------+----+-----------+-----+------------------+
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|[10.27,2.0]| 1.71| 2.091789302295041|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|[10.33,3.0]| 1.67|2.3597556280076217|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|[10.34,3.0]| 1.66| 2.360555998628218|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|[15.04,2.0]| 1.96| 2.473566088319544|
+----------+----+------+------+---+------+----+-----------+-----+------------------+
only showing top 4 rows



In [22]:
evaluator = pyspark.ml.evaluation.RegressionEvaluator()
rmse = evaluator.evaluate(lr_fit.transform(test_input))
rmse

1.0581350816287596

## Classification

Predict time of day

In [23]:
train.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     12.69| 2.0|  Male|    No|Sat|Dinner|   2|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 5 rows



Preprocess the training data

In [24]:
rf = pyspark.ml.feature.RFormula(formula='time ~ total_bill + size').fit(train)
train_input = rf.transform(train)
train_input.show(50)

+----------+----+------+------+----+------+----+-----------+-----+
|total_bill| tip|   sex|smoker| day|  time|size|   features|label|
+----------+----+------+------+----+------+----+-----------+-----+
|      8.77| 2.0|  Male|    No| Sun|Dinner|   2| [8.77,2.0]|  0.0|
|     12.69| 2.0|  Male|    No| Sat|Dinner|   2|[12.69,2.0]|  0.0|
|     13.37| 2.0|  Male|    No| Sat|Dinner|   2|[13.37,2.0]|  0.0|
|     14.78|3.23|  Male|    No| Sun|Dinner|   2|[14.78,2.0]|  0.0|
|     14.83|3.02|Female|    No| Sun|Dinner|   2|[14.83,2.0]|  0.0|
|     15.42|1.57|  Male|    No| Sun|Dinner|   2|[15.42,2.0]|  0.0|
|     16.29|3.71|  Male|    No| Sun|Dinner|   3|[16.29,3.0]|  0.0|
|     16.97| 3.5|Female|    No| Sun|Dinner|   3|[16.97,3.0]|  0.0|
|     16.99|1.01|Female|    No| Sun|Dinner|   2|[16.99,2.0]|  0.0|
|     17.81|2.34|  Male|    No| Sat|Dinner|   4|[17.81,4.0]|  0.0|
|     17.92|4.08|  Male|    No| Sat|Dinner|   2|[17.92,2.0]|  0.0|
|     19.65| 3.0|Female|    No| Sat|Dinner|   2|[19.65,2.0]|  

Create and fit the model

In [25]:
lr = pyspark.ml.classification.LogisticRegression()

In [27]:
# print(lr.explainParams())

In [28]:
lr_fit = lr.fit(train_input)

Model Evaluation

In [29]:
[x for x in dir(lr_fit.summary) if not x.startswith('_')]

['accuracy',
 'areaUnderROC',
 'fMeasureByLabel',
 'fMeasureByThreshold',
 'falsePositiveRateByLabel',
 'featuresCol',
 'labelCol',
 'labels',
 'objectiveHistory',
 'pr',
 'precisionByLabel',
 'precisionByThreshold',
 'predictionCol',
 'predictions',
 'probabilityCol',
 'recallByLabel',
 'recallByThreshold',
 'roc',
 'totalIterations',
 'truePositiveRateByLabel',
 'weightedFMeasure',
 'weightedFalsePositiveRate',
 'weightedPrecision',
 'weightedRecall',
 'weightedTruePositiveRate']

Area Under ROC Curve

- Produce a curve where each point on the curve is the TPR vs FPR; multiple points are found by adjusting the threshold for the model [animation](https://stats-demos.zach.wiki/static/roc-auc.mp4)
- This works for models that predict a probability in addition to a yes/no
- Number between 0 and 1, closer to 1 is better

In [30]:
# area under TPR (recall) vs FPR (FP / (FP + TN)) curve
# https://en.wikipedia.org/wiki/Receiver_operating_characteristic
lr_fit.summary.areaUnderROC

0.6526599326599327

In [31]:
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator()
test_auc = evaluator.evaluate(lr_fit.transform(rf.transform(test)))
test_auc

0.5234521575984991

In [34]:
test_input = rf.transform(test)
test_input.show()

+----------+----+------+------+----+------+----+-----------+-----+
|total_bill| tip|   sex|smoker| day|  time|size|   features|label|
+----------+----+------+------+----+------+----+-----------+-----+
|     10.27|1.71|  Male|    No| Sun|Dinner|   2|[10.27,2.0]|  0.0|
|     10.33|1.67|Female|    No| Sun|Dinner|   3|[10.33,3.0]|  0.0|
|     10.34|1.66|  Male|    No| Sun|Dinner|   3|[10.34,3.0]|  0.0|
|     15.04|1.96|  Male|    No| Sun|Dinner|   2|[15.04,2.0]|  0.0|
|     15.77|2.23|Female|    No| Sat|Dinner|   2|[15.77,2.0]|  0.0|
|     18.43| 3.0|  Male|    No| Sun|Dinner|   4|[18.43,4.0]|  0.0|
|     23.68|3.31|  Male|    No| Sun|Dinner|   2|[23.68,2.0]|  0.0|
|     26.88|3.12|  Male|    No| Sun|Dinner|   4|[26.88,4.0]|  0.0|
|     39.42|7.58|  Male|    No| Sat|Dinner|   4|[39.42,4.0]|  0.0|
|     13.94|3.06|  Male|    No| Sun|Dinner|   2|[13.94,2.0]|  0.0|
|     16.04|2.24|  Male|    No| Sat|Dinner|   3|[16.04,3.0]|  0.0|
|     16.31| 2.0|  Male|    No| Sat|Dinner|   3|[16.31,3.0]|  

In [37]:
# confusion matrix for the test data
(lr_fit.transform(test_input)
 .select('time', 'total_bill', 'size', 'label', 'probability', 'prediction')
 .groupby('prediction') # predicted == rows
 .pivot('label') # actual values are columns
 .count()
 .show()
)

+----------+---+----+
|prediction|0.0| 1.0|
+----------+---+----+
|       0.0| 40|  13|
|       1.0|  1|null|
+----------+---+----+



In [39]:
# Many other preprocessing steps
# dir(pyspark.ml.feature)

## How do we create a baseline?

For our regression model: baseline prediction is the average tip amount

In [48]:
avg_tip_amount = train.agg(mean('tip')).head()[0]

In [54]:
(train.selectExpr('*', '{} as prediction'.format(avg_tip_amount))
 .show(5)
)

+----------+----+------+------+---+------+----+-----------------+
|total_bill| tip|   sex|smoker|day|  time|size|       prediction|
+----------+----+------+------+---+------+----+-----------------+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|3.005105263157895|
|     12.69| 2.0|  Male|    No|Sat|Dinner|   2|3.005105263157895|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|3.005105263157895|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|3.005105263157895|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|3.005105263157895|
+----------+----+------+------+---+------+----+-----------------+
only showing top 5 rows



In [58]:
rf = pyspark.ml.feature.RFormula(formula="tip ~ total_bill + size").fit(train)

baseline_eval_df = (
    rf.transform(train)
    .selectExpr('label', '{} as prediction'.format(avg_tip_amount))
    .select('label', col('prediction').cast('float'))
)

evaluator = pyspark.ml.evaluation.RegressionEvaluator()
baseline_rmse = evaluator.evaluate(baseline_eval_df)
baseline_rmse

1.33600848542597

For our classification model:

In [59]:
rf = pyspark.ml.feature.RFormula(formula='time ~ total_bill + size').fit(train)
train_input = rf.transform(train)

In [61]:
train_input.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|  135|
|  1.0|   55|
+-----+-----+



In [76]:
lr_fit.transform(train_input).select('prediction', 'rawPrediction', 'probability').show(truncate=False)

+----------+------------------------------------------+----------------------------------------+
|prediction|rawPrediction                             |probability                             |
+----------+------------------------------------------+----------------------------------------+
|0.0       |[0.24391504551760637,-0.24391504551760637]|[0.5606782239589314,0.43932177604106865]|
|0.0       |[0.5224097706862575,-0.5224097706862575]  |[0.6277110784259653,0.37228892157403476]|
|0.0       |[0.5707200801542888,-0.5707200801542888]  |[0.6389293132787042,0.3610706867212957] |
|0.0       |[0.6708929277277066,-0.6708929277277066]  |[0.66170307181667,0.33829692818333]     |
|0.0       |[0.6744451563650621,-0.6744451563650621]  |[0.6624977883898066,0.33750221161019345]|
|0.0       |[0.716361454285854,-0.716361454285854]    |[0.6718052818590612,0.3281947181409388] |
|0.0       |[0.6684740631485411,-0.6684740631485411]  |[0.6611613922536789,0.33883860774632113]|
|0.0       |[0.716784372616572

In [70]:
baseline_eval_df =  (
    train_input.selectExpr('label', '0 as rawPrediction')
)

evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator(metricName='accuracy')
baseline_auc = evaluator.evaluate(baseline_eval_df)
baseline_auc

IllegalArgumentException: 'BinaryClassificationEvaluator_b62d26410254 parameter metricName given invalid value accuracy.'