# Machine Learning with Spark

In [1]:
from pydataset import data
import pyspark
import pyspark.ml
from pyspark.sql.functions import *

# Get the active SparkSession, or create one if none exists yet.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Load pydataset's 'tips' dataset into a Spark DataFrame.
df = spark.createDataFrame(data('tips'))

# Random split with weights 0.8 / 0.2 and a fixed seed. The weights are not exact
# proportions: the resulting row counts are only approximately 80% / 20%.
train, test = df.randomSplit([0.8, 0.2], seed=123)

In [2]:
# can we stratify using randomSplit?
df.randomSplit?

In [3]:
# Spark DataFrames have no `.shape` attribute, so Zach wrote this helper.
# Spark is lazy, so the row count is not known until an action such as count() runs.
def shape(df: pyspark.sql.DataFrame):
    """Return ``(number_of_rows, number_of_columns)`` for a DataFrame.

    The row count comes from ``df.count()`` and the column count from
    ``len(df.columns)``; the two are returned together as a tuple.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return n_rows, n_cols

In [4]:
shape(train)

(190, 7)

In [5]:
shape(test)

(54, 7)

## Regression

We'll first demonstrate a regression problem: predicting the tip amount.

In [6]:
train.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     12.69| 2.0|  Male|    No|Sat|Dinner|   2|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 5 rows



`pyspark.ml.feature.RFormula`

- `tip ~ total_bill`: predict tip based on total bill
- `tip ~ total_bill + size`: predict tip based on total bill and size
- `tip ~ .`: predict tip based on all the other features in the dataset

In [7]:
# NB: Spark's RFormula does the encoding for us.
# Fit the formula `tip ~ total_bill + size` on the training split. The fitted
# object is kept as `rf` and is reused below to transform both `train` and `test`.
rf = pyspark.ml.feature.RFormula(formula="tip ~ total_bill + size").fit(train)
rf

RFormula_fe5c35f030b3

In [8]:
rf.transform(train).show(5)

+----------+----+------+------+---+------+----+-----------+-----+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|
+----------+----+------+------+---+------+----+-----------+-----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| [8.77,2.0]|  2.0|
|     12.69| 2.0|  Male|    No|Sat|Dinner|   2|[12.69,2.0]|  2.0|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|[13.37,2.0]|  2.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|[14.78,2.0]| 3.23|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|[14.83,2.0]| 3.02|
+----------+----+------+------+---+------+----+-----------+-----+
only showing top 5 rows



In [9]:
# pyspark.ml.feature.RFormula?

The `features` and `label` columns have the names and shape that `pyspark.ml` expects by default.

In [10]:
train_input = rf.transform(train).select('features', 'label')
train_input.show(5)

+-----------+-----+
|   features|label|
+-----------+-----+
| [8.77,2.0]|  2.0|
|[12.69,2.0]|  2.0|
|[13.37,2.0]|  2.0|
|[14.78,2.0]| 3.23|
|[14.83,2.0]| 3.02|
+-----------+-----+
only showing top 5 rows



Create, fit, and use the model.

**Note**: unlike `sklearn`, each step produces a new object!

In [11]:
# ordinary least squares
lr = pyspark.ml.regression.LinearRegression()
lr

LinearRegression_5cfb9cbb813f

In [12]:
# hyperparameters
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxIter: max number of iterations (>= 0). (default: 100)
predictionCol: prediction column name. (default: prediction)
regParam: regularization parameter (>= 0). (default: 0.0)
solver: The solver algorithm for optimization. Supported options: auto, normal, l-bfgs. (default: auto)
standardization: whether to standardize the training features before fitting the model.

In [13]:
# fit() returns a separate fitted-model object, kept here as `lr_fit`.
lr_fit = lr.fit(train_input)
# transform() returns the input rows with a `prediction` column added.
lr_fit.transform(train_input).show(5)

+-----------+-----+------------------+
|   features|label|        prediction|
+-----------+-----+------------------+
| [8.77,2.0]|  2.0|1.9717337092055747|
|[12.69,2.0]|  2.0|  2.28547899247938|
|[13.37,2.0]|  2.0| 2.339904194679938|
|[14.78,2.0]| 3.23|2.4527564521840364|
|[14.83,2.0]| 3.02| 2.456758305287019|
+-----------+-----+------------------+
only showing top 5 rows



In [14]:
# might be tempting to do the following
## This Will Not Work!!!!
# lr = pyspark.ml.regression.LinearRegression()
# lr.fit(train_input)
# lr.transform(train_input)

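# In Spark, fit() returns a new fitted-model object and leaves `lr` itself unfitted,
# so predictions must come from the object fit() returned; in sklearn, fit() updates the estimator itself.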

Training results:

In [15]:
lr_fit.summary

<pyspark.ml.regression.LinearRegressionTrainingSummary at 0x7f94e6c68f50>

In [16]:
lr_fit.summary.r2, lr_fit.summary.rootMeanSquaredError

(0.44244602311951, 0.9975913515122248)

In [17]:
# reminder: explainedVariance == r_squared
[x for x in dir(lr_fit.summary) if not x.startswith('_')]

['coefficientStandardErrors',
 'degreesOfFreedom',
 'devianceResiduals',
 'explainedVariance',
 'featuresCol',
 'labelCol',
 'meanAbsoluteError',
 'meanSquaredError',
 'numInstances',
 'objectiveHistory',
 'pValues',
 'predictionCol',
 'predictions',
 'r2',
 'r2adj',
 'residuals',
 'rootMeanSquaredError',
 'tValues',
 'totalIterations']

How do we do on the test data?

In [18]:
# Transform the test split with the RFormula object that was fitted on train.
test_input = rf.transform(test)
# Predict on the test rows with the linear regression model that was fitted on train.
lr_fit.transform(test_input).show(4)

+----------+----+------+------+---+------+----+-----------+-----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|        prediction|
+----------+----+------+------+---+------+----+-----------+-----+------------------+
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|[10.27,2.0]| 1.71| 2.091789302295041|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|[10.33,3.0]| 1.67|2.3597556280076217|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|[10.34,3.0]| 1.66| 2.360555998628218|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|[15.04,2.0]| 1.96| 2.473566088319544|
+----------+----+------+------+---+------+----+-----------+-----+------------------+
only showing top 4 rows



In [19]:
# Create the evaluator. "rmse" is the evaluator's default metric; it is named
# explicitly here so the result matches the variable it is stored in.
evaluator = pyspark.ml.evaluation.RegressionEvaluator(metricName="rmse")

# Use the evaluator on the test-set predictions.
rmse = evaluator.evaluate(lr_fit.transform(test_input))
rmse

1.0581350816287596

## Classification

Predict time of day

In [20]:
train.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     12.69| 2.0|  Male|    No|Sat|Dinner|   2|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 5 rows



Preprocess the training data

In [21]:
# NOTE: `rf` and `train_input` are rebound here. From this cell on they hold the
# classification formula (`time ~ total_bill + size`) and its transformed training
# data, not the regression versions created earlier in the notebook.
# `time` is a string column; in the output it appears encoded as a numeric `label`.
rf = pyspark.ml.feature.RFormula(formula='time ~ total_bill + size').fit(train)
train_input = rf.transform(train)
train_input.show(50)

+----------+----+------+------+----+------+----+-----------+-----+
|total_bill| tip|   sex|smoker| day|  time|size|   features|label|
+----------+----+------+------+----+------+----+-----------+-----+
|      8.77| 2.0|  Male|    No| Sun|Dinner|   2| [8.77,2.0]|  0.0|
|     12.69| 2.0|  Male|    No| Sat|Dinner|   2|[12.69,2.0]|  0.0|
|     13.37| 2.0|  Male|    No| Sat|Dinner|   2|[13.37,2.0]|  0.0|
|     14.78|3.23|  Male|    No| Sun|Dinner|   2|[14.78,2.0]|  0.0|
|     14.83|3.02|Female|    No| Sun|Dinner|   2|[14.83,2.0]|  0.0|
|     15.42|1.57|  Male|    No| Sun|Dinner|   2|[15.42,2.0]|  0.0|
|     16.29|3.71|  Male|    No| Sun|Dinner|   3|[16.29,3.0]|  0.0|
|     16.97| 3.5|Female|    No| Sun|Dinner|   3|[16.97,3.0]|  0.0|
|     16.99|1.01|Female|    No| Sun|Dinner|   2|[16.99,2.0]|  0.0|
|     17.81|2.34|  Male|    No| Sat|Dinner|   4|[17.81,4.0]|  0.0|
|     17.92|4.08|  Male|    No| Sat|Dinner|   2|[17.92,2.0]|  0.0|
|     19.65| 3.0|Female|    No| Sat|Dinner|   2|[19.65,2.0]|  

Create and fit the model

In [22]:
lr = pyspark.ml.classification.LogisticRegression()

In [23]:
# print(lr.explainParams())

In [24]:
lr_fit = lr.fit(train_input)

Model Evaluation

In [25]:
[x for x in dir(lr_fit.summary) if not x.startswith('_')]

['accuracy',
 'areaUnderROC',
 'fMeasureByLabel',
 'fMeasureByThreshold',
 'falsePositiveRateByLabel',
 'featuresCol',
 'labelCol',
 'labels',
 'objectiveHistory',
 'pr',
 'precisionByLabel',
 'precisionByThreshold',
 'predictionCol',
 'predictions',
 'probabilityCol',
 'recallByLabel',
 'recallByThreshold',
 'roc',
 'totalIterations',
 'truePositiveRateByLabel',
 'weightedFMeasure',
 'weightedFalsePositiveRate',
 'weightedPrecision',
 'weightedRecall',
 'weightedTruePositiveRate']

Area under ROC curve

- Produce a curve where each point on the curve is the TP rate vs. FP rate; multiple points are found by adjusting the threshold of the model
- This works for models that predict a probability in addition to a binary class
- Number between 0 and 1, closer to 1 is better

In [26]:
# area under TPR (recall) vs FPR (FP / (FP + TN)) curve
# https://en.wikipedia.org/wiki/Receiver_operating_characteristic
lr_fit.summary.areaUnderROC

0.6526599326599328

In [27]:
# Area under the ROC curve on the test split. "areaUnderROC" is the evaluator's
# default metric; it is named explicitly so the cell states what it measures.
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator(metricName="areaUnderROC")
test_auc = evaluator.evaluate(lr_fit.transform(rf.transform(test)))
test_auc

0.5234521575984991

In [28]:
test_input = rf.transform(test)
test_input.show()

+----------+----+------+------+----+------+----+-----------+-----+
|total_bill| tip|   sex|smoker| day|  time|size|   features|label|
+----------+----+------+------+----+------+----+-----------+-----+
|     10.27|1.71|  Male|    No| Sun|Dinner|   2|[10.27,2.0]|  0.0|
|     10.33|1.67|Female|    No| Sun|Dinner|   3|[10.33,3.0]|  0.0|
|     10.34|1.66|  Male|    No| Sun|Dinner|   3|[10.34,3.0]|  0.0|
|     15.04|1.96|  Male|    No| Sun|Dinner|   2|[15.04,2.0]|  0.0|
|     15.77|2.23|Female|    No| Sat|Dinner|   2|[15.77,2.0]|  0.0|
|     18.43| 3.0|  Male|    No| Sun|Dinner|   4|[18.43,4.0]|  0.0|
|     23.68|3.31|  Male|    No| Sun|Dinner|   2|[23.68,2.0]|  0.0|
|     26.88|3.12|  Male|    No| Sun|Dinner|   4|[26.88,4.0]|  0.0|
|     39.42|7.58|  Male|    No| Sat|Dinner|   4|[39.42,4.0]|  0.0|
|     13.94|3.06|  Male|    No| Sun|Dinner|   2|[13.94,2.0]|  0.0|
|     16.04|2.24|  Male|    No| Sat|Dinner|   3|[16.04,3.0]|  0.0|
|     16.31| 2.0|  Male|    No| Sat|Dinner|   3|[16.31,3.0]|  

In [29]:
# Confusion matrix for the test data:
#   rows    -> predicted class (`prediction`)
#   columns -> actual class (`label`)
# A predicted/actual combination with no rows shows up as null rather than 0.
lr_fit.transform(test_input).select(
    ['time', 'total_bill', 'size', 'label', 'probability', 'prediction']
).groupBy('prediction').pivot('label').count().show()

+----------+---+----+
|prediction|0.0| 1.0|
+----------+---+----+
|       0.0| 40|  13|
|       1.0|  1|null|
+----------+---+----+



In [30]:
# Many other preprocessing steps
dir(pyspark.ml.feature)

['Binarizer',
 'BucketedRandomProjectionLSH',
 'BucketedRandomProjectionLSHModel',
 'Bucketizer',
 'ChiSqSelector',
 'ChiSqSelectorModel',
 'CountVectorizer',
 'CountVectorizerModel',
 'DCT',
 'DecisionTreeParams',
 'ElementwiseProduct',
 'FeatureHasher',
 'HasAggregationDepth',
 'HasCheckpointInterval',
 'HasCollectSubModels',
 'HasDistanceMeasure',
 'HasElasticNetParam',
 'HasFeaturesCol',
 'HasFitIntercept',
 'HasHandleInvalid',
 'HasInputCol',
 'HasInputCols',
 'HasLabelCol',
 'HasLoss',
 'HasMaxIter',
 'HasNumFeatures',
 'HasOutputCol',
 'HasOutputCols',
 'HasParallelism',
 'HasPredictionCol',
 'HasProbabilityCol',
 'HasRawPredictionCol',
 'HasRegParam',
 'HasSeed',
 'HasSolver',
 'HasStandardization',
 'HasStepSize',
 'HasThreshold',
 'HasThresholds',
 'HasTol',
 'HasVarianceCol',
 'HasWeightCol',
 'HashingTF',
 'IDF',
 'IDFModel',
 'Imputer',
 'ImputerModel',
 'IndexToString',
 'JavaEstimator',
 'JavaMLReadable',
 'JavaMLWritable',
 'JavaModel',
 'JavaParams',
 'JavaTransformer'

## Baseline

In [31]:
# Baseline value: the mean tip over the training rows.
# agg() gives a one-row DataFrame; .first() (or, equivalently, .head() or
# .collect()[0]) returns that Row, and [0] takes its single field.
avg_tip_amount = train.agg(mean("tip")).first()[0]

In [32]:
train.select("*")

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [33]:
# Re-fit the regression formula. `rf` was rebound to the classification formula
# (`time ~ total_bill + size`) in the Classification section, so it has to be
# rebuilt before working with `tip` as the label again.
rf = pyspark.ml.feature.RFormula(formula="tip ~ total_bill + size").fit(train)
rf.transform(train).show()

+----------+----+------+------+---+------+----+-----------+-----+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|
+----------+----+------+------+---+------+----+-----------+-----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| [8.77,2.0]|  2.0|
|     12.69| 2.0|  Male|    No|Sat|Dinner|   2|[12.69,2.0]|  2.0|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|[13.37,2.0]|  2.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|[14.78,2.0]| 3.23|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|[14.83,2.0]| 3.02|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|[15.42,2.0]| 1.57|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|[16.29,3.0]| 3.71|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|[16.97,3.0]|  3.5|
|     16.99|1.01|Female|    No|Sun|Dinner|   2|[16.99,2.0]| 1.01|
|     17.81|2.34|  Male|    No|Sat|Dinner|   4|[17.81,4.0]| 2.34|
|     17.92|4.08|  Male|    No|Sat|Dinner|   2|[17.92,2.0]| 4.08|
|     19.65| 3.0|Female|    No|Sat|Dinner|   2|[19.65,2.0]|  3.0|
|     19.8

`rawPrediction` is not the same thing as the binary class call: the `prediction` is derived using the `probability`.

For classification baseline, create a simple logistic regression model using the classes