In [1]:
from pyspark.sql import SparkSession
# set conf and create training data frame
spark = SparkSession.builder.master('local[*]').appName('MLlib_tutorial').getOrCreate()

training_df = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "b spark who", 1.0),
    (5, "g d a y", 0.0),
    (6, "spark fly", 1.0),
    (7, "was mapreduce", 0.0),
    (8, "e spark program", 1.0),
    (9, "a e c l", 0.0),
    (10, "spark compile", 1.0),
    (11, "hadoop software", 0.0),
], ["id", "text", "label"])

In [2]:
# Create transformations
from pyspark.ml.feature import HashingTF, Tokenizer
# Create Tokenizer, Hashing TF instances. These instances are transformers
transformer_tokenizer = Tokenizer(inputCol="text", outputCol="words")
transformer_hashingTF = HashingTF(inputCol=transformer_tokenizer.getOutputCol(), outputCol="features")

In [3]:
# Create estimator, lr
from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression instance. This instance is an estimator
estimator_lr = LogisticRegression(maxIter=10, regParam=0.01)

# Print out the parameters, documentation, and any default values
print("LogisticRegression parameters:\n{}\n".format(estimator_lr.explainParams()))

LogisticRegression parameters:
aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bou

In [4]:
# Create a pipeline
from pyspark.ml import Pipeline

# Configure an ML pipeline, which consists of three stages:
#   transformer_tokenizer -> transformer_hashingTF, estimator_lr
pipeline = Pipeline(stages=[
    transformer_tokenizer, 
    transformer_hashingTF,
    estimator_lr
])

In [5]:
# Create a training model!
# Specify parameters using Python dictionary as a paramMap
paramMap = {estimator_lr.probabilityCol: "myProbability"}

# Fit the pipeline to training documents, name of probability column is specified as "myProbability"
model = pipeline.fit(training_df, paramMap)

In [6]:
# Test
test_df = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop"),
    (8, "spakry spyks"),
    (9, "sparky"),
    (10, "kpark")
], ["id", "text"])

prediction_df = model.transform(test_df)
prediction_df.show()

+---+------------------+--------------------+--------------------+--------------------+--------------------+----------+
| id|              text|               words|            features|       rawPrediction|       myProbability|prediction|
+---+------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  4|       spark i j k|    [spark, i, j, k]|(262144,[20197,24...|[-1.8577528108141...|[0.13496519522401...|       1.0|
|  5|             l m n|           [l, m, n]|(262144,[18910,10...|[5.50096856013554...|[0.99593378652678...|       0.0|
|  6|spark hadoop spark|[spark, hadoop, s...|(262144,[155117,2...|[-4.7770067510016...|[0.00835084407237...|       1.0|
|  7|     apache hadoop|    [apache, hadoop]|(262144,[66695,15...|[3.31647681498440...|[0.96498975713601...|       0.0|
|  8|      spakry spyks|     [spakry, spyks]|(262144,[37816,16...|[2.18898897217885...|[0.89925634997582...|       0.0|
|  9|            sparky|            [spa

In [8]:
# Parameter Tuning
from pyspark.ml.tuning import ParamGridBuilder

paramGrid = ParamGridBuilder() \
    .addGrid(transformer_hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(estimator_lr.regParam, [0.1, 0.01]) \
    .build()

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

evaluator = BinaryClassificationEvaluator()
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)

In [9]:
# Run cross-validation, and choose the best set of parameters
cvModel = crossval.fit(training_df)

prediction_df = cvModel.transform(test_df)
selected = prediction_df.select("id", "text", "probability", "prediction")
selected.show()

+---+------------------+--------------------+----------+
| id|              text|         probability|prediction|
+---+------------------+--------------------+----------+
|  4|       spark i j k|[0.25806842225846...|       1.0|
|  5|             l m n|[0.91855974126539...|       0.0|
|  6|spark hadoop spark|[0.08685724267919...|       1.0|
|  7|     apache hadoop|[0.67660828566522...|       0.0|
|  8|      spakry spyks|[0.73556216947131...|       0.0|
|  9|            sparky|[0.82945689611722...|       0.0|
| 10|             kpark|[0.73556216947131...|       0.0|
+---+------------------+--------------------+----------+



In [10]:
bestPipeline = cvModel.bestModel
bestLRModel = bestPipeline.stages[2]
bestParams = bestLRModel.extractParamMap()

for param in bestParams:
    print("{} = {}".format(param, bestParams[param]))

LogisticRegression_4f0aa273be598d872e05__aggregationDepth = 2
LogisticRegression_4f0aa273be598d872e05__elasticNetParam = 0.0
LogisticRegression_4f0aa273be598d872e05__family = auto
LogisticRegression_4f0aa273be598d872e05__featuresCol = features
LogisticRegression_4f0aa273be598d872e05__fitIntercept = True
LogisticRegression_4f0aa273be598d872e05__labelCol = label
LogisticRegression_4f0aa273be598d872e05__maxIter = 10
LogisticRegression_4f0aa273be598d872e05__predictionCol = prediction
LogisticRegression_4f0aa273be598d872e05__probabilityCol = probability
LogisticRegression_4f0aa273be598d872e05__rawPredictionCol = rawPrediction
LogisticRegression_4f0aa273be598d872e05__regParam = 0.1
LogisticRegression_4f0aa273be598d872e05__standardization = True
LogisticRegression_4f0aa273be598d872e05__threshold = 0.5
LogisticRegression_4f0aa273be598d872e05__tol = 1e-06
