In [1]:
# Create the Spark session.
from pyspark.sql import SparkSession

# Build the session exactly once, with every option set up front.
# getOrCreate() hands back an already-running session when one exists, so a bare
# getOrCreate() issued before this chain would start the underlying SparkContext
# with defaults, and master / appName / spark.ui.port (fixed once the context
# has started) would then be silently ignored.
spark = (
    SparkSession.builder
    .master("local")
    .appName("RDD")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

# Handle to the underlying SparkContext (entry point for the RDD API).
sc = spark.sparkContext

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Labeled training documents as (text, label) pairs.
# In this toy set, label 1.0 is given to exactly the texts containing "spark".
labeled_texts = [
    ("a b c d e spark", 1.0),
    ("b d", 0.0),
    ("spark f g h", 1.0),
    ("hadoop mapreduce", 0.0),
    ("b spark who", 1.0),
    ("g d a y", 0.0),
    ("spark fly", 1.0),
    ("was mapreduce", 0.0),
    ("e spark program", 1.0),
    ("a e c l", 0.0),
    ("spark compile", 1.0),
    ("hadoop software", 0.0),
]

# The id column is simply the row position, counted from 0.
training = spark.createDataFrame(
    [(doc_id, text, label) for doc_id, (text, label) in enumerate(labeled_texts)],
    schema=["id", "text", "label"],
)


In [3]:
# Print the training set as a text table (defaults spelled out: up to 20 rows, long cells truncated).
training.show(n=20, truncate=True)

+---+----------------+-----+
| id|            text|label|
+---+----------------+-----+
|  0| a b c d e spark|  1.0|
|  1|             b d|  0.0|
|  2|     spark f g h|  1.0|
|  3|hadoop mapreduce|  0.0|
|  4|     b spark who|  1.0|
|  5|         g d a y|  0.0|
|  6|       spark fly|  1.0|
|  7|   was mapreduce|  0.0|
|  8| e spark program|  1.0|
|  9|         a e c l|  0.0|
| 10|   spark compile|  1.0|
| 11| hadoop software|  0.0|
+---+----------------+-----+



In [4]:

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

# Fixed seed for the random assignment of rows to folds, so that the selected
# model (and every downstream prediction) is the same after Restart & Run All.
CV_SEED = 42

crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=2,  # use 3+ folds in practice
    seed=CV_SEED,
)





In [5]:
# Inspect the tokenizer output: first 10 rows, each with the added "words" column.
(
    tokenizer
    .transform(dataset=training)
    .head(n=10)
)

[Row(id=0, text='a b c d e spark', label=1.0, words=['a', 'b', 'c', 'd', 'e', 'spark']),
 Row(id=1, text='b d', label=0.0, words=['b', 'd']),
 Row(id=2, text='spark f g h', label=1.0, words=['spark', 'f', 'g', 'h']),
 Row(id=3, text='hadoop mapreduce', label=0.0, words=['hadoop', 'mapreduce']),
 Row(id=4, text='b spark who', label=1.0, words=['b', 'spark', 'who']),
 Row(id=5, text='g d a y', label=0.0, words=['g', 'd', 'a', 'y']),
 Row(id=6, text='spark fly', label=1.0, words=['spark', 'fly']),
 Row(id=7, text='was mapreduce', label=0.0, words=['was', 'mapreduce']),
 Row(id=8, text='e spark program', label=1.0, words=['e', 'spark', 'program']),
 Row(id=9, text='a e c l', label=0.0, words=['a', 'e', 'c', 'l'])]

In [6]:
# Inspect the hashed term-frequency vector of the first training document.
# The `tokenizer` and `hashingTF` stages built together with the pipeline are
# reused here; re-instantiating them would only shadow the objects that are
# actually part of `pipeline`.
# HashingTF.transform expects a DataFrame that already has the "words" column,
# so the text is run through the tokenizer first (not the Tokenizer object itself).
hashingTF.transform(tokenizer.transform(training)).head().features

In [7]:
# Feature index that the term "a" is hashed to by this HashingTF instance.
hashingTF.indexOf(term="a")

107107

In [8]:
# Fit the cross-validator on the training set; it evaluates every entry of the
# parameter grid and keeps the best-scoring configuration.
cvModel = crossval.fit(dataset=training)

In [9]:
# explainParams() returns a single newline-separated string. Print it so each
# parameter appears on its own line instead of one long repr with literal "\n".
print(cvModel.explainParams())

"estimator: estimator to be cross-validated (current: Pipeline_cf2e9e9ba919)\nestimatorParamMaps: estimator param maps (current: [{Param(parent='HashingTF_912b6e61bf61', name='numFeatures', doc='Number of features. Should be greater than 0.'): 10, Param(parent='LogisticRegression_d1750c91bac2', name='regParam', doc='regularization parameter (>= 0).'): 0.1}, {Param(parent='HashingTF_912b6e61bf61', name='numFeatures', doc='Number of features. Should be greater than 0.'): 10, Param(parent='LogisticRegression_d1750c91bac2', name='regParam', doc='regularization parameter (>= 0).'): 0.01}, {Param(parent='HashingTF_912b6e61bf61', name='numFeatures', doc='Number of features. Should be greater than 0.'): 100, Param(parent='LogisticRegression_d1750c91bac2', name='regParam', doc='regularization parameter (>= 0).'): 0.1}, {Param(parent='HashingTF_912b6e61bf61', name='numFeatures', doc='Number of features. Should be greater than 0.'): 100, Param(parent='LogisticRegression_d1750c91bac2', name='regPa

In [10]:
# Keep a reference to the best model chosen by cross-validation and display it
# as the cell's output.
bestModel = cvModel.bestModel
bestModel


PipelineModel_85e57532ddeb

In [11]:
# Unlabeled documents to score with the tuned model.
test = spark.createDataFrame(
    data=[
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "mapreduce spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

# Score the test documents; cvModel applies the best model found during the search.
prediction = cvModel.transform(dataset=test)

# Keep only the columns of interest and print one Row per line.
selected = prediction.select(["id", "text", "probability", "prediction"])
for row in selected.collect():
    print(row)

Row(id=4, text='spark i j k', probability=DenseVector([0.0251, 0.9749]), prediction=1.0)
Row(id=5, text='l m n', probability=DenseVector([0.8148, 0.1852]), prediction=0.0)
Row(id=6, text='mapreduce spark', probability=DenseVector([0.4425, 0.5575]), prediction=1.0)
Row(id=7, text='apache hadoop', probability=DenseVector([0.764, 0.236]), prediction=0.0)
