In [1]:
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, udf, mean as _mean, stddev as _stddev, log, log10
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import lit
sc = SparkContext()
spark = SparkSession(sc)

In [2]:
# Prepare training documents, which are labeled.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0, 0.0),
    (1, "b d", 0.0, 1.0),
    (2, "spark f g h", 1.0, 0.0),
    (3, "hadoop mapreduce", 0.0, 1.0),
    (4, "b spark who", 1.0, 0.0),
    (5, "g d a y", 0.0, 1.0),
    (6, "spark fly", 1.0, 0.0),
    (7, "was mapreduce", 0.0, 1.0),
    (8, "e spark program", 1.0, 0.0),
    (9, "a e c l", 0.0, 1.0),
    (10, "spark compile", 1.0, 0.0),
    (11, "hadoop software", 0.0, 1.0)
], ["id", "text", "label1", "label2"])

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop")
], ["id", "text"])
training.show()
test.show()

+---+----------------+------+------+
| id|            text|label1|label2|
+---+----------------+------+------+
|  0| a b c d e spark|   1.0|   0.0|
|  1|             b d|   0.0|   1.0|
|  2|     spark f g h|   1.0|   0.0|
|  3|hadoop mapreduce|   0.0|   1.0|
|  4|     b spark who|   1.0|   0.0|
|  5|         g d a y|   0.0|   1.0|
|  6|       spark fly|   1.0|   0.0|
|  7|   was mapreduce|   0.0|   1.0|
|  8| e spark program|   1.0|   0.0|
|  9|         a e c l|   0.0|   1.0|
| 10|   spark compile|   1.0|   0.0|
| 11| hadoop software|   0.0|   1.0|
+---+----------------+------+------+

+---+---------------+
| id|           text|
+---+---------------+
|  4|    spark i j k|
|  5|          l m n|
|  6|mapreduce spark|
|  7|  apache hadoop|
+---+---------------+



In [22]:
from pyspark.ml import Transformer
class ProbTransformer(Transformer):
    
    def __init__(self, outputCol,
                 dropCols=['rawPrediction','probability','prediction'],
                 predict_col=['probability','prediction'],
                 method=lambda prob_col, pred_col: float(pred_col if len(prob_col) == 1 else prob_col[1])):
        self.outputCol = outputCol
        self.dropCols = dropCols
        self.predict_col = predict_col
        self.method = method
        self._paramMap = []
        self._params = None
    
    def transform(self, data):
        get_predict = udf(self.method,FloatType())
        return data.withColumn(self.outputCol, get_predict(*self.predict_col)).drop(*self.dropCols)

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

labels = ['label1','label2']
feature_col = 'features'
hyperparameters = {'maxIter':10}
prob_estimator = ProbEstimator
clf = LogisticRegression
mlc = []
for label in labels:
    mlc.append(clf(labelCol=label, featuresCol=feature_col, **hyperparameters))
    mlc.append(ProbTransformer(outputCol=label+'_prob'))

pipeline = Pipeline(stages=[tokenizer, hashingTF]+mlc)
model = pipeline.fit(training)
prediction = model.transform(training)
prediction.show()
# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
multiClassCrossValidator
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000])
#set searchspace to classifiers
for i in range(0, len(mlc), 2):
    paramGrid = paramGrid.addGrid(mlc[i].regParam, [0.1, 0.01])
paramGrid = paramGrid.build()
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)
prediction = cvModel.transform(training)
prediction.show()

+---+----------------+------+------+--------------------+--------------------+-------------+------------+
| id|            text|label1|label2|               words|            features|  label1_prob| label2_prob|
+---+----------------+------+------+--------------------+--------------------+-------------+------------+
|  0| a b c d e spark|   1.0|   0.0|[a, b, c, d, e, s...|(262144,[74920,89...|    0.9987584| 0.001241599|
|  1|             b d|   0.0|   1.0|              [b, d]|(262144,[89530,14...| 0.0019675856|   0.9980324|
|  2|     spark f g h|   1.0|   0.0|    [spark, f, g, h]|(262144,[36803,17...|    0.9998865|1.1350646E-4|
|  3|hadoop mapreduce|   0.0|   1.0| [hadoop, mapreduce]|(262144,[132966,1...|  1.298185E-4|   0.9998702|
|  4|     b spark who|   1.0|   0.0|     [b, spark, who]|(262144,[143741,1...|    0.9998029|    1.971E-4|
|  5|         g d a y|   0.0|   1.0|        [g, d, a, y]|(262144,[36803,89...|1.04199105E-4|   0.9998958|
|  6|       spark fly|   1.0|   0.0|        [s

IllegalArgumentException: rawPrediction does not exist. Available: id, text, label1, label2, CrossValidator_403e8b6600cf_rand, words, features, label1_prob, label2_prob

In [20]:
from tqdm import tqdm
from pyspark.ml import Estimator
from pyspark.ml import Pipeline
# Multilabel Classifier
class MultiLabelClassifier(Estimator):
    
    def __init__(self, clf, labels, feature_col,   
                 predict_col=['probability','prediction'],
                 method=lambda prob_col, pred_col: float(pred_col if len(prob_col) == 1 else prob_col[1]), \
                **hyperparameters):
        '''
        Initialize a multilabelclassifier
        clf: the model to use
        labels: a list of labels to predict
        feature_col: the feature column
        predict_col: the prediction column where the prediction is located
        hyperparameters: all optional hyperparameters that can tune
        method: a method of how to get the final prediction for one class
        '''
        self.clf = clf
        self.labels = labels
        self.feature_col = feature_col
        self.predict_col = predict_col
        self.hyperparameters = hyperparameters
        self.method = method
        
        self._trained_clfs = Pipeline(stages=[clf(labelCol=label, featuresCol=feature_col, **self.hyperparameters)
                                             for label in labels])
        self.res = None

    def fit(self, train):
        self._trained_clfs = self._trained_clfs.fit(train)
        return self
    
    def transform(self, test):
        #target assembler
        va = VectorAssembler(inputCols=self.labels, outputCol='targets')
        ## transform this vector self.output_col to an array
        select_cols = [self.feature_col, 'targets', 'sig_id']
        res = va.transform(x_test).select(*select_cols)
        res = self._trained_clfs.transform(res)
        return res
    '''
    def transform(self, x_test):
        # convert method to udf
        get_predict = udf(self.method,FloatType())
        #target assembler
        va = VectorAssembler(inputCols=self.labels, outputCol='targets')
        ## transform this vector self.output_col to an array
        select_cols = [self.feature_col, 'targets', 'sig_id']
        res = va.transform(x_test).select(*select_cols)
        for i, clf in tqdm(enumerate(self._trained_clfs)):
            res = clf.transform(res)
            new_col = self.labels[i]
            res = res.withColumn(new_col, get_predict(*self.predict_col))
            select_cols.append(new_col)
            res = res.select(*select_cols)
        self.res = res
        return res.select(*select_cols[2:])
    '''
    def score(self):
        #target assembler
        va = VectorAssembler(inputCols=self.labels, outputCol='predicts')
        ## transform this vector self.output_col to an array
        df = va.transform(self.res).select('targets', 'predicts')
        df = df.withColumn('targets', vector_to_array('targets'))
        df = df.withColumn('predicts', vector_to_array('predicts'))
        import math
        @udf('double')
        def log_loss(y, y_hat):
            r = 0
            cut = 1e-15
            for t, p in zip(y, y_hat):
                p = max(min(p, 1-cut),cut)
                r += t * math.log(p) + (1 - t) * math.log(1 - p)
            return r/len(y)
        df = df.select(log_loss('targets','predicts').alias('log_loss'))
        return df.select((-_mean(col('log_loss'))).alias('score'))
        

In [21]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
mlc = MultiLabelClassifier(LogisticRegression, 
                           labels=['label1','label2'], 
                           feature_col='features',
                           maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, mlc])
model = pipeline.fit(training)
prediction = model.transform(training)
prediction.show()
'''
# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
selected.show()
'''

Py4JJavaError: An error occurred while calling o8677.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 920.0 failed 1 times, most recent failure: Lost task 0.0 in stage 920.0 (TID 3451, nick-Z370-AORUS-Gaming-WIFI.hitronhub.home, executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2194)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1157)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1151)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1220)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1196)
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:499)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:487)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:482)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:281)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:150)
	at jdk.internal.reflect.GeneratedMethodAccessor203.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)


In [4]:
from pyspark.mllib.evaluation import MultilabelMetrics

scoreAndLabels = sc.parallelize([
    ([0.0, 1.0], [0.0, 2.0]),
    ([0.0, 2.0], [0.0, 1.0]),
    ([], [0.0]),
    ([2.0], [2.0]),
    ([2.0, 0.0], [2.0, 0.0]),
    ([0.0, 1.0, 2.0], [0.0, 1.0]),
    ([1.0], [1.0, 2.0])])

# Instantiate metrics object
metrics = MultilabelMetrics(scoreAndLabels)

# Summary stats
print("Recall = %s" % metrics.recall())
print("Precision = %s" % metrics.precision())
print("F1 measure = %s" % metrics.f1Measure())
print("Accuracy = %s" % metrics.accuracy)

# Individual label stats
labels = scoreAndLabels.flatMap(lambda x: x[1]).distinct().collect()
for label in labels:
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.f1Measure(label)))

# Micro stats
print("Micro precision = %s" % metrics.microPrecision)
print("Micro recall = %s" % metrics.microRecall)
print("Micro F1 measure = %s" % metrics.microF1Measure)

# Hamming loss
print("Hamming loss = %s" % metrics.hammingLoss)

# Subset accuracy
print("Subset accuracy = %s" % metrics.subsetAccuracy)

Recall = 0.6428571428571429
Precision = 0.6666666666666666
F1 measure = 0.6380952380952382
Accuracy = 0.5476190476190476
Class 0.0 precision = 1.0
Class 0.0 recall = 0.8
Class 0.0 F1 Measure = 0.888888888888889
Class 1.0 precision = 0.6666666666666666
Class 1.0 recall = 0.6666666666666666
Class 1.0 F1 Measure = 0.6666666666666666
Class 2.0 precision = 0.5
Class 2.0 recall = 0.5
Class 2.0 F1 Measure = 0.5
Micro precision = 0.7272727272727273
Micro recall = 0.6666666666666666
Micro F1 measure = 0.6956521739130435
Hamming loss = 0.3333333333333333
Subset accuracy = 0.2857142857142857
