In [1]:
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, udf, mean as _mean, stddev as _stddev, log, log10
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import lit
sc = SparkContext()
spark = SparkSession(sc)

In [2]:
# Prepare training documents, which are labeled.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0, 0.0),
    (1, "b d", 0.0, 1.0),
    (2, "spark f g h", 1.0, 0.0),
    (3, "hadoop mapreduce", 0.0, 1.0),
    (4, "b spark who", 1.0, 0.0),
    (5, "g d a y", 0.0, 1.0),
    (6, "spark fly", 1.0, 0.0),
    (7, "was mapreduce", 0.0, 1.0),
    (8, "e spark program", 1.0, 0.0),
    (9, "a e c l", 0.0, 1.0),
    (10, "spark compile", 1.0, 0.0),
    (11, "hadoop software", 0.0, 1.0)
], ["id", "text", "label1", "label2"])

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop")
], ["id", "text"])
training.show()
test.show()

+---+----------------+------+------+
| id|            text|label1|label2|
+---+----------------+------+------+
|  0| a b c d e spark|   1.0|   0.0|
|  1|             b d|   0.0|   1.0|
|  2|     spark f g h|   1.0|   0.0|
|  3|hadoop mapreduce|   0.0|   1.0|
|  4|     b spark who|   1.0|   0.0|
|  5|         g d a y|   0.0|   1.0|
|  6|       spark fly|   1.0|   0.0|
|  7|   was mapreduce|   0.0|   1.0|
|  8| e spark program|   1.0|   0.0|
|  9|         a e c l|   0.0|   1.0|
| 10|   spark compile|   1.0|   0.0|
| 11| hadoop software|   0.0|   1.0|
+---+----------------+------+------+

+---+---------------+
| id|           text|
+---+---------------+
|  4|    spark i j k|
|  5|          l m n|
|  6|mapreduce spark|
|  7|  apache hadoop|
+---+---------------+



In [9]:
from pyspark.ml import Transformer
class ProbTransformer(Transformer):
    
    def __init__(self, outputCol,
                 dropCols=['rawPrediction','probability','prediction'],
                 predict_col=['probability','prediction'],
                 method=lambda prob_col, pred_col: float(pred_col if len(prob_col) == 1 else prob_col[1])):
        self.outputCol = outputCol
        self.dropCols = dropCols
        self.predict_col = predict_col
        self.method = method
    
    def transform(self, data):
        get_predict = udf(self.method,FloatType())
        return data.withColumn(self.outputCol, get_predict(*self.predict_col)).drop(*self.dropCols)

In [20]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(labelCol='label1', featuresCol='features',maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
pipeline.fit(training).transform(training).show()
# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(labelCol='label1'),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)

+---+----------------+------+------+--------------------+--------------------+--------------------+--------------------+----------+
| id|            text|label1|label2|               words|            features|       rawPrediction|         probability|prediction|
+---+----------------+------+------+--------------------+--------------------+--------------------+--------------------+----------+
|  0| a b c d e spark|   1.0|   0.0|[a, b, c, d, e, s...|(262144,[74920,89...|[-6.6901128719233...|[0.00124159896478...|       1.0|
|  1|             b d|   0.0|   1.0|              [b, d]|(262144,[89530,14...|[6.22897857647562...|[0.99803241445632...|       0.0|
|  2|     spark f g h|   1.0|   0.0|    [spark, f, g, h]|(262144,[36803,17...|[-9.0835373227814...|[1.13506456672275...|       1.0|
|  3|hadoop mapreduce|   0.0|   1.0| [hadoop, mapreduce]|(262144,[132966,1...|[8.94924347957775...|[0.99987018150903...|       0.0|
|  4|     b spark who|   1.0|   0.0|     [b, spark, who]|(262144,[143741,1..

In [24]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

'''
# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

labels = ['label1','label2']
feature_col = 'features'
hyperparameters = {'maxIter':10}
clf = LogisticRegression
mlc = []
for label in labels:
    mlc.append(clf(labelCol=label, featuresCol=feature_col, **hyperparameters))
    mlc.append(ProbTransformer(outputCol=label+'_prob'))

pipeline = Pipeline(stages=[tokenizer, hashingTF]+mlc)
model = pipeline.fit(training)
prediction = model.transform(training)
prediction.show()
'''
labels = ['label1','label2']
feature_col = 'features'
hyperparameters = {'maxIter':10}
clf = LogisticRegression
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol=feature_col)
feature_eng_pipe = Pipeline(stages=[tokenizer, hashingTF])
X = tokenizer.transform(training)
X = hashingTF.transform(X)
X = X.select(*['id','features','label1','label2'])
X.show()
# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.

stages = []
for label in labels:
    model = clf(labelCol=label, featuresCol=feature_col, **hyperparameters)
    pipeline = Pipeline(stages=[model])
    paramGrid = ParamGridBuilder() \
    .addGrid(model.regParam, [0.1, 0.01]) \
    .build()
    crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(labelCol=label),
                          numFolds=2)  # use 3+ folds in practice
    probTransformer = ProbTransformer(outputCol=label+'_prob')
    stages.append(crossval)
    stages.append(probTransformer)
pipeline = Pipeline(stages=stages)
model = pipeline.fit(X)
prediction = model.transform(X)
prediction.show()

+---+--------------------+------+------+
| id|            features|label1|label2|
+---+--------------------+------+------+
|  0|(262144,[74920,89...|   1.0|   0.0|
|  1|(262144,[89530,14...|   0.0|   1.0|
|  2|(262144,[36803,17...|   1.0|   0.0|
|  3|(262144,[132966,1...|   0.0|   1.0|
|  4|(262144,[143741,1...|   1.0|   0.0|
|  5|(262144,[36803,89...|   0.0|   1.0|
|  6|(262144,[39928,17...|   1.0|   0.0|
|  7|(262144,[99211,13...|   0.0|   1.0|
|  8|(262144,[76285,16...|   1.0|   0.0|
|  9|(262144,[1303,749...|   0.0|   1.0|
| 10|(262144,[109869,1...|   1.0|   0.0|
| 11|(262144,[123474,1...|   0.0|   1.0|
+---+--------------------+------+------+



AttributeError: 'PipelineModel' object has no attribute 'regParam'

In [20]:
from tqdm import tqdm
from pyspark.ml import Estimator
from pyspark.ml import Pipeline
# Multilabel Classifier
class MultiLabelClassifier(Estimator):
    
    def __init__(self, clf, labels, feature_col,   
                 predict_col=['probability','prediction'],
                 method=lambda prob_col, pred_col: float(pred_col if len(prob_col) == 1 else prob_col[1]), \
                **hyperparameters):
        '''
        Initialize a multilabelclassifier
        clf: the model to use
        labels: a list of labels to predict
        feature_col: the feature column
        predict_col: the prediction column where the prediction is located
        hyperparameters: all optional hyperparameters that can tune
        method: a method of how to get the final prediction for one class
        '''
        self.clf = clf
        self.labels = labels
        self.feature_col = feature_col
        self.predict_col = predict_col
        self.hyperparameters = hyperparameters
        self.method = method
        
        self._trained_clfs = Pipeline(stages=[clf(labelCol=label, featuresCol=feature_col, **self.hyperparameters)
                                             for label in labels])
        self.res = None

    def fit(self, train):
        self._trained_clfs = self._trained_clfs.fit(train)
        return self
    
    def transform(self, test):
        #target assembler
        va = VectorAssembler(inputCols=self.labels, outputCol='targets')
        ## transform this vector self.output_col to an array
        select_cols = [self.feature_col, 'targets', 'sig_id']
        res = va.transform(x_test).select(*select_cols)
        res = self._trained_clfs.transform(res)
        return res
    '''
    def transform(self, x_test):
        # convert method to udf
        get_predict = udf(self.method,FloatType())
        #target assembler
        va = VectorAssembler(inputCols=self.labels, outputCol='targets')
        ## transform this vector self.output_col to an array
        select_cols = [self.feature_col, 'targets', 'sig_id']
        res = va.transform(x_test).select(*select_cols)
        for i, clf in tqdm(enumerate(self._trained_clfs)):
            res = clf.transform(res)
            new_col = self.labels[i]
            res = res.withColumn(new_col, get_predict(*self.predict_col))
            select_cols.append(new_col)
            res = res.select(*select_cols)
        self.res = res
        return res.select(*select_cols[2:])
    '''
    def score(self):
        #target assembler
        va = VectorAssembler(inputCols=self.labels, outputCol='predicts')
        ## transform this vector self.output_col to an array
        df = va.transform(self.res).select('targets', 'predicts')
        df = df.withColumn('targets', vector_to_array('targets'))
        df = df.withColumn('predicts', vector_to_array('predicts'))
        import math
        @udf('double')
        def log_loss(y, y_hat):
            r = 0
            cut = 1e-15
            for t, p in zip(y, y_hat):
                p = max(min(p, 1-cut),cut)
                r += t * math.log(p) + (1 - t) * math.log(1 - p)
            return r/len(y)
        df = df.select(log_loss('targets','predicts').alias('log_loss'))
        return df.select((-_mean(col('log_loss'))).alias('score'))
        