In [1]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer
import pyspark.ml.evaluation as ev

# Baseline: elastic-net logistic regression on the sample LIBSVM data, scored on
# a held-out split with area under ROC and area under PR.
# `spark` is expected to be supplied by the notebook environment.

# Load training data
training = spark.read.format("libsvm").load("/FileStore/tables/sample_libsvm_data.txt")

# Indexers are only declared here (not fitted); featureIndexer is reused by a
# later cell.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)

# 70/30 train/test split. The fixed seed keeps the split -- and therefore the
# metrics printed below -- repeatable between runs.
(trainingData, testData) = training.randomSplit([0.7, 0.3], seed=42)

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol='label')

# NOTE(review): this pipeline is assembled but never fitted in this cell. The
# model below is trained by calling lr.fit() directly, and lr is configured with
# labelCol='label' and no featuresCol, so it does not read the indexed columns.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, lr])

# Fit the model
lrModel = lr.fit(trainingData)
# Despite its name, test_model is the DataFrame of predictions on the test split.
test_model = lrModel.transform(testData)

# The evaluator ranks rows by the 'probability' column against the raw 'label'.
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='label')

# The metric is chosen per call through a param-map override.
print(' BinaryClassification areaUnderROC:', evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print('BinaryClassification areaUnderPR:', evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))


In [2]:
import pyspark.ml.tuning as tune

# Estimator to be tuned; every hyper-parameter except the label column is left
# at its default and overridden per grid point.
logistic = LogisticRegression(labelCol='label')

# Cartesian grid over iteration budget and regularisation strength
# (5 x 5 = 25 candidate settings).
grid = (
    tune.ParamGridBuilder()
    .addGrid(logistic.maxIter, [0, 1, 2, 10, 50])
    .addGrid(logistic.regParam, [0.002, 0.003, 0.001, 0.005, 0.3])
    .build()
)

In [3]:
# Cross-validate `logistic` over every setting in `grid`, scoring each fold with
# `evaluator`. All three, plus `trainingData` and `featureIndexer`, are defined
# in earlier cells.
cv = tune.CrossValidator(
    estimator=logistic,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    seed=42,  # fixed seed so the fold assignment is repeatable between runs
)

# Fit the feature indexer on the training split only, then cross-validate on the
# transformed training data.
# NOTE(review): `logistic` is constructed without featuresCol, so it presumably
# trains on the default "features" column rather than the "indexedFeatures"
# column produced here -- confirm whether the indexing step is meant to be used.
pipeline = Pipeline(stages=[featureIndexer])
data_transformer = pipeline.fit(trainingData)
cvModel = cv.fit(data_transformer.transform(trainingData))

In [4]:
# Pair every evaluated parameter map with its fold-averaged metric. Each map is
# rendered as a list of single-entry {param name: value} dicts for readability.
results = [
    ([{param.name: value} for param, value in param_map.items()], avg_metric)
    for param_map, avg_metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
]

# Order by metric, best first, and display only the winning combination.
sorted(results, key=lambda pair: pair[1], reverse=True)[0]

In [5]:
# The spark.ml implementation supports decision trees for binary and multiclass classification and for regression, using both continuous and categorical features.
# Param name    Type(s)    Default       Description
# labelCol      Double     "label"       Label to predict
# featuresCol   Vector     "features"    Feature vector

# GBTs train one tree at a time, so they can take longer to train than Random Forests. Random Forests can train multiple trees in parallel.
# On the other hand, it is often reasonable to use smaller (shallower) trees with GBTs than with Random Forests, and training smaller trees takes less time.
# Random Forests can be less prone to overfitting. Training more trees in a Random Forest reduces the likelihood of overfitting, but training more trees with GBTs increases the likelihood of overfitting. (In statistical language, Random Forests reduce variance by using more trees, whereas GBTs reduce bias by using more trees.)
# Random Forests can be easier to tune since performance improves monotonically with the number of trees (whereas performance can start to decrease for GBTs if the number of trees grows too large).


from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
# The alias must be `ev`: the evaluator below is referenced through it, and this
# keeps the import from rebinding the notebook's `cv` cross-validator variable.
import pyspark.ml.evaluation as ev

# Load the data stored in LIBSVM format as a DataFrame.
data = spark.read.format("libsvm").load("/FileStore/tables/sample_libsvm_data.txt")

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing).
# The fixed seed keeps the split, and so the reported accuracy, repeatable.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

# Configure a DecisionTree classifier on the indexed columns.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = ev.MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
# The printed value is the accuracy metric, so it is labelled as such.
print("Accuracy:", accuracy)
print("Test Error = %g " % (1.0 - accuracy))


In [6]:
from pyspark.ml import Pipeline

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
import pyspark.ml.evaluation as ev

# Gradient-boosted trees on the sample LIBSVM data: index labels and features,
# train on a 70% split, and report the error rate on the remaining 30%.
data = spark.read.format("libsvm").load("/FileStore/tables/sample_libsvm_data.txt")

# Both indexers are fitted on the full dataset before the split.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Features with more than 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# The fixed seed keeps the train/test split, and so the reported error, repeatable.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

# Fitting the pipeline runs the indexers and trains the GBT model.
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

evaluator = ev.MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Stage index 2 is the fitted GBT model (after the two indexers).
gbtModel = model.stages[2]
print(gbtModel)