In [None]:
!pip install pyspark

In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook under the application name "Trees".
spark = (
    SparkSession.builder
    .appName("Trees")
    .getOrCreate()
)

In [2]:
# Read the sample file with Spark's "libsvm" reader; the path is relative to
# the notebook's working directory.
data = (
    spark.read
    .format("libsvm")
    .load("../data/sample_libsvm_data.txt")
)

# Preview the loaded rows.
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [3]:
# Print the column names and types of the loaded DataFrame.
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [4]:
# Randomly split the rows into a training and a test set with weights
# 0.7 / 0.3; the fixed seed keeps the split repeatable between runs.
train, test = data.randomSplit([0.7, 0.3], seed=42)

## DecisionTreeClassifier

In [5]:
from pyspark.ml.classification import DecisionTreeClassifier

In [6]:
# Configure a single decision tree: it reads the "features" column, learns
# the "label" column, writes to "prediction", and is capped at maxDepth 10.
dtc = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    maxDepth=10,
)

# Fit on the training split, then score the held-out test split.
model = dtc.fit(train)
y_hat = model.transform(test)

# Show the first 10 scored rows.
y_hat.show(10)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[100,101,102...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[125,126,127...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[126,127,128...|   [24.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 10 rows



In [7]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [9]:
# Evaluate the predictions currently held in y_hat with the "f1" metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
f1 = evaluator.evaluate(y_hat)

print(f1)

0.9428571428571428


In [10]:
# Evaluate the predictions currently held in y_hat with the "accuracy" metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(y_hat)

print(accuracy)

0.9428571428571428


## RandomForestClassifier

In [11]:
from pyspark.ml.classification import RandomForestClassifier

In [12]:
# Configure a random forest of 100 trees over the same feature/label columns.
# A forest bootstraps rows and subsamples features, so pin the seed (same
# value as the train/test split) to keep the fitted model and the metrics
# computed from it repeatable across kernel restarts.
rfc = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    numTrees=100,
    seed=42,
)

# Fit on the training split, then score the held-out test split.
model = rfc.fit(train)
y_hat = model.transform(test)

# Show the first 10 scored rows.
y_hat.show(10)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[100,101,102...|  [60.0,40.0]|  [0.6,0.4]|       0.0|
|  0.0|(692,[123,124,125...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[123,124,125...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[124,125,126...|   [95.0,5.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[124,125,126...|   [92.0,8.0]|[0.92,0.08]|       0.0|
|  0.0|(692,[124,125,126...|   [96.0,4.0]|[0.96,0.04]|       0.0|
|  0.0|(692,[125,126,127...|   [91.0,9.0]|[0.91,0.09]|       0.0|
|  0.0|(692,[126,127,128...|   [99.0,1.0]|[0.99,0.01]|       0.0|
|  0.0|(692,[126,127,128...|   [95.0,5.0]|[0.95,0.05]|       0.0|
|  0.0|(692,[126,127,128...|   [91.0,9.0]|[0.91,0.09]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 10 rows



In [13]:
# Evaluate the random-forest predictions in y_hat with the "f1" metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
f1 = evaluator.evaluate(y_hat)

print(f1)

1.0


In [14]:
# Evaluate the random-forest predictions in y_hat with the "accuracy" metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(y_hat)

print(accuracy)

1.0


## GBTClassifier

In [15]:
from pyspark.ml.classification import GBTClassifier

In [16]:
# Configure gradient-boosted trees over the same feature/label columns,
# with maxIter set to 100.
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
    maxIter=100,
)

# Fit on the training split, then score the held-out test split.
model = gbt.fit(train)
y_hat = model.transform(test)

# Show the first 10 scored rows.
y_hat.show(10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[100,101,102...|[1.38339132415704...|[0.94085420246358...|       0.0|
|  0.0|(692,[123,124,125...|[2.22183994287119...|[0.98838390881263...|       0.0|
|  0.0|(692,[123,124,125...|[2.22183994287119...|[0.98838390881263...|       0.0|
|  0.0|(692,[124,125,126...|[2.22183994287119...|[0.98838390881263...|       0.0|
|  0.0|(692,[124,125,126...|[1.90441802777121...|[0.97830704577025...|       0.0|
|  0.0|(692,[124,125,126...|[2.15150698495498...|[0.98665283147970...|       0.0|
|  0.0|(692,[125,126,127...|[2.22183994287119...|[0.98838390881263...|       0.0|
|  0.0|(692,[126,127,128...|[2.22183994287119...|[0.98838390881263...|       0.0|
|  0.0|(692,[126,127,128...|[2.10530198645225...|[0.98537952564698...|       0.0|
|  0.0|(692,[126

In [17]:
# Evaluate the gradient-boosted-tree predictions in y_hat with the "f1" metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
f1 = evaluator.evaluate(y_hat)

print(f1)

0.9428571428571428


In [18]:
# Evaluate the gradient-boosted-tree predictions in y_hat with the "accuracy" metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(y_hat)

print(accuracy)

0.9428571428571428


In [None]:
################################################################################################################################