[Covertype dataset](https://archive.ics.uci.edu/dataset/31/covertype) on UCI ML Repository

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

### Read the dataset

In [8]:
df = spark.read.csv("data/covertype/covtype.data.gz", header=False, inferSchema=True)

                                                                                

### Split into testing and training data

In [3]:
train_data, test_data = df.randomSplit([0.9, 0.1])
train_data.cache()
test_data.cache()

DataFrame[_c0: int, _c1: int, _c2: int, _c3: int, _c4: int, _c5: int, _c6: int, _c7: int, _c8: int, _c9: int, _c10: int, _c11: int, _c12: int, _c13: int, _c14: int, _c15: int, _c16: int, _c17: int, _c18: int, _c19: int, _c20: int, _c21: int, _c22: int, _c23: int, _c24: int, _c25: int, _c26: int, _c27: int, _c28: int, _c29: int, _c30: int, _c31: int, _c32: int, _c33: int, _c34: int, _c35: int, _c36: int, _c37: int, _c38: int, _c39: int, _c40: int, _c41: int, _c42: int, _c43: int, _c44: int, _c45: int, _c46: int, _c47: int, _c48: int, _c49: int, _c50: int, _c51: int, _c52: int, _c53: int, _c54: int]

### Collapse all those columns into a "feature vector"

In [4]:
from pyspark.ml.feature import VectorAssembler

input_cols = df.schema.names[:-1]
vector_assembler = VectorAssembler(inputCols=input_cols, outputCol="featureVector")

assembled_train_data = vector_assembler.transform(train_data)
assembled_test_data = vector_assembler.transform(test_data)

assembled_train_data.select("featureVector").show(5)

[Stage 5:>                                                          (0 + 1) / 1]

+--------------------+
|       featureVector|
+--------------------+
|(54,[0,1,2,3,4,5,...|
|(54,[0,1,2,3,4,5,...|
|(54,[0,1,2,3,4,5,...|
|(54,[0,1,2,3,4,5,...|
|(54,[0,1,2,3,4,5,...|
+--------------------+
only showing top 5 rows



                                                                                

### Train the model

In [10]:
from pyspark.ml.classification import DecisionTreeClassifier

labelCol = df.schema.names[-1]
classifier = DecisionTreeClassifier(seed=1234, labelCol=labelCol, featuresCol="featureVector",
predictionCol="prediction")

model = classifier.fit(assembled_train_data)
print(model.toDebugString)

                                                                                

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_0a499d9ff701, depth=5, numNodes=43, numClasses=8, numFeatures=54
  If (feature 0 <= 3048.5)
   If (feature 0 <= 2561.5)
    If (feature 10 <= 0.5)
     If (feature 0 <= 2450.0)
      If (feature 3 <= 15.0)
       Predict: 4.0
      Else (feature 3 > 15.0)
       Predict: 3.0
     Else (feature 0 > 2450.0)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
    Else (feature 10 > 0.5)
     Predict: 2.0
   Else (feature 0 > 2561.5)
    If (feature 0 <= 2952.5)
     If (feature 15 <= 0.5)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
     Else (feature 15 > 0.5)
      Predict: 3.0
    Else (feature 0 > 2952.5)
     If (feature 3 <= 186.0)
      If (feature 36 <= 0.5)
       Predict: 2.0
      Else (feature 36 > 0.5)
       Predict: 1.0
     Else (feature 3 > 186.0)
      Predict: 2.0
  Else (feature 0 > 3048.5)
   If (feature 0 <= 

### Evaluate

In [6]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(assembled_test_data)

evaluator = MulticlassClassificationEvaluator(labelCol=labelCol, predictionCol="prediction")

accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
f1 = evaluator.setMetricName("f1").evaluate(predictions)

print("accuracy", accuracy)
print("f1", f1)

[Stage 24:>                                                         (0 + 1) / 1]

accuracy 0.7011925453012339
f1 0.6831750961501089


                                                                                

In [7]:
confusion_matrix = predictions.groupBy(labelCol) \
    .pivot("prediction") \
    .count() \
    .na.fill(0) \
    .orderBy(labelCol)

confusion_matrix.show()

+----+-----+-----+----+---+----+
|_c54|  1.0|  2.0| 3.0|4.0| 7.0|
+----+-----+-----+----+---+----+
|   1|14957| 5878|  17|  0| 430|
|   2| 5872|21842| 462| 13|  49|
|   3|    0|  606|2812| 82|   0|
|   4|    0|    6| 181|101|   0|
|   5|    1|  909|  78|  0|   0|
|   6|    0|  711|1023| 52|   0|
|   7|  985|    3|   6|  0|1035|
+----+-----+-----+----+---+----+

