[Covertype dataset](https://archive.ics.uci.edu/dataset/31/covertype) on UCI ML Repository

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

### Read the dataset

In [2]:
colnames = ["Elevation", "Aspect", "Slope", \
"Horizontal_Distance_To_Hydrology", \
"Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
"Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm", \
"Horizontal_Distance_To_Fire_Points"] + \
[f"Wilderness_Area_{i}" for i in range(4)] + \
[f"Soil_Type_{i}" for i in range(40)] + \
["Cover_Type"]

# alternatively, colnames = df.schema.names

In [3]:
df = spark.read.csv("data/covertype/covtype.data.gz", header=False, inferSchema=True)
df = df.toDF(*colnames)

df = df.na.drop() # handle missing values lol

                                                                                

### Split into testing and training data

In [None]:
train_data, test_data = df.randomSplit([0.9, 0.1])
train_data.cache()
test_data.cache()

### Collapse all those columns into a "feature vector"

In [5]:
from pyspark.ml.feature import VectorAssembler

input_cols = colnames[:-1]
vector_assembler = VectorAssembler(inputCols=input_cols, outputCol="featureVector")

assembled_train_data = vector_assembler.transform(train_data)
assembled_test_data = vector_assembler.transform(test_data)

assembled_train_data.select("featureVector").show(3, truncate=False)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------------------------------+
|featureVector                                                                                     |
+--------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1859.0,18.0,12.0,67.0,11.0,90.0,211.0,215.0,139.0,792.0,1.0,1.0])|
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1860.0,18.0,13.0,95.0,15.0,90.0,210.0,213.0,138.0,780.0,1.0,1.0])|
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1861.0,35.0,14.0,60.0,11.0,85.0,218.0,209.0,124.0,832.0,1.0,1.0])|
+--------------------------------------------------------------------------------------------------+
only showing top 3 rows



                                                                                

### Train the model

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier

classifier = DecisionTreeClassifier(seed=1234, labelCol=colnames[-1], featuresCol="featureVector", 
    predictionCol="prediction")

model = classifier.fit(assembled_train_data)
model.toDebugString

                                                                                

'DecisionTreeClassificationModel: uid=DecisionTreeClassifier_ccfd0a4b041f, depth=5, numNodes=47, numClasses=8, numFeatures=54\n  If (feature 0 <= 3047.5)\n   If (feature 0 <= 2561.5)\n    If (feature 10 <= 0.5)\n     If (feature 0 <= 2450.5)\n      If (feature 3 <= 15.0)\n       Predict: 4.0\n      Else (feature 3 > 15.0)\n       Predict: 3.0\n     Else (feature 0 > 2450.5)\n      If (feature 17 <= 0.5)\n       Predict: 2.0\n      Else (feature 17 > 0.5)\n       Predict: 3.0\n    Else (feature 10 > 0.5)\n     Predict: 2.0\n   Else (feature 0 > 2561.5)\n    If (feature 0 <= 2952.5)\n     If (feature 15 <= 0.5)\n      If (feature 17 <= 0.5)\n       Predict: 2.0\n      Else (feature 17 > 0.5)\n       Predict: 3.0\n     Else (feature 15 > 0.5)\n      Predict: 3.0\n    Else (feature 0 > 2952.5)\n     If (feature 3 <= 211.0)\n      If (feature 36 <= 0.5)\n       Predict: 2.0\n      Else (feature 36 > 0.5)\n       Predict: 1.0\n     Else (feature 3 > 211.0)\n      Predict: 2.0\n  Else (featur

### Evaluate

In [7]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(assembled_test_data)

evaluator = MulticlassClassificationEvaluator(labelCol=colnames[-1], predictionCol="prediction")
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
f1 = evaluator.setMetricName("f1").evaluate(predictions)

print("accuracy", accuracy)
print("f1", f1)

[Stage 21:>                                                         (0 + 1) / 1]

accuracy 0.7037918369458382
f1 0.686542263900111


                                                                                

In [None]:
confusion_matrix = predictions.groupBy(colnames[-1]) \
    .pivot("prediction") \
    .count() \
    .fillna(0) \
    .orderBy(colnames[-1])

confusion_matrix.show()

+----------+-----+-----+----+---+----+
|Cover_Type|  1.0|  2.0| 3.0|4.0| 7.0|
+----------+-----+-----+----+---+----+
|         1|14620| 6002|  23|  0| 517|
|         2| 5671|22074| 480| 10|  54|
|         3|    0|  605|2866| 71|   0|
|         4|    0|    4| 166|108|   0|
|         5|    1|  885|  82|  0|   0|
|         6|    0|  659| 944| 68|   0|
|         7|  926|    2|   8|  0|1147|
+----------+-----+-----+----+---+----+



### Hyperparameter Tuning

Not in manual again.

In [9]:
from pyspark.ml import Pipeline

assembler = VectorAssembler(inputCols=colnames[:-1], outputCol="featureVector")
classifier = DecisionTreeClassifier(featuresCol="featureVector", labelCol=colnames[-1], predictionCol="prediction")
pipeline = Pipeline(stages=[assembler, classifier])

`TrainValidationSplit` is like `CrossValidator`, but it performs the split only once. Good for speed.

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

grid = ParamGridBuilder() \
    .addGrid(classifier.impurity, ["gini", "entropy"]) \
    .addGrid(classifier.maxDepth, [1, 20]) \
    .addGrid(classifier.maxBins, [40, 300]) \
    .addGrid(classifier.minInfoGain, [0.0, 0.05]) \
    .build()

evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction", 
    metricName="accuracy")

validator = TrainValidationSplit(estimator=pipeline, evaluator=evaluator, 
    estimatorParamMaps=grid)
validator_model = validator.fit(train_data)

In [11]:
params = validator_model.getEstimatorParamMaps() # list of param maps for all models
metrics = validator_model.validationMetrics 

metrics_and_params = list(zip(metrics, params))
metrics_and_params.sort(key=lambda x: x[0], reverse=True)
metrics_and_params[0]

(0.9119171774464062,
 {Param(parent='DecisionTreeClassifier_3f718699c342', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'): 'entropy',
  Param(parent='DecisionTreeClassifier_3f718699c342', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 20,
  Param(parent='DecisionTreeClassifier_3f718699c342', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 40,
  Param(parent='DecisionTreeClassifier_3f718699c342', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0})

In [12]:
evaluator.evaluate(validator_model.bestModel.transform(test_data))

24/11/25 14:44:11 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB


0.915489800493163

The usage for `RandomForestClassifier` is the exact same. Even the hyperparameters are the same.