# Kaggle Competition - Cover Type Prediction of Forests
### Using Tree-based Machine Learning algorithms to predict cover type of forests

##### Importing useful libraries

In [3]:
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import udf, array
from pyspark.sql.types import FloatType, IntegerType
import time

##### Loading data

In [5]:
def _read_csv(path):
  """Load the CSV at `path` (header row expected) and let Spark infer the column types."""
  reader = sqlContext.read.format('com.databricks.spark.csv')
  return reader.options(header='true', inferSchema='true').load(path)

train_data = _read_csv('/FileStore/tables/train_set-51e11.csv')
test_data = _read_csv('/FileStore/tables/test_set-b5f57.csv')

# Each count() triggers a Spark job over the corresponding file
n_train_rows = train_data.count()
n_test_rows = test_data.count()
print('Training set length:', n_train_rows)
print('Test set length:', n_test_rows)

In [6]:
# Show the column names and the types inferred by the CSV reader
train_data.printSchema()

In [7]:
# Render the training set as a table.
# NOTE(review): `display` is not imported in this notebook -- presumably the notebook platform's built-in; confirm.
display(train_data)

Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
1,2611,326,20,120,27,1597,168,214,184,2913,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6
2,2772,324,17,42,7,1814,175,220,183,2879,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,2764,4,14,480,-21,700,201,212,148,700,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
4,3032,342,9,60,8,4050,202,227,164,2376,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
5,2488,23,11,117,21,1117,209,218,151,1136,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
6,2968,83,8,390,19,4253,232,226,127,4570,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
7,3027,11,6,534,47,1248,214,228,151,2388,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2
8,3216,277,9,67,23,5430,212,236,169,2373,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,3242,262,5,849,169,1672,207,242,173,691,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10,3315,61,15,120,-6,3042,231,208,106,1832,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,7


##### Preprocessing dataset
I noticed that the features related to hillshade have some aberrant values. As per data description, their values should be between 0 and 255, however, we can see some values outside this range.

In [9]:
# Summary statistics of Hillshade_Noon, to check its min/max against the expected 0-255 range
train_data.select('Hillshade_Noon').describe().show()

In [10]:
# Summary statistics of Hillshade_9am, to check its min/max against the expected 0-255 range
train_data.select('Hillshade_9am').describe().show()

In [11]:
# Summary statistics of Hillshade_3pm, to check its min/max against the expected 0-255 range
train_data.select('Hillshade_3pm').describe().show()

Therefore, I decided to clean out these values. My strategy was to replace each aberrant value with an average of the valid values: the average for the row's Cover_Type category in the training set, and the overall average in the test set. This pre-processing gave better scores on submission than the raw data.

In [13]:
# Dealing with outliers and aberrant values
def _valid_hillshade(df, column):
  """Keep the rows of `df` whose `column` lies in (0, 255]; return only that column and Cover_Type.

  The lower bound is strict, so a reading of exactly 0 is treated as aberrant as well.
  The column name is spelled identically in both comparisons, so the filter does not
  depend on Spark resolving column names case-insensitively.
  """
  return df.filter('{0} <= 255 and {0} > 0'.format(column)).select(column, 'Cover_Type')

def _means_by_cover_type(valid_df):
  """Collect the per-Cover_Type mean as a list of Rows (Cover_Type, mean), ordered by Cover_Type."""
  return valid_df.groupBy('Cover_Type').mean().orderBy('Cover_Type').collect()

def _overall_mean(valid_df, column):
  """Collect the mean of `column` over all rows of `valid_df` as a single Python value."""
  return valid_df.agg({column: "avg"}).collect()[0][0]

# Filtering train data for correct values
filtered_noon = _valid_hillshade(train_data, 'Hillshade_Noon')
filtered_3pm = _valid_hillshade(train_data, 'Hillshade_3pm')
filtered_9am = _valid_hillshade(train_data, 'Hillshade_9am')

# Means to be used for aberrant values in the training data: one mean per category
means_noon = _means_by_cover_type(filtered_noon)
means_3pm = _means_by_cover_type(filtered_3pm)
means_9am = _means_by_cover_type(filtered_9am)

# Means to be used for aberrant values in the test data: overall means
mean_noon = _overall_mean(filtered_noon, 'Hillshade_Noon')
mean_3pm = _overall_mean(filtered_3pm, 'Hillshade_3pm')
mean_9am = _overall_mean(filtered_9am, 'Hillshade_9am')

In [14]:
# Creating udfs
def _is_aberrant(hillshade):
  """True when a hillshade reading is missing or falls outside the accepted (0, 255] range."""
  return hillshade is None or hillshade > 255 or hillshade <= 0

def _train_hillshade_udf(means_rows):
  """Build a UDF (cover, hillshade) -> float replacing aberrant readings with the mean of that cover type.

  `means_rows` is the collected list of Rows (Cover_Type, mean). The means are keyed by the
  Cover_Type value itself instead of by list position, so the lookup stays correct even if
  some cover type has no valid reading and is therefore absent from the list (a missing
  type then fails loudly with a KeyError instead of silently using another type's mean).
  """
  mean_by_cover = {row['Cover_Type']: row[1] for row in means_rows}
  return udf(lambda cover, hillshade: float(mean_by_cover[cover] if _is_aberrant(hillshade) else hillshade), FloatType())

def _test_hillshade_udf(overall_mean):
  """Build a UDF hillshade -> float replacing aberrant readings with `overall_mean`."""
  return udf(lambda hillshade: float(overall_mean if _is_aberrant(hillshade) else hillshade), FloatType())

# For training set
transf_noon = _train_hillshade_udf(means_noon)
transf_3pm = _train_hillshade_udf(means_3pm)
transf_9am = _train_hillshade_udf(means_9am)
# For test set
transf_test_noon = _test_hillshade_udf(mean_noon)
transf_test_3pm = _test_hillshade_udf(mean_3pm)
transf_test_9am = _test_hillshade_udf(mean_9am)

In [15]:
# Transforming training data: each hillshade column is rewritten in place by its UDF,
# which also receives the row's Cover_Type
train_fixes = [('Hillshade_Noon', transf_noon), ('Hillshade_3pm', transf_3pm), ('Hillshade_9am', transf_9am)]
for hillshade_col, fix in train_fixes:
  train_data = train_data.withColumn(hillshade_col, fix('Cover_Type', hillshade_col))

# Transforming test data: same columns, but the test UDFs only take the hillshade value
test_fixes = [('Hillshade_Noon', transf_test_noon), ('Hillshade_3pm', transf_test_3pm), ('Hillshade_9am', transf_test_9am)]
for hillshade_col, fix in test_fixes:
  test_data = test_data.withColumn(hillshade_col, fix(hillshade_col))

##### Working with features
I decided to create new features based on combinations of the different distances provided.

I also decided to add new features (related to climatology and geology) based on soil type. This information is available in the section ***Study Code USFS ELU Code Description*** of the data description.

Both operations showed slightly better scores after submission.

In [17]:
def create_features(df):
  """Return `df` extended with nine distance-based columns.

  Adds the straight-line distance to hydrology (from its horizontal and vertical
  components), then the sum and the difference of four pairs of existing columns.
  """
  hydro_h = 'Horizontal_Distance_To_Hydrology'
  hydro_v = 'Vertical_Distance_To_Hydrology'
  fire = 'Horizontal_Distance_To_Fire_Points'
  road = 'Horizontal_Distance_To_Roadways'

  # Euclidean norm of the horizontal and vertical distances to hydrology
  squared_norm = df[hydro_h]**2 + df[hydro_v]**2
  df = df.withColumn('Distance_to_Hydrolody', squared_norm**(1/2))

  # (name of the sum column, name of the difference column, left operand, right operand)
  pairs = [
    ('Ele_plus_VDtHyd', 'Ele_minus_VDtHyd', 'Elevation', hydro_v),
    ('Hydro_plus_Fire', 'Hydro_minus_Fire', hydro_h, fire),
    ('Hydro_plus_Road', 'Hydro_minus_Road', hydro_h, road),
    ('Fire_plus_Road', 'Fire_minus_Road', fire, road),
  ]
  for sum_name, diff_name, left, right in pairs:
    df = df.withColumn(sum_name, df[left] + df[right])
    df = df.withColumn(diff_name, df[left] - df[right])
  return df

def _soil_group_flag(df, soil_numbers):
  """Build an int column equal to 1 when any of the Soil_Type<N> indicators in `soil_numbers` is 1, else 0."""
  flag = None
  for number in soil_numbers:
    indicator = df['Soil_Type' + str(number)] == 1
    flag = indicator if flag is None else (flag | indicator)
  return flag.cast('int')

def add_features(df):
  """Return `df` extended with climatic (clim*) and geologic (geo*) indicator columns.

  Each new column is 1 when the row's soil type belongs to the corresponding group of
  Soil_Type columns and 0 otherwise. Columns are added in the order listed below.
  """
  soil_groups = [
    ('clim2', range(1, 7)),
    ('clim3', [7, 8]),
    ('clim4', range(9, 14)),
    ('clim5', [14, 15]),
    ('clim6', [16, 17, 18]),
    ('clim7', range(19, 35)),
    ('clim8', range(35, 41)),
    # geo1 contains soil type 20, not 10: type 10 belongs to geo7 (listed there), and
    # without this entry type 20 would belong to no geologic group at all.
    ('geo1', [14, 15, 16, 17, 19, 20, 21]),
    ('geo2', [9, 22, 23]),
    ('geo5', [7, 8]),
    ('geo7', list(range(1, 7)) + [10, 11, 12, 13, 18] + list(range(24, 41))),
  ]
  for column_name, soil_numbers in soil_groups:
    df = df.withColumn(column_name, _soil_group_flag(df, soil_numbers))
  return df

In [18]:
# Distance-based features first, then the soil-type indicator features, for both sets
train_data = add_features(create_features(train_data))
test_data = add_features(create_features(test_data))

##### Preparing for models

In [20]:
# Creating assembler: the input columns are grouped by origin and concatenated in a fixed order
measured_cols = ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
                 "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
                 "Horizontal_Distance_To_Fire_Points"]
wilderness_cols = ["Wilderness_Area" + str(k) for k in range(1, 5)]
soil_cols = ["Soil_Type" + str(k) for k in range(1, 41)]
distance_cols = ['Distance_to_Hydrolody', 'Ele_plus_VDtHyd', 'Ele_minus_VDtHyd', 'Hydro_plus_Fire', 'Hydro_minus_Fire',
                 'Hydro_plus_Road', 'Hydro_minus_Road', 'Fire_plus_Road', 'Fire_minus_Road']
# NOTE(review): 'clim8' is created by add_features but is not part of the feature vector -- confirm this is intentional.
clim_cols = ['clim' + str(k) for k in range(2, 8)]
geo_cols = ['geo1', 'geo2', 'geo5', 'geo7']

feature_cols = measured_cols + wilderness_cols + soil_cols + distance_cols + clim_cols + geo_cols
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Append the assembled "features" vector column to both sets
train_data = vector_assembler.transform(train_data)
test_data = vector_assembler.transform(test_data)

#### Working on models
Because the data contains both categorical (binary) and continuous variables, and the labels are categorical (7 categories), I decided to use tree-based models, as this type of model performs well with such a data structure.

In [22]:
# Dividing the training set in training (80%) and validation (20%), in order to evaluate different models.
# The seed is fixed so that every run produces the same partition and the scores of the models stay comparable.
train_data2, val_data = train_data.randomSplit([0.8, 0.2], seed=42)

##### Decision Tree Classifier
To start, I decided to run a Decision Tree classifier, the simplest tree-based model, in order to get first results.

In [24]:
# DecisionTree hyperparameters (starting values; the grid search over classifier_DT overrides them)
impurity_DT = "entropy"
maxBins_DT = 100
maxDepth_DT = 20

# Create model -- every hyperparameter is taken from the variables above, so editing them actually takes effect
classifier_DT = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="features", impurity=impurity_DT,
                                       maxBins=maxBins_DT, maxDepth=maxDepth_DT)

In [25]:
# preparing grid of parameters for grid search: 3 depths x 2 impurities x 3 bin counts = 18 combinations
paramGrid = ParamGridBuilder().addGrid(classifier_DT.maxDepth, [15,20,30])\
                              .addGrid(classifier_DT.impurity, ['entropy', 'gini']) \
                              .addGrid(classifier_DT.maxBins, [50,100,150]).build()

# Creating evaluator: F1 score of the 'prediction' column against the 'Cover_Type' label
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Cover_Type', metricName='f1')

# Creating CV model in order to find best parameters (5 folds, up to 4 models fitted in parallel).
# The seed fixes the fold assignment so the selected parameters are reproducible across runs.
crossval = CrossValidator(estimator=classifier_DT, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5,
                          parallelism=4, seed=42)

In [26]:
# Running the cross-validated grid search on the training split (train_data2);
# the validation split is kept aside for the evaluation further down
cvModel = crossval.fit(train_data2)

In [27]:
# Getting best parameters: read the three tuned hyperparameters back from the winning model
bestModel = cvModel.bestModel
tuned_params = (('Max Depth:', 'maxDepth'), ('Impurity:', 'impurity'), ('Max Bins:', 'maxBins'))
for caption, param_name in tuned_params:
  print(caption, bestModel.getOrDefault(bestModel.getParam(param_name)))

In [28]:
# Predictions on validation set, using the model returned by the cross-validation
pred = cvModel.transform(val_data)

# evaluating with F1-score (the value of this last expression is the cell's output)
evaluator.evaluate(pred)

##### Random Forest Classifier
We expect the Random Forest Classifier to perform better than the Decision Tree, as this model reduces variance and limits overfitting on the training set

In [30]:
# RandomForest hyperparameters
numTrees = 10
impurity_RF = "entropy"
maxBins_RF = 50
maxDepth_RF = 30

# Create model -- every hyperparameter is taken from the variables above, so editing them actually takes effect.
# The seed fixes the random sampling done by the forest, making the validation score reproducible.
classifier_RT = RandomForestClassifier(labelCol="Cover_Type", featuresCol="features", impurity=impurity_RF,
                                       maxBins=maxBins_RF, maxDepth=maxDepth_RF, numTrees=numTrees, seed=42)

In [31]:
# Fitting the Random Forest on the training split (train_data2).
# NOTE(review): this step was described as using the same hyperparameters as the Decision Tree so that
# performances are comparable -- presumably the ones selected by its cross-validation; confirm.
RFmodel = classifier_RT.fit(train_data2)

In [32]:
# Predictions on validation set with the fitted Random Forest
RFpred = RFmodel.transform(val_data)

# evaluating with F1-score (same evaluator as for the Decision Tree)
evaluator.evaluate(RFpred)

We can notice a better performance with a RandomForest Classifier.

Note: the parameter numTrees was selected by previous crossvalidation, similar to crossvalidation done for the DecisionTreeClassifier.

##### Gradient Boosted Tree Classifier
Finally, I decided to use a Gradient Boosted Tree Classifier. This classifier has performed well in comparison with other classifiers in several Kaggle competitions.

This model builds its trees sequentially: each new tree is fitted to reduce the errors made on each observation by the trees built before it.

Although this model is more prone to overfitting than a Random Forest Classifier, when the training, validation and test sets are fairly homogeneous, the GBT model outperforms the RF model.

The GBTClassifier model available in the spark.ml library does not support multiclass labels, only binary classification. Therefore I decided to use the following strategy (also called one-versus-all): 
   1. create 7 new binary labels, one for each cover type; 
   2. train one GBT model per cover type; 
   3. separate the probabilities to belong to each type, for each observation; 
   4. for each observation choose the label given by the highest probability

Creating the new labels in the training set

In [36]:
# Start from the training split and append one binary label column per cover type
gbt_train_data = train_data2
for cover_type in range(1, 8):
  label_name = 'Type' + str(cover_type)
  # 1 when the row's Cover_Type is the current type, 0 otherwise
  is_current_type = gbt_train_data['Cover_Type'] == cover_type
  gbt_train_data = gbt_train_data.withColumn(label_name, is_current_type.cast("int"))

display(gbt_train_data)

Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Distance_to_Hydrolody,Ele_plus_VDtHyd,Ele_minus_VDtHyd,Hydro_plus_Fire,Hydro_minus_Fire,Hydro_plus_Road,Hydro_minus_Road,Fire_plus_Road,Fire_minus_Road,clim2,clim3,clim4,clim5,clim6,clim7,clim8,geo1,geo2,geo5,geo7,features,Type1,Type2,Type3,Type4,Type5,Type6,Type7
3,2764,4,14,480,-21,700,201.0,212.0,148.0,700,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,480.4591553920063,2743,2785,1180,-220,1180,-220,1400,0,0,0,1,0,0,0,0,0,0,0,1,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 24, 54, 55, 56, 57, 58, 59, 60, 61, 65, 72), List(2764.0, 4.0, 14.0, 480.0, -21.0, 700.0, 201.0, 212.0, 148.0, 700.0, 1.0, 1.0, 480.45915539200627, 2743.0, 2785.0, 1180.0, -220.0, 1180.0, -220.0, 1400.0, 1.0, 1.0))",0,1,0,0,0,0,0
4,3032,342,9,60,8,4050,202.0,227.0,164.0,2376,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,60.53098380168623,3040,3024,2436,-2316,4110,-3990,6426,-1674,0,0,0,0,0,1,0,0,1,0,0,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 36, 54, 55, 56, 57, 58, 59, 60, 61, 62, 68, 70), List(3032.0, 342.0, 9.0, 60.0, 8.0, 4050.0, 202.0, 227.0, 164.0, 2376.0, 1.0, 1.0, 60.530983801686226, 3040.0, 3024.0, 2436.0, -2316.0, 4110.0, -3990.0, 6426.0, -1674.0, 1.0, 1.0))",0,1,0,0,0,0,0
5,2488,23,11,117,21,1117,209.0,218.0,151.0,1136,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,118.86967653695368,2509,2467,1253,-1019,1234,-1000,2253,19,1,0,0,0,0,0,0,0,0,0,1,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 72), List(2488.0, 23.0, 11.0, 117.0, 21.0, 1117.0, 209.0, 218.0, 151.0, 1136.0, 1.0, 1.0, 118.86967653695369, 2509.0, 2467.0, 1253.0, -1019.0, 1234.0, -1000.0, 2253.0, 19.0, 1.0, 1.0))",0,1,0,0,0,0,0
6,2968,83,8,390,19,4253,232.0,226.0,127.0,4570,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,390.46254621922446,2987,2949,4960,-4180,4643,-3863,8823,317,0,0,1,0,0,0,0,0,0,0,1,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 54, 55, 56, 57, 58, 59, 60, 61, 62, 65, 72), List(2968.0, 83.0, 8.0, 390.0, 19.0, 4253.0, 232.0, 226.0, 127.0, 4570.0, 1.0, 1.0, 390.46254621922446, 2987.0, 2949.0, 4960.0, -4180.0, 4643.0, -3863.0, 8823.0, 317.0, 1.0, 1.0))",0,1,0,0,0,0,0
10,3315,61,15,120,-6,3042,231.0,208.0,106.0,1832,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,7,120.14990636700472,3309,3321,1952,-1712,3162,-2922,4874,-1210,0,0,0,0,0,0,1,0,0,0,1,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 72), List(3315.0, 61.0, 15.0, 120.0, -6.0, 3042.0, 231.0, 208.0, 106.0, 1832.0, 1.0, 1.0, 120.14990636700472, 3309.0, 3321.0, 1952.0, -1712.0, 3162.0, -2922.0, 4874.0, -1210.0, 1.0))",0,0,0,0,0,0,1
11,3221,165,3,520,33,5695,218.0,241.0,154.0,2529,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,521.046063222821,3254,3188,3049,-2009,6215,-5175,8224,-3166,0,0,0,0,0,1,0,0,1,0,0,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 35, 54, 55, 56, 57, 58, 59, 60, 61, 62, 68, 70), List(3221.0, 165.0, 3.0, 520.0, 33.0, 5695.0, 218.0, 241.0, 154.0, 2529.0, 1.0, 1.0, 521.046063222821, 3254.0, 3188.0, 3049.0, -2009.0, 6215.0, -5175.0, 8224.0, -3166.0, 1.0, 1.0))",1,0,0,0,0,0,0
15,3136,322,8,210,43,3361,200.0,232.0,171.0,4944,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,214.3571785595248,3179,3093,5154,-4734,3571,-3151,8305,1583,0,0,0,0,0,1,0,0,0,0,1,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 42, 54, 55, 56, 57, 58, 59, 60, 61, 62, 68, 72), List(3136.0, 322.0, 8.0, 210.0, 43.0, 3361.0, 200.0, 232.0, 171.0, 4944.0, 1.0, 1.0, 214.3571785595248, 3179.0, 3093.0, 5154.0, -4734.0, 3571.0, -3151.0, 8305.0, 1583.0, 1.0, 1.0))",1,0,0,0,0,0,0
16,2903,319,15,108,21,1830,179.0,225.0,185.0,2301,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,110.02272492535349,2924,2882,2409,-2193,1938,-1722,4131,471,0,0,0,0,0,1,0,0,1,0,0,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 36, 54, 55, 56, 57, 58, 59, 60, 61, 62, 68, 70), List(2903.0, 319.0, 15.0, 108.0, 21.0, 1830.0, 179.0, 225.0, 185.0, 2301.0, 1.0, 1.0, 110.02272492535349, 2924.0, 2882.0, 2409.0, -2193.0, 1938.0, -1722.0, 4131.0, 471.0, 1.0, 1.0))",0,1,0,0,0,0,0
18,2514,134,14,90,-8,1140,242.0,232.0,117.0,1513,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,90.35485598461214,2506,2522,1603,-1423,1230,-1050,2653,373,1,0,0,0,0,0,0,0,0,0,1,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 17, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 72), List(2514.0, 134.0, 14.0, 90.0, -8.0, 1140.0, 242.0, 232.0, 117.0, 1513.0, 1.0, 1.0, 90.35485598461214, 2506.0, 2522.0, 1603.0, -1423.0, 1230.0, -1050.0, 2653.0, 373.0, 1.0, 1.0))",0,0,1,0,0,0,0
19,3274,90,3,212,7,5093,224.0,234.0,146.0,547,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,212.11553455605272,3281,3267,759,-335,5305,-4881,5640,-4546,0,0,0,0,0,1,0,0,0,0,1,"List(0, 73, List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 42, 54, 55, 56, 57, 58, 59, 60, 61, 62, 68, 72), List(3274.0, 90.0, 3.0, 212.0, 7.0, 5093.0, 224.0, 234.0, 146.0, 547.0, 1.0, 1.0, 212.11553455605272, 3281.0, 3267.0, 759.0, -335.0, 5305.0, -4881.0, 5640.0, -4546.0, 1.0, 1.0))",0,1,0,0,0,0,0


We can see the new columns created at the end of the data frame and their direct correspondence with Cover_Type.

In [38]:
# UDF used to create the probability columns: keeps the element at index 1 of the
# model's probability vector, as a float
def _second_component(vector):
  return float(vector[1])

prob = udf(_second_component, FloatType())

# Function (wrapped in a UDF afterwards) to get the label with the best probability
def max_idx(probs):
  """Return the 1-based position of the largest value in `probs`.

  When the largest value occurs several times, the last occurrence wins.
  """
  scored = [(value, position) for position, value in enumerate(probs)]
  _, winner = max(scored)
  return winner + 1

# Column-level version of max_idx, returning the winning 1-based position as an integer
max_index = udf(max_idx, IntegerType())

In [39]:
# Hyperparameters of GBTClassifier, shared by every one-vs-all model trained in the loop below
maxIter = 10
maxDepth = 30
maxBins = 50

In [40]:
# The validation set is extended, one model at a time, with a 'prob<i>' column per cover type
pred_tmp = val_data
for cover_type in range(1, 8):
  # Wall-clock start, to report how long the current model takes to train
  started = time.time()

  print('Training model', cover_type)
  # Binary model for the current type, trained against its 'Type<i>' label
  gbt_estimator = GBTClassifier(labelCol="Type" + str(cover_type), featuresCol="features",
                                maxIter=maxIter, maxDepth=maxDepth, maxBins=maxBins)
  GBTmodel = gbt_estimator.fit(gbt_train_data)
  elapsed_min = round((time.time() - started)/60, 2)
  print('Model',str(cover_type),'trained. Execution time:', str(elapsed_min) + 'min')
  print('Computing predictions and saving file')

  # Score the validation rows and keep only the extracted probability as a new column;
  # the model's own output columns are dropped so the next model.transform() does not clash with them
  scored = GBTmodel.transform(pred_tmp)
  scored = scored.withColumn('prob'+str(cover_type), prob(scored['probability']))
  pred_tmp = scored.drop('prediction').drop('probability').drop('rawPrediction')

  # Checkpoint: write the predictions gathered so far as a single CSV part, in case the cluster/notebook crashes
  checkpoint_path = '/FileStore/GBTpred' + str(cover_type)
  checkpoint = pred_tmp.drop('features').repartition(1)
  checkpoint.write.format('com.databricks.spark.csv').options(header='true').mode('overwrite').save(checkpoint_path)
  print('Predictions dataframe', str(cover_type), 'saved\n')

In [41]:
# Dataframe with final prediction for Cover_Type: the type whose model gave the highest
# probability, stored as a double in the 'prediction' column read by the evaluator
prob_cols = ["prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7"]
winning_type = max_index(array(*prob_cols))
final_preds = pred_tmp.withColumn('prediction', winning_type.cast('double'))

# evaluating with F1-score
evaluator.evaluate(final_preds)

We can notice that the Gradient Boosted Tree Classifier outperformed the Random Forest Classifier.

Unfortunately, due to constraints on model running time, I was not able to use PySpark's cross-validation function for parameter tuning. However, after some manual tuning when submitting my predictions on the test set, I used a Gradient Boosted Tree Classifier with the following parameters:

In [43]:
# GBT hyperparameters retained for the test-set submission; these reassign the
# maxIter / maxDepth / maxBins values used for the validation run above
maxIter = 15
maxDepth = 30
maxBins = 150