# Feature selection with random forests

Feature importance is measured as the gain (impurity reduction) contributed by each feature, averaged over all decision trees in the random forest.
The algorithm also works with data that is not linearly separable.

In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.evaluation.RegressionEvaluator

## Load the dataset

In [2]:
// Load the Boston housing CSV. The first row is treated as the header and
// column types are inferred from the data.
val df = spark.read.
  format("csv").
  options(Map(
    "header" -> "true",
    "inferschema" -> "true",
    "delimiter" -> ",")).
  load("../Datasets/Boston.csv")

df = [CRIM: double, ZN: double ... 12 more fields]


[CRIM: double, ZN: double ... 12 more fields]

## Explore the dataset

In [3]:
// Preview the first five rows of the dataset.
df.show(numRows = 5)

+-----------+----+-----------+----+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|       CRIM|  ZN|      INDUS|CHAS|        NOX|         RM|        AGE|        DIS|RAD|TAX|    PTRATIO|          B|      LSTAT|       MEDV|
+-----------+----+-----------+----+-----------+-----------+-----------+-----------+---+---+-----------+-----------+-----------+-----------+
|    0.00632|18.0|2.309999943|   0|0.537999988|6.574999809|65.19999695|4.090000153|  1|296|15.30000019|396.8999939|4.980000019|       24.0|
|0.027310001| 0.0|7.070000172|   0|0.469000012|6.421000004|78.90000153|4.967100143|  2|242|17.79999924|396.8999939|9.140000343|21.60000038|
|    0.02729| 0.0|7.070000172|   0|0.469000012|7.184999943|61.09999847|4.967100143|  2|242|17.79999924|392.8299866| 4.03000021|34.70000076|
|0.032370001| 0.0|2.180000067|   0|0.458000004|6.998000145|45.79999924|6.062200069|  3|222|18.70000076|394.6300049|2.940000057|33.40000153|
|0.069049999| 0.0|2.

In [4]:
// Print the inferred column names and types.
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)



In [5]:
// Print summary statistics for the columns of df.
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+
|summary|              CRIM|                ZN|             INDUS|              CHAS|               NOX|                RM|               AGE|               DIS|              RAD|               TAX|          PTRATIO|                 B|             LSTAT|             MEDV|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+
|  count|               506|               506|               506|               506|               506|               506|               506|               506|              506|  

## Vector Assembler

In [6]:
// List the column names; the last one (MEDV) is used as the regression target below.
df.columns

[CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT, MEDV]

In [7]:
// Total number of columns in df (feature columns plus the MEDV target).
df.columns.length

14

In [8]:
// Feature columns: every column except the regression target MEDV.
// Selecting by name instead of slice(0, 13) keeps this correct if columns are
// added, removed or reordered in the CSV.
val features = df.columns.filter(_ != "MEDV")

features = Array(CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT)


[CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT]

In [9]:
// Combine the columns listed in `features` into a single vector column
// named "features".
val assembler = new VectorAssembler().
  setOutputCol("features").
  setInputCols(features)

assembler = vecAssembler_b075ab5a8a4b


vecAssembler_b075ab5a8a4b

In [10]:
// Assemble the feature vector, then keep only the target (MEDV, renamed to
// "label") and the assembled "features" column.
val df_v = assembler.
  transform(df).
  select(col("MEDV").alias("label"), col("features"))
df_v.show(numRows = 10)

+-----------+--------------------+
|      label|            features|
+-----------+--------------------+
|       24.0|[0.00632,18.0,2.3...|
|21.60000038|[0.027310001,0.0,...|
|34.70000076|[0.02729,0.0,7.07...|
|33.40000153|[0.032370001,0.0,...|
|36.20000076|[0.069049999,0.0,...|
|28.70000076|[0.029850001,0.0,...|
|22.89999962|[0.088289998,12.5...|
|27.10000038|[0.144549996,12.5...|
|       16.5|[0.211239994,12.5...|
|18.89999962|[0.170039997,12.5...|
+-----------+--------------------+
only showing top 10 rows



df_v = [label: double, features: vector]


[label: double, features: vector]

## Split into train and test sets

In [11]:
// 70/30 train/test split. The seed is fixed so that the split, and the
// metrics and importances computed from it further down, can be repeated.
val Array(trainingData, testData) = df_v.randomSplit(Array(0.7, 0.3), seed = 42L)

trainingData = [label: double, features: vector]
testData = [label: double, features: vector]


[label: double, features: vector]

## Train the model

In [12]:
// Random forest regressor with 10 trees, reading the "label" and "features"
// columns produced above, fitted on the training split.
val rf = new RandomForestRegressor().
  setLabelCol("label").
  setFeaturesCol("features").
  setNumTrees(10)
val model = rf.fit(trainingData)

rf = rfr_f2b8bb660651
model = RandomForestRegressionModel (uid=rfr_a3da2059d4d3) with 10 trees


RandomForestRegressionModel (uid=rfr_a3da2059d4d3) with 10 trees

## Evaluate the model

In [13]:
// Score the held-out test split; the result carries an extra "prediction" column.
val predictions = model.transform(testData)
predictions.show(numRows = 10)

+-----------+--------------------+------------------+
|      label|            features|        prediction|
+-----------+--------------------+------------------+
|6.300000191|[9.916549683,0.0,...|11.273334295801543|
|        7.0|[0.183369994,0.0,...|13.148680101232458|
|        7.0|[45.74610138,0.0,...| 10.38523085058882|
|7.199999809|[14.23620033,0.0,...|11.289351287822631|
|7.199999809|[16.8118,0.0,18.1...| 10.63189757028152|
|7.199999809|[18.08460045,0.0,...|11.903351282685488|
|8.699999809|[15.17720032,0.0,...|12.467966658620654|
|10.19999981|[12.24720001,0.0,...|  17.3304774724265|
|10.19999981|[14.33370018,0.0,...| 10.63189757028152|
|10.19999981|[17.86669922,0.0,...|11.289351287822631|
+-----------+--------------------+------------------+
only showing top 10 rows



predictions = [label: double, features: vector ... 1 more field]


[label: double, features: vector ... 1 more field]

In [14]:
// Evaluator for the R² metric, comparing the "prediction" column with "label".
val evaluator = new RegressionEvaluator().
  setLabelCol("label").
  setPredictionCol("prediction").
  setMetricName("r2")

evaluator = regEval_9bfe039aeb4f


regEval_9bfe039aeb4f

In [15]:
// Compute R² on the test predictions; the trailing expression displays the value.
val score: Double = evaluator.evaluate(predictions)
score

score = 0.7919073371520533


0.7919073371520533

## Feature importance

In [16]:
// Importance of each feature as a plain array, indexed by position in the feature vector.
model.featureImportances.toArray

[0.04992911225545793, 3.529606544113375E-4, 0.029780150278418516, 0.006880081583754907, 0.06744083566657347, 0.3435367629959497, 0.018556086096938488, 0.04840901580843847, 0.005693137259555262, 0.04866821784567442, 0.04284827531184007, 0.00577587094172966, 0.33212949330125785]

In [17]:
// Pair each feature name with its importance and list them from most to least
// important. Zipping the `features` array (the assembler's input columns) with
// the importances avoids the hard-coded column count and converts the
// importance vector to an array once instead of once per feature.
val fi = features.
  zip(model.featureImportances.toArray).
  toList.
  toDF("feature", "importance")
fi.sort(col("importance").desc).show()

+-------+--------------------+
|feature|          importance|
+-------+--------------------+
|     RM|  0.3435367629959497|
|  LSTAT| 0.33212949330125785|
|    NOX| 0.06744083566657347|
|   CRIM| 0.04992911225545793|
|    TAX| 0.04866821784567442|
|    DIS| 0.04840901580843847|
|PTRATIO| 0.04284827531184007|
|  INDUS|0.029780150278418516|
|    AGE|0.018556086096938488|
|   CHAS|0.006880081583754907|
|      B| 0.00577587094172966|
|    RAD|0.005693137259555262|
|     ZN|3.529606544113375E-4|
+-------+--------------------+



fi = [feature: string, importance: double]


[feature: string, importance: double]

## Select features

In [18]:
// Re-create the assembler using only RM and LSTAT, the two features with the
// highest importance in the table above.
val assembler = new VectorAssembler().
  setOutputCol("features").
  setInputCols(Array("RM", "LSTAT"))

assembler = vecAssembler_476cff76f946


vecAssembler_476cff76f946

In [19]:
// Build the reduced dataset: target (MEDV, renamed to "label") plus the
// two-element "features" vector from the selected columns.
val df_vs = assembler.
  transform(df).
  select(col("MEDV").alias("label"), col("features"))
df_vs.show(numRows = 10)

+-----------+--------------------+
|      label|            features|
+-----------+--------------------+
|       24.0|[6.574999809,4.98...|
|21.60000038|[6.421000004,9.14...|
|34.70000076|[7.184999943,4.03...|
|33.40000153|[6.998000145,2.94...|
|36.20000076|[7.146999836,5.32...|
|28.70000076|[6.429999828,5.21...|
|22.89999962|[6.012000084,12.4...|
|27.10000038|[6.171999931,19.1...|
|       16.5|[5.631000042,29.9...|
|18.89999962|[6.004000187,17.1...|
+-----------+--------------------+
only showing top 10 rows



df_vs = [label: double, features: vector]


[label: double, features: vector]