# Random Forest Classifier
**4.0 Random Forest classifier** This notebook trains a Random Forest classifier using the High Level Features (HLF). It performs very well in terms of both speed and accuracy.

To run this notebook we used the following configuration:
* *Software stack*: Spark 2.4.3
* *Platform*: CentOS 7, Python 3.6
* *Spark cluster*: Analytix

In [1]:
# Install pyspark with pip, or locate an existing Spark installation some other way;
# this notebook uses findspark to put an unpacked Spark distribution on sys.path.
import findspark

# Location of the Spark distribution (SPARK_HOME) - adjust for your machine.
spark_home = '/home/luca/Spark/spark-2.4.3-bin-hadoop2.7'
findspark.init(spark_home)

In [2]:
# Configure according to your environment
from pyspark.sql import SparkSession

# Spark settings for this job: a fixed pool of 40 executors (dynamic allocation
# disabled), each with 6 cores and 14g of memory, an 8g driver, and speculative
# execution enabled.
spark_settings = [
    ("spark.driver.memory", "8g"),
    ("spark.executor.memory", "14g"),
    ("spark.executor.cores", "6"),
    ("spark.executor.instances", "40"),
    ("spark.dynamicAllocation.enabled", "false"),
    ("spark.speculation", "true"),
]

builder = SparkSession.builder.appName("Training-RandomForestClassifier").master("yarn")
for key, value in spark_settings:
    builder = builder.config(key, value)

# Create the session, or reuse one that is already active in this process.
spark = builder.getOrCreate()

In [3]:
# Display the SparkSession created above (the last expression of a cell is rendered as its output).
spark

## Load train and test dataframes

In [4]:
# Base HDFS directory that holds the prepared (undersampled) train and test sets.
PATH = "hdfs://analytix/Training/Spark/TopologyClassifier/"

# Only the high-level feature vector and the two label encodings are needed here.
columns = ['hfeatures', 'label', 'encoded_label']

trainDF = spark.read.parquet(PATH + 'trainUndersampled.parquet').select(columns)
testDF = spark.read.parquet(PATH + 'testUndersampled.parquet').select(columns)

In [5]:
# Optional sanity check: report the number of events in each dataset.
# Each count() triggers a full Spark job over the corresponding DataFrame.
for dataset, description in ((trainDF, 'training events'), (testDF, 'test events')):
    print('There are', dataset.count(), description)

There are 3426083 training events
There are 856090 test events


In [6]:
# Print the column names and types of the training DataFrame.
trainDF.printSchema()

root
 |-- hfeatures: vector (nullable = true)
 |-- label: long (nullable = true)
 |-- encoded_label: vector (nullable = true)



In [7]:
# Preview the first 3 rows of the test DataFrame.
testDF.show(3)

+--------------------+-----+-------------+
|           hfeatures|label|encoded_label|
+--------------------+-----+-------------+
|[74.9491729736328...|    0|(3,[0],[1.0])|
|[0.0,27.335390090...|    0|(3,[0],[1.0])|
|[47.6835403442382...|    0|(3,[0],[1.0])|
+--------------------+-----+-------------+
only showing top 3 rows



## Train the Random Forest

In [8]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest of 100 trees (max depth 10) trained on the 'hfeatures' vector
# column to predict the integer 'label' column.
# Training is stochastic (row and feature sampling), so the seed is pinned
# explicitly: without it the result can change from one session to the next,
# which makes the reported metrics hard to reproduce.
rf = RandomForestClassifier(numTrees=100, maxDepth=10,
                            featuresCol='hfeatures',
                            labelCol="label",
                            predictionCol='prediction',
                            seed=42)

In [9]:
%time rf_model = rf.fit(trainDF)

CPU times: user 43.9 ms, sys: 22 ms, total: 65.8 ms
Wall time: 3min 25s


## Save the model

In [10]:
# Persist the trained model. Going through the writer with overwrite() keeps this
# cell re-runnable: a plain save() raises an error when the target path already
# exists, which breaks "Restart & Run All" after the first execution.
rf_model.write().overwrite().save('file:/tmp/models/RandomForest/rf_model')

In [80]:
# To reload the saved model in a later session, run:
# from pyspark.ml.classification import RandomForestClassificationModel
# rf_model = RandomForestClassificationModel.load('file:/tmp/models/RandomForest/rf_model')

## Prediction

In [11]:
# Score the test set with the trained model; the result keeps the input columns
# and adds the rawPrediction, probability and prediction columns.
pred = rf_model.transform(testDF)

In [12]:
# Preview the first 5 scored rows.
pred.show(5)

+--------------------+-----+-------------+--------------------+--------------------+----------+
|           hfeatures|label|encoded_label|       rawPrediction|         probability|prediction|
+--------------------+-----+-------------+--------------------+--------------------+----------+
|[74.9491729736328...|    0|(3,[0],[1.0])|[94.3038590849602...|[0.94303859084960...|       0.0|
|[0.0,27.335390090...|    0|(3,[0],[1.0])|[79.9268879357065...|[0.79926887935706...|       0.0|
|[47.6835403442382...|    0|(3,[0],[1.0])|[93.3031558743031...|[0.93303155874303...|       0.0|
|[80.9036312103271...|    0|(3,[0],[1.0])|[71.3032930224269...|[0.71303293022426...|       0.0|
|[95.2762756347656...|    0|(3,[0],[1.0])|[95.5933859238307...|[0.95593385923830...|       0.0|
+--------------------+-----+-------------+--------------------+--------------------+----------+
only showing top 5 rows



## Compute the AUC

In [13]:
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.sql.functions import udf


def _vector_to_list(vector):
    """Convert a Spark ML vector into a plain Python list of floats."""
    return vector.toArray().tolist()


# UDF wrapper so that vector columns can be turned into array<double> columns.
vector_udf = udf(_vector_to_list, ArrayType(DoubleType()))

# Keep only the one-hot label and the class probabilities, both as arrays.
# NOTE: this rebinds `pred`, so the cell is not idempotent - re-create `pred`
# with rf_model.transform(...) before running it a second time.
pred = pred.select(
    vector_udf('encoded_label').alias('encoded_label'),
    vector_udf('probability').alias('probability'),
)

In [14]:
%time pred_pd = pred.select(['encoded_label', 'probability']).toPandas()

CPU times: user 4.23 s, sys: 389 ms, total: 4.62 s
Wall time: 28.5 s


In [15]:
# Inspect the first rows of the collected pandas DataFrame.
pred_pd.head()

Unnamed: 0,encoded_label,probability
0,"[1.0, 0.0, 0.0]","[0.9430385908496021, 0.009534201452577617, 0.0..."
1,"[1.0, 0.0, 0.0]","[0.7992688793570651, 0.009870907970965345, 0.1..."
2,"[1.0, 0.0, 0.0]","[0.933031558743031, 0.02514800703821042, 0.041..."
3,"[1.0, 0.0, 0.0]","[0.7130329302242695, 0.05587211775923601, 0.23..."
4,"[1.0, 0.0, 0.0]","[0.9559338592383074, 0.02823323615535614, 0.01..."


In [18]:
import numpy as np

# Stack the per-row lists into 2-D arrays: one row per event, one column per class.
# y_true holds the one-hot labels, y_pred the predicted class probabilities.
y_true, y_pred = (np.array(pred_pd[column].tolist())
                  for column in ('encoded_label', 'probability'))

In [19]:
from sklearn.metrics import roc_curve, auc

# One-vs-rest ROC curve and AUC per class, keyed by class index.
fpr, tpr, roc_auc = {}, {}, {}

for class_idx in range(3):
    # Column class_idx of y_true is the binary target for this class,
    # column class_idx of y_pred the corresponding predicted probability.
    fpr[class_idx], tpr[class_idx], _ = roc_curve(y_true[:, class_idx],
                                                  y_pred[:, class_idx])
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

In [21]:
%matplotlib notebook
import matplotlib.pyplot as plt

plt.style.use('seaborn-darkgrid')
plt.figure()
plt.plot(fpr[1], tpr[1], color='blue', 
         lw=2, label='Random Forest classifier (AUC) = %0.4f' % roc_auc[1])
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('$tt$ selector')
plt.legend(loc="lower right")
plt.grid()
plt.show()

<IPython.core.display.Javascript object>

In [22]:
# ROC curve of class index 2 (the W class) against the other classes.
plt.figure()
plt.plot(fpr[2], tpr[2], color='blue',
         lw=2, label='Random Forest classifier (AUC) = %0.4f' % roc_auc[2])
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('$W$ selector')
plt.legend(loc="lower right")
# grid() without arguments *toggles* the grid, so with a style that already
# enables it the grid would be switched off; request it explicitly instead.
plt.grid(True)
plt.show()

<IPython.core.display.Javascript object>

## Confusion Matrix

In [23]:
from sklearn.metrics import accuracy_score

# Reduce one-hot labels and class probabilities to class indices, then compare them.
true_classes = np.argmax(y_true, axis=1)
predicted_classes = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(true_classes, predicted_classes)

print('Accuracy of the HLF classifier: {:.4f}'.format(accuracy))

Accuracy of the HLF classifier: 0.9077


In [24]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
labels_name = ['qcd', 'tt', 'wjets']
labels = [0,1,2]

# Rows of cm are the true classes, columns are the predicted classes.
cm = confusion_matrix(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1), labels=labels)

## Normalize CM
# Divide every row by its own total so each row sums to 1 (fraction of each true
# class assigned to each predicted class). keepdims=True keeps the row totals as
# a column vector; a flat vector would broadcast along the columns and divide
# entry [i, j] by the total of row j instead of row i.
# The builtin float replaces the np.float alias, which recent NumPy releases removed.
cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.xaxis.set_ticklabels(labels_name)
ax.yaxis.set_ticklabels(labels_name)
# The heatmap draws matrix rows on the y-axis and columns on the x-axis, so the
# true labels belong on y and the predicted labels on x.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
plt.show()

<IPython.core.display.Javascript object>