In [1]:
# findspark.init() locates the local Spark installation and puts PySpark on
# sys.path, so it has to run before the `import pyspark` in the next cell.
import findspark
findspark.init()

In [2]:
import pyspark
import os

## Data

The data set consists of 11 million Monte Carlo simulations of nuclear collisions. Signal collisions correspond to collisions where a Higgs boson was created. Background collisions correspond to collisions that have the same end product particles but where a Higgs boson was not created. Each collision has 28 attributes. This notebook loads a 20,000-row subsample of that data set (`HIGGS_subsampled_20k.csv`).
Data set location: <https://archive.ics.uci.edu/ml/datasets/HIGGS>

Relevant paper: Baldi, P., P. Sadowski, and D. Whiteson. “Searching for Exotic Particles in High-energy Physics with Deep Learning.” *Nature Communications* 5 (July 2, 2014).

In [3]:
from pyspark.sql import SparkSession

# Obtain the session for this notebook via getOrCreate() under the app name
# 'higgs-analysis'.
spark = (
    SparkSession.builder
    .appName('higgs-analysis')
    .getOrCreate()
)

In [4]:
# Path (relative to the notebook's working directory) of the subsampled CSV.
data_location = os.path.join('resources','HIGGS_subsampled_20k.csv')

# Read the CSV with a header row and let Spark infer the column types.
df = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load(data_location)
)

In [5]:
# Peek at the first two rows to check that the header and the numeric column
# types were picked up by the CSV reader.
df.head(2)

[Row(label=1.0,  lepton pT=0.869293212890625,  lepton eta=-0.6350818276405334,  lepton phi=0.22569026052951813,  missing energy magnitude=0.327470064163208,  missing energy phi=-0.6899932026863098,  jet 1 pt=0.7542022466659546,  jet 1 eta=-0.24857313930988312,  jet 1 phi=-1.0920639038085938,  jet 1 b-tag=0.0,  jet 2 pt=1.3749921321868896,  jet 2 eta=-0.6536741852760315,  jet 2 phi=0.9303491115570068,  jet 2 b-tag=1.1074360609054565,  jet 3 pt=1.138904333114624,  jet 3 eta=-1.5781983137130737,  jet 3 phi=-1.046985387802124,  jet 3 b-tag=0.0,  jet 4 pt=0.657929539680481,  jet 4 eta=-0.010454569943249226,  jet 4 phi=-0.0457671694457531,  jet 4 b-tag=3.101961374282837,  m_jj=1.353760004043579,  m_jjj=0.9795631170272827,  m_lv=0.978076159954071,  m_jlv=0.9200048446655273,  m_bb=0.7216574549674988,  m_wbb=0.9887509346008301,  m_wwbb=0.8766783475875854),
 Row(label=1.0,  lepton pT=0.9075421094894409,  lepton eta=0.3291472792625427,  lepton phi=0.3594118654727936,  missing energy magnitude=1.4

## Train/Test split

In [6]:
# 70/30 train/test split. The seed is fixed so that the split -- and therefore
# the row counts, the fitted scaler and the reported test error -- is the same
# after every "Restart Kernel -> Run All"; without it randomSplit picks a new
# random seed on each run.
(training, test) = df.randomSplit([0.7, 0.3], seed=42)

In [7]:
# Sanity check: row counts of the two partitions (they should add up to the
# size of `df` and be roughly in a 70/30 ratio).
training.count(), test.count()

(13905, 6095)

In [9]:
# List the column names. Note in the output that every feature name carries a
# leading space (e.g. ' lepton pT'), which matters when selecting by name.
training.columns

['label',
 ' lepton pT',
 ' lepton eta',
 ' lepton phi',
 ' missing energy magnitude',
 ' missing energy phi',
 ' jet 1 pt',
 ' jet 1 eta',
 ' jet 1 phi',
 ' jet 1 b-tag',
 ' jet 2 pt',
 ' jet 2 eta',
 ' jet 2 phi',
 ' jet 2 b-tag',
 ' jet 3 pt',
 ' jet 3 eta',
 ' jet 3 phi',
 ' jet 3 b-tag',
 ' jet 4 pt',
 ' jet 4 eta',
 ' jet 4 phi',
 ' jet 4 b-tag',
 ' m_jj',
 ' m_jjj',
 ' m_lv',
 ' m_jlv',
 ' m_bb',
 ' m_wbb',
 ' m_wwbb']

In [10]:
# Summary statistics (count, mean, stddev, min, max) for the first feature
# column; columns[0] is the label, so columns[1] is ' lepton pT'.
training.describe(training.columns[1]).show()

+-------+-------------------+
|summary|          lepton pT|
+-------+-------------------+
|  count|              13905|
|   mean| 0.9999787547184554|
| stddev| 0.5773550185954196|
|    min|0.27487966418266296|
|    max|  7.000280857086182|
+-------+-------------------+



## Feature Scaling

Not required for GBT or random forest, but done to make it easy to add more classifiers later.

In [11]:
# NOTE(review): the wildcard import pulls every pyspark.sql.functions name into
# the notebook namespace and shadows Python builtins such as sum, min, max, abs
# and round. No name from it appears to be used in the cells visible here --
# consider `from pyspark.sql import functions as F` instead.
from pyspark.sql.functions import *
from pyspark.ml.linalg import DenseVector

In [13]:
# Collapse each training row into (label, DenseVector(features)): column 0 is
# the label and every remaining column becomes one entry of the feature vector.
training_dense = spark.createDataFrame(
    training.rdd.map(lambda row: (row[0], DenseVector(row[1:]))),
    ["label", "features"],
)

In [14]:
# Same packing for the held-out rows: column 0 is the label and the remaining
# columns are gathered into a single DenseVector named "features".
test_dense = spark.createDataFrame(
    test.rdd.map(lambda row: (row[0], DenseVector(row[1:]))),
    ["label", "features"],
)

In [26]:
from pyspark.ml.feature import StandardScaler
# Scaler configuration: reads "features", writes "features_scaled", and centres
# each feature (withMean=True) in addition to the default unit-variance scaling.
standardScaler = StandardScaler(
    inputCol="features",
    outputCol="features_scaled",
    withMean=True,
)

In [27]:
# Fit the scaler on the training partition only, so the test partition does not
# influence the scaling statistics.
scaler = standardScaler.fit(training_dense)
# Adds the "features_scaled" column next to the original "features" column.
scaled_training = scaler.transform(training_dense)
# Show two rows to compare raw and scaled vectors.
scaled_training.take(2)

[Row(label=0.0, features=DenseVector([0.2758, -2.1077, 1.6889, 1.9085, 0.5887, 1.2632, 1.3635, -1.1625, 0.0, 0.5701, 0.6752, -0.7686, 0.0, 1.007, 1.5239, 1.5276, 0.0, 2.206, -0.4685, 0.2762, 3.102, 0.8865, 0.7056, 0.9947, 0.7695, 0.5959, 1.1519, 1.4402]), features_scaled=DenseVector([-1.2543, -2.0959, 1.6617, 1.5468, 0.5985, 0.5684, 1.3339, -1.1475, -0.9728, -0.845, 0.6658, -0.7508, -0.9441, 0.0111, 1.4956, 1.517, -0.8353, 2.3649, -0.4661, 0.2845, 1.5136, -0.2299, -0.841, -0.3392, -0.6088, -0.7169, 0.3146, 1.5237])),
 Row(label=0.0, features=DenseVector([0.2769, 1.1249, -0.6267, 1.1238, -1.6665, 0.5973, -0.4367, -0.0626, 0.0, 0.5557, -1.2695, -0.7792, 2.2149, 1.3079, -0.2975, 1.0106, 2.5482, 0.4009, -1.6103, -0.392, 0.0, 0.6491, 0.877, 0.9888, 0.7338, 1.0162, 0.6881, 0.6694]), features_scaled=DenseVector([-1.2524, 1.1299, -0.6333, 0.2251, -1.6486, -0.8369, -0.4357, -0.0631, -0.9728, -0.8737, -1.2558, -0.7614, 1.1671, 0.615, -0.308, 1.0015, 1.3002, -1.1534, -1.5957, -0.3793, -0.7135, -0

In [28]:
# Apply the scaler that was fitted on the training partition to the test
# partition (it is not re-fitted here).
scaled_test = scaler.transform(test_dense)
# Show two rows to compare raw and scaled vectors.
scaled_test.take(2)

[Row(label=0.0, features=DenseVector([0.2809, -1.0276, 1.7355, 0.6845, -1.4089, 0.5, 0.2456, -0.617, 1.0865, 0.5949, 0.8558, 1.0358, 0.0, 0.6447, -0.6388, 0.1874, 2.5482, 0.3722, -1.2888, 1.3744, 0.0, 0.9205, 0.7694, 0.9934, 0.7998, 0.5845, 0.6035, 0.5795]), features_scaled=DenseVector([-1.2454, -1.018, 1.7079, -0.5147, -1.392, -1.0423, 0.2349, -0.6097, 0.0828, -0.7955, 0.8443, 1.0476, -0.9441, -0.7162, -0.646, 0.1807, 1.3002, -1.2094, -1.2776, 1.3755, -0.7135, -0.1791, -0.6738, -0.3473, -0.5336, -0.7384, -1.1714, -1.2122])),
 Row(label=0.0, features=DenseVector([0.2822, 0.2756, -0.5324, 0.8523, 0.8196, 0.3298, -0.2862, -0.2577, 2.1731, 0.7767, 0.8044, 1.5002, 0.0, 0.6623, 0.4262, -1.5124, 2.5482, 0.9069, 1.0564, -0.3082, 0.0, 1.1181, 0.8894, 0.9888, 0.7355, 0.5731, 0.6595, 0.6316]), features_scaled=DenseVector([-1.2432, 0.2824, -0.5398, -0.2322, 0.8285, -1.4015, -0.2878, -0.2555, 1.1384, -0.4325, 0.7934, 1.5104, -0.9441, -0.6809, 0.4086, -1.5141, 1.3002, -0.1671, 1.0425, -0.296, -0.71

## Create model

In [30]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib import linalg as mllib_linalg
from pyspark.ml import linalg as ml_linalg

In [31]:
def as_old(v):
    """Convert a ``pyspark.ml.linalg`` vector into its ``pyspark.mllib.linalg`` twin.

    The RDD-based ``pyspark.mllib`` API used below expects the older vector
    classes, while the DataFrame rows carry ``pyspark.ml`` vectors.

    Args:
        v: a ``pyspark.ml.linalg.DenseVector`` or ``pyspark.ml.linalg.SparseVector``.

    Returns:
        The ``pyspark.mllib.linalg`` vector of the same kind, built from the
        same values (and, for sparse vectors, the same size and indices).

    Raises:
        ValueError: if ``v`` is neither of the two supported vector types.
    """
    if isinstance(v, ml_linalg.DenseVector):
        converted = mllib_linalg.DenseVector(v.values)
    elif isinstance(v, ml_linalg.SparseVector):
        converted = mllib_linalg.SparseVector(v.size, v.indices, v.values)
    else:
        raise ValueError("Unsupported type {0}".format(type(v)))
    return converted

In [32]:
# Build the RDD of LabeledPoint objects that the mllib trainer consumes.
# NOTE(review): despite the variable name, this reads the raw `features`
# column, not `features_scaled` (the sample output shows unscaled values). The
# prediction cells further down also use `features`, so training and scoring
# are consistent -- switch all of them together if scaled inputs are wanted.
scaled_labelPoint_train = scaled_training.rdd.map(lambda row: LabeledPoint(row.label, as_old(row.features)))
# Show two points to verify the conversion.
scaled_labelPoint_train.take(2)

[LabeledPoint(0.0, [0.2757946848869324,-2.107722520828247,1.6888595819473267,1.908488154411316,0.5887398719787598,1.263171911239624,1.3635212182998657,-1.162469506263733,0.0,0.5700910091400146,0.6751649379730225,-0.7686452865600586,0.0,1.0070127248764038,1.523929238319397,1.527646541595459,0.0,2.2059988975524902,-0.4685073494911194,0.2761847674846649,3.101961374282837,0.886506974697113,0.7055552005767822,0.9947258234024048,0.7694635391235352,0.5958611369132996,1.1519246101379395,1.440234661102295]),
 LabeledPoint(0.0, [0.276892751455307,1.1248798370361328,-0.6266821026802063,1.1237695217132568,-1.6664758920669556,0.5972790122032166,-0.43671685457229614,-0.06258609890937805,0.0,0.5557380318641663,-1.2695250511169434,-0.7791869640350342,2.214872121810913,1.3078521490097046,-0.29747843742370605,1.0106332302093506,2.548224449157715,0.40089157223701477,-1.610308051109314,-0.39203202724456787,0.0,0.6491283178329468,0.8770354986190796,0.988837718963623,0.733756959438324,1.0162041187286377,0.6

In [35]:
#  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
#         (b) Use more iterations in practice.
#         (c) Elapsed time is measured with time.perf_counter(), a monotonic
#             high-resolution clock; time.time() is wall-clock time and can
#             jump if the system clock is adjusted while training runs.
import time
train_start = time.perf_counter()
model = GradientBoostedTrees.trainClassifier(scaled_labelPoint_train,
                                             categoricalFeaturesInfo={}, numIterations=10)
train_end = time.perf_counter()
print(f'Time elapsed training model: {train_end - train_start} seconds')

Time elapsed training model: 20.349748849868774 seconds


In [36]:
# Smoke test: predict the class of the first test row. The raw `features`
# vector is passed, matching the column the model was trained on.
model.predict(scaled_test.take(1)[0].features.values)

0.0

## Train model using CV and Grid Search

## Evaluate model

In [37]:
# Score every held-out row and report the misclassification rate.
# The model is fed the raw `features` column (not `features_scaled`).
test_rows = scaled_test.rdd
predictions = model.predict(test_rows.map(lambda row: row.features.values))

# Pair each true label with its prediction; both RDDs derive from the same
# parent, so they line up element by element.
labelsAndPredictions = test_rows.map(lambda row: row.label).zip(predictions)

# Test error = share of (label, prediction) pairs that disagree.
misclassified = labelsAndPredictions.filter(lambda pair: pair[0] != pair[1]).count()
testErr = misclassified / float(test_rows.count())

print('Test Error = ' + str(testErr))
print('Learned classification GBT model:')
print(model.toDebugString())

Test Error = 0.32748154224774406
Learned classification GBT model:
TreeEnsembleModel classifier with 10 trees

  Tree 0:
    If (feature 25 <= 1.1958584785461426)
     If (feature 25 <= 0.6043451130390167)
      If (feature 27 <= 0.8742782473564148)
       Predict: -0.03178206583427923
      Else (feature 27 > 0.8742782473564148)
       Predict: -0.3333333333333333
     Else (feature 25 > 0.6043451130390167)
      If (feature 26 <= 0.7800827026367188)
       Predict: -0.06046511627906977
      Else (feature 26 > 0.7800827026367188)
       Predict: 0.33555975025410195
    Else (feature 25 > 1.1958584785461426)
     If (feature 22 <= 1.0406605005264282)
      If (feature 27 <= 1.0255640149116516)
       Predict: -0.3010752688172043
      Else (feature 27 > 1.0255640149116516)
       Predict: -0.6604708798017348
     Else (feature 22 > 1.0406605005264282)
      If (feature 25 <= 2.182615876197815)
       Predict: -0.0777422790202343
      Else (feature 25 > 2.182615876197815)
       Predi

In [None]:
# Shut down the Spark session once the analysis is finished; cells above
# cannot be re-run afterwards without creating the session again.
spark.stop()