In [1]:
# Runs before the `import pyspark` in the following cell.
# NOTE(review): presumably this makes the local Spark installation importable
# as a regular Python package — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
import pyspark
import os

## Data

The data set consists of 11 million Monte Carlo simulations of nuclear collisions. Signal collisions correspond to collisions where a Higgs boson was created. Background collisions correspond to collisions that have the same end-product particles but where a Higgs boson was not created. Each collision has 28 attributes. This notebook loads a 2-million-row subsample of the data (`HIGGS_subsampled_2M.csv`).
Data set location: <https://archive.ics.uci.edu/ml/datasets/HIGGS>

Relevant paper: *Baldi, P., P. Sadowski, and D. Whiteson. “Searching for Exotic Particles in High-energy Physics with Deep Learning.” Nature Communications 5 (July 2, 2014).*

In [3]:
from pyspark.sql import SparkSession

# Configure the builder first, then obtain the session from it; the
# application is registered under the name 'higgs-analysis'.
session_builder = SparkSession.builder.appName('higgs-analysis')
spark = session_builder.getOrCreate()

In [4]:
# Relative path of the CSV file, assembled in an OS-independent way.
data_location = os.path.join('resources', 'HIGGS_subsampled_2M.csv')

# Same CSV read expressed with the reader's builder API: comma-separated,
# first line is a header, column types are inferred from the data.
csv_reader = spark.read.format("csv").options(
    sep=",", inferSchema="true", header="true"
)
df = csv_reader.load(data_location)

In [None]:
# Peek at the first two rows of the loaded DataFrame.
df.head(2)

## Train/Test split

In [5]:
# Split the rows 70% / 30% into training and test sets. The seed is fixed so
# that the split — and therefore the row counts, the fitted scaler, the model
# and the reported test error — is the same after every Restart & Run All.
(training, test) = df.randomSplit([0.7, 0.3], seed=42)

In [6]:
# Row counts of the two splits, displayed as a (training, test) tuple.
training.count(), test.count()

(1400277, 599723)

In [None]:
# List the column names of the training DataFrame.
training.columns

In [None]:
# Summary statistics for the column at index 1 of the training set.
# NOTE(review): presumably the first feature, since column 0 is used as the
# label when the dense vectors are built further down — confirm.
training.describe(training.columns[1]).show()

## Feature Scaling

Not required for GBT or random forest, but done to make it easy to add more classifiers later.

In [7]:
from pyspark.sql.functions import *
from pyspark.ml.linalg import DenseVector

In [9]:
# Reshape every training row into a (label, features) pair: the first column
# is the label and all remaining columns are packed into one DenseVector.
training_dense = spark.createDataFrame(
    training.rdd.map(lambda row: (row[0], DenseVector(row[1:]))),
    ["label", "features"],
)

In [10]:
# Same (label, features) reshaping for the held-out rows: column 0 is the
# label, columns 1.. become a single DenseVector.
test_dense = spark.createDataFrame(
    test.rdd.map(lambda row: (row[0], DenseVector(row[1:]))),
    ["label", "features"],
)

In [11]:
from pyspark.ml.feature import StandardScaler

# Estimator that reads the "features" column and writes its result to
# "features_scaled"; withMean=True additionally requests mean-centering.
standardScaler = StandardScaler(
    withMean=True,
    inputCol="features",
    outputCol="features_scaled",
)

In [12]:
# Fit the scaler on the training data only, then apply it to that same data.
scaler = standardScaler.fit(training_dense)
scaled_training = scaler.transform(training_dense)
# Show two rows to inspect the added features_scaled column.
scaled_training.take(2)

[Row(label=0.0, features=DenseVector([0.2747, -1.984, 0.7695, 1.4013, 1.496, 0.9565, 0.3327, -0.0321, 0.0, 0.7049, -0.8722, -1.0061, 2.2149, 0.6468, -0.353, -0.6304, 0.0, 1.0686, -0.2303, 1.3877, 3.102, 0.8953, 0.9776, 0.9931, 0.7897, 0.6978, 0.7528, 0.7791]), features_scaled=DenseVector([-1.2683, -1.9675, 0.7637, 0.6725, 1.487, -0.0719, 0.3298, -0.0329, -0.9748, -0.5752, -0.8647, -0.9993, 1.1582, -0.7095, -0.3511, -0.6255, -0.8377, 0.1645, -0.2272, 1.3794, 1.5037, -0.2063, -0.1238, -0.3495, -0.5536, -0.5248, -0.7688, -0.5773])),
 Row(label=0.0, features=DenseVector([0.2747, 1.2018, -1.1843, 0.8634, 0.0123, 1.0525, -0.1377, 1.2331, 2.1731, 0.8947, 0.3138, -0.1001, 0.0, 1.0154, -0.1527, -0.6886, 0.0, 1.164, 0.2461, -1.5895, 1.551, 0.8237, 0.8717, 0.9845, 0.9644, 0.7065, 0.8854, 0.7666]), features_scaled=DenseVector([-1.2683, 1.1902, -1.1784, -0.2247, 0.0126, 0.1303, -0.1362, 1.2247, 1.1393, -0.1958, 0.3105, -0.0992, -0.9527, 0.0475, -0.1524, -0.6834, -0.8377, 0.3535, 0.2455, -1.5778, 0.

In [13]:
# Apply the scaler that was fitted on the training data to the test data, so
# no statistics are computed from the test set.
scaled_test = scaler.transform(test_dense)
# Show two rows to inspect the added features_scaled column.
scaled_test.take(2)

[Row(label=0.0, features=DenseVector([0.2747, 1.0226, 0.5963, 2.626, 1.2172, 2.6693, 0.4099, -0.2705, 0.0, 1.4566, -0.9004, 0.583, 0.0, 1.7471, 2.345, 1.4195, 0.0, 2.1904, 1.7551, -1.0302, 0.0, 1.8085, 1.8315, 0.9755, 1.004, 0.6204, 1.7004, 1.702]), features_scaled=DenseVector([-1.2683, 1.0126, 0.5916, 2.7154, 1.21, 3.5367, 0.4063, -0.2698, -0.9748, 0.9278, -0.8926, 0.5793, -0.9527, 1.55, 2.3258, 1.4118, -0.8377, 2.3868, 1.7428, -1.0222, -0.7131, 1.1501, 2.1244, -0.457, -0.0141, -0.6723, 1.8312, 2.3724])),
 Row(label=0.0, features=DenseVector([0.2749, -1.6139, -1.6815, 1.1242, -0.4629, 1.1079, -0.3288, 1.1643, 2.1731, 0.6779, 0.2973, -1.187, 0.0, 0.5585, -0.556, 0.6656, 0.0, 1.1097, 0.9015, -0.8915, 3.102, 1.0404, 1.0145, 0.9898, 0.7681, 1.4423, 0.9111, 0.7806]), features_scaled=DenseVector([-1.268, -1.6006, -1.6726, 0.2104, -0.4596, 0.2471, -0.3255, 1.1564, 1.1393, -0.6294, 0.2942, -1.179, -0.9527, -0.8909, -0.5525, 0.6626, -0.8377, 0.246, 0.8958, -0.8844, 1.5037, 0.0093, -0.0267, -0.

## Create model

In [14]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib import linalg as mllib_linalg
from pyspark.ml import linalg as ml_linalg

In [15]:
def as_old(v):
    """Convert a ``pyspark.ml.linalg`` vector to its ``pyspark.mllib.linalg`` twin.

    Args:
        v: A ``pyspark.ml.linalg.DenseVector`` or ``SparseVector``.

    Returns:
        A ``pyspark.mllib.linalg`` vector of the same kind, built from the
        values (and, for sparse input, the size and indices) of ``v``.

    Raises:
        ValueError: If ``v`` is neither of the two supported vector types.
    """
    if isinstance(v, ml_linalg.DenseVector):
        converted = mllib_linalg.DenseVector(v.values)
    elif isinstance(v, ml_linalg.SparseVector):
        converted = mllib_linalg.SparseVector(v.size, v.indices, v.values)
    else:
        raise ValueError("Unsupported type {0}".format(type(v)))
    return converted

In [16]:
def _row_to_labeled_point(row):
    """Wrap one DataFrame row as an mllib LabeledPoint (label + converted vector)."""
    # NOTE(review): despite the variable name below, this reads `row.features`
    # and not `row.features_scaled`. The evaluation cell also predicts on
    # `features`, so training and testing are consistent; switch both together
    # if scaled inputs are intended.
    return LabeledPoint(row.label, as_old(row.features))


scaled_labelPoint_train = scaled_training.rdd.map(_row_to_labeled_point)
scaled_labelPoint_train.take(2)

[LabeledPoint(0.0, [0.2746966481208801,-1.9840284585952759,0.7694543600082397,1.4012621641159058,1.495957612991333,0.9564709663391113,0.33269181847572327,-0.03209567442536354,0.0,0.7049337029457092,-0.8722332715988159,-1.0061124563217163,2.214872121810913,0.6468133926391602,-0.3530036211013794,-0.6303791999816895,0.0,1.0685573816299438,-0.23031991720199585,1.3876736164093018,3.101961374282837,0.8952987790107727,0.9776041507720947,0.9931463003158569,0.7896732687950134,0.6978268623352051,0.7527646422386169,0.7791340947151184]),
 LabeledPoint(0.0, [0.2746966481208801,1.201823353767395,-1.184317708015442,0.8633654117584229,0.012271969579160213,1.0524753332138062,-0.13766740262508392,1.2330915927886963,2.1730761528015137,0.8946702480316162,0.3138139843940735,-0.10007253289222717,0.0,1.0154426097869873,-0.15274888277053833,-0.6886264085769653,0.0,1.1639729738235474,0.2460549771785736,-1.589535117149353,1.5509806871414185,0.8237000703811646,0.8717207312583923,0.9844738841056824,0.964415132999

In [17]:
# Notes: (a) An empty categoricalFeaturesInfo indicates all features are continuous.
#        (b) Use more iterations in practice.
import time

# Time the training with perf_counter(): it is monotonic and high-resolution,
# so the measured interval cannot be distorted by system clock adjustments
# the way a difference of two time.time() readings can.
train_start = time.perf_counter()
model = GradientBoostedTrees.trainClassifier(
    scaled_labelPoint_train,
    categoricalFeaturesInfo={},
    numIterations=10,
)
train_end = time.perf_counter()
print(f'Time elapsed training model: {train_end - train_start} seconds')

Time elapsed training model: 349.64889430999756 seconds


In [None]:
# Smoke test: predict the class of the first test row, passing the values of
# its `features` vector (the same column the model was trained on).
model.predict(scaled_test.take(1)[0].features.values)

## Train model using CV and Grid Search

## Evaluate model

In [18]:
# Score the trained model on the held-out rows and report the error rate.
# NOTE(review): predictions are made on the `features` column (not
# `features_scaled`), matching the column the training cell feeds to as_old.
test_rows = scaled_test.rdd

# Predict on the raw feature values, then pair each true label with its prediction.
predictions = model.predict(test_rows.map(lambda row: row.features.values))
labelsAndPredictions = test_rows.map(lambda row: row.label).zip(predictions)

# Test error = misclassified rows / all test rows.
n_misclassified = labelsAndPredictions.filter(
    lambda pair: pair[0] != pair[1]
).count()
testErr = n_misclassified / float(test_rows.count())

print('Test Error = ' + str(testErr))
print('Learned classification GBT model:')
print(model.toDebugString())

Test Error = 0.3235710486341194
Learned classification GBT model:
TreeEnsembleModel classifier with 10 trees

  Tree 0:
    If (feature 25 <= 1.090235412120819)
     If (feature 25 <= 0.6411992013454437)
      If (feature 27 <= 0.8580954968929291)
       Predict: 0.001504319257802084
      Else (feature 27 > 0.8580954968929291)
       Predict: -0.29039358905555
     Else (feature 25 > 0.6411992013454437)
      If (feature 26 <= 0.8173975944519043)
       Predict: -0.004087017253420937
      Else (feature 26 > 0.8173975944519043)
       Predict: 0.4107947037971915
    Else (feature 25 > 1.090235412120819)
     If (feature 22 <= 1.0227294564247131)
      If (feature 27 <= 0.9984551668167114)
       Predict: -0.2628822729508434
      Else (feature 27 > 0.9984551668167114)
       Predict: -0.5964422257879202
     Else (feature 22 > 1.0227294564247131)
      If (feature 25 <= 1.8310698866844177)
       Predict: 0.008552113593788178
      Else (feature 25 > 1.8310698866844177)
       Predict

In [None]:
# Shut down the Spark session created at the top of the notebook; cells that
# use `spark` cannot be re-run afterwards without recreating it.
spark.stop()