## About

In this notebook we prepare a simple solution for the [kaggle challenge on higgs.](https://inclass.kaggle.com/c/mlhep-2016-higgs-detection)

In [76]:
%matplotlib inline

In [77]:
import matplotlib.pyplot as plt

import pandas
import numpy as np

### Download data

In [None]:
!cd datasets; wget -O public_train_10000.root -nc --no-check-certificate https://2016.mlhep.yandex.net/data/higgs/public_train_10000.root

In [None]:
# you can download training sample with 100000 available events
# uncomment the below row
!cd datasets; wget -O public_train_100000.root -nc --no-check-certificate https://2016.mlhep.yandex.net/data/higgs/public_train_100000.root

In [None]:
!cd datasets; wget -O public_test.root -nc --no-check-certificate https://2016.mlhep.yandex.net/data/higgs/public_test.root

### Read the smallest part of training file and test file

In [78]:
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('datasets/public_train_100000.root'))
test = pandas.DataFrame(root_numpy.root2array('datasets/public_test.root'))

### Define training features

Exclude `event_id`, `target` from the features set

In [79]:
features = list(set(data.columns) - {'event_id', 'target'})
features

['jet3_pt',
 'jet3_eta',
 'm_jjj',
 'mem_phi',
 'jet1_pt',
 'jet4_phi',
 'jet1_phi',
 'jet2_eta',
 'jet3_btag',
 'm_jlv',
 'm_wbb',
 'jet4_pt',
 'jet4_btag',
 'jet2_pt',
 'jet1_btag',
 'm_jj',
 'm_wwbb',
 'jet2_phi',
 'lepton_phi',
 'm_bb',
 'm_lv',
 'jet4_eta',
 'jet2_btag',
 'lepton_pt',
 'mem_pt',
 'lepton_eta',
 'jet3_phi',
 'jet1_eta']

### Prepare high-level features for training

In [80]:
high_level_features = ['m_jj', 'm_jjj', 'm_jlv', 'm_wwbb', 'm_bb', 'm_wbb', 'm_lv']

### Plot histograms for each high-level feature

In [None]:
hist_params = {'normed': True, 'bins': 60, 'alpha': 0.4}
# create the figure
plt.figure(figsize=(16, 25))
for n, feature in enumerate(high_level_features):
    # add sub plot on our figure
    plt.subplot(len(features) // 5 + 1, 3, n+1)
    # define range for histograms by cutting 1% of data from both ends
    min_value, max_value = numpy.percentile(data[feature], [1, 99])
    plt.hist(data.ix[data.target.values == 0, feature].values, range=(min_value, max_value), 
             label='class 0', **hist_params)
    plt.hist(data.ix[data.target.values == 1, feature].values, range=(min_value, max_value), 
             label='class 1', **hist_params)
    plt.legend(loc='best')
    plt.title(feature)

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

def get_validated_trained(fitter, data, features, part):
    training_data, validation_data = train_test_split(data, random_state=3747824, train_size=part)
    fitter.fit(training_data[features], training_data.target)
    results = fitter.predict_proba(validation_data[features])
    print 'Validation:', roc_auc_score(validation_data.target, results[:, 1])

def get_full_trained(fitter, data, features):
    fitter.fit(data[features], data.target)

def get_result(fitter, test, features):
    return fitter.predict_proba(test[features])[:, 1]

In [71]:
from sknn.mlp import Classifier, Layer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [89]:
from sknn.platform import cpu64, threading



In [None]:
import sys
import logging

logging.basicConfig(
    stream=sys.stdout
)
logger = logging.getLogger('sknn')
logger.setLevel(logging.DEBUG)

fitter = Pipeline([
        ('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('neural network', Classifier(
                layers=[
                    Layer("Rectifier", units=300),
                    Layer("Rectifier", units=300),
                    Layer("Rectifier", units=300),
                    Layer("Rectifier", units=300),
                    Layer("Softmax")
                ],
                n_iter=66,
                random_state=274734,
                learning_rate=0.2,
                learning_momentum=0.999,
                verbose=True,
                batch_size=10,
                regularize='L2',
                weight_decay=0.01
    ))])

cur_features = features
get_validated_trained(fitter, data, cur_features, 0.6)

INFO:sknn:Initializing neural network with 5 layers, 28 inputs and 2 outputs.
DEBUG:sknn:  - Dense: [1;97mRectifier [0m Units:  [1;97m300 [0m
DEBUG:sknn:  - Dense: [1;97mRectifier [0m Units:  [1;97m300 [0m
DEBUG:sknn:  - Dense: [1;97mRectifier [0m Units:  [1;97m300 [0m
DEBUG:sknn:  - Dense: [1;97mRectifier [0m Units:  [1;97m300 [0m
DEBUG:sknn:  - Dense: [1;97mSoftmax   [0m Units:  [1;97m2   [0m
DEBUG:sknn:
INFO:sknn:Training on dataset of 60,000 samples with 1,800,000 total size.
DEBUG:sknn:  - Using `L2` for regularization.
DEBUG:sknn:  - Terminating loop after 66 total iterations.
DEBUG:sknn:  - Early termination after 10 stable iterations.
DEBUG:sknn:
Epoch       Training Error       Validation Error       Time
------------------------------------------------------------




DEBUG:sknn:    1         [0;94m 8.930e-01[0m                 N/A           16.7s




DEBUG:sknn:    2         [0;94m 6.938e-01[0m                 N/A           16.2s




DEBUG:sknn:    3         [0;94m 6.940e-01[0m                 N/A           16.4s




DEBUG:sknn:    4         [0;94m 6.940e-01[0m                 N/A           16.1s




DEBUG:sknn:    5         [0;94m 6.939e-01[0m                 N/A           16.7s




DEBUG:sknn:    6         [0;94m 6.935e-01[0m                 N/A           16.4s


..................................................

In [84]:
def get_fitter():
    fitter = RandomForestClassifier(
        3000,
        max_depth=33,
        n_jobs=-1,
        verbose=True,
    )
    return fitter

fitter = get_fitter()
cur_features = features
new_features = cur_features[:]
new_data = data.copy()
new_test = test.copy()
for i, cur_f in enumerate(cur_features):
    if cur_f.startswith('m_'):
        for cur_g in cur_features[i + 1:]:
            if cur_g.startswith('m_'):
                f_name = 'm_diff_%s_%s' % (cur_f, cur_g)
                new_data[f_name] = data[cur_f] / data[cur_g]
                new_test[f_name] = test[cur_f] / test[cur_g]
                new_features.append(f_name)
new_data.head()
get_full_trained(fitter, new_data, new_features)
#get_validated_trained(fitter, new_data, new_features, 0.6)

[Parallel(n_jobs=-1)]: Done   1 out of 3000 | elapsed:    1.3s remaining: 66.0min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  3.3min finished


In [55]:
new_data.head()

Unnamed: 0,event_id,target,lepton_pt,lepton_eta,lepton_phi,mem_pt,mem_phi,jet1_pt,jet1_eta,jet1_phi,...,jet4_eta,jet4_phi,jet4_btag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1000001,1,(5421299.97301+0j),(-24.3297657498+0j),(632.728385961+0j),(5810072.05736+0j),(324.480778877+0j),(7088333.3927+0j),(75.0336984188+0j),(-628.041845405+0j),...,(321.896043186+0j),(816.361165102+0j),(81180.6939785+0j),(8976519.16873+0j),(18126930.3969+0j),(8542375.21261+0j),(19404433.3865+0j),(11982746.0311+0j),(34842988.1681+0j),(48405143.0547+0j)
1,1000002,1,(4406.64085382-10440.3480691j),(-7.28865676809+99.0357121096j),(-102.329158471+74.7017249853j),(-4518.82218108+1282.87634954j),(406.089742316-337.838785852j),(-16804.2251273-4587.62807946j),(398.196307412-181.349032318j),(500.895046357+119.861000708j),...,(115.546772465+475.990939384j),(-428.054984952-887.893824144j),(-179.935167556+10.7879083344j),(1316.7216786+9273.19743202j),(-549.237713651+13601.9247298j),(128.300496094-453.924667721j),(11566.0863809+19364.8912835j),(13120.2128331+3103.74473599j),(18093.0082823+29863.944389j),(31306.838205+20385.8465256j)
2,1000003,1,(-5380.83726356+13619.8256582j),(153.654852681+409.158441849j),(-70.9141241033-1032.21122385j),(-7709.12358284-1300.60661022j),(-490.653216636-337.598577703j),(-2732.74099555+2993.42039603j),(-158.256097425-218.422013112j),(581.369191318-430.350948148j),...,(554.324381492+178.065026119j),(-250.197348588+148.227219908j),(-147.742534269+36.9325361747j),(6872.42733148-5896.04177095j),(20083.3732922-3461.22101082j),(-1816.83410464+341.95447713j),(8147.31831542+12612.6752749j),(4322.52347107+405.753495554j),(17668.7951018+40323.1110574j),(24431.9202624+32478.0342892j)
3,1000004,0,(11246.1414315-6755.97484606j),(-72.7880904913-178.550321593j),(190.920513772+237.4080537j),(7519.61146663-3775.1144496j),(-81.3251021379-574.916035425j),(20100.7321953-8639.19305206j),(-214.446214274+152.06861446j),(299.652518587-324.622239567j),...,(-61.9648059859+445.929128328j),(-40.9793201391-207.713414596j),(-34.4666359745-125.945142273j),(3111.3085742+10029.0444708j),(-3882.78527776+2390.60150768j),(-399.011978123+3103.97430008j),(-18134.268758+4369.8005329j),(-4335.76489347-19327.3488033j),(-4198.80052076+8739.59602611j),(11179.4873673-3632.49985006j)
4,1000005,0,(-1749.56741157+2407.81320111j),(372.723808664-18.2296637898j),(311.710069031-325.018280842j),(-2017.25610017-1805.54949022j),(-523.006823577+77.3812855229j),(2646.71449848-4623.40210645j),(319.117782808-23.9335389722j),(-302.028429999+83.5827503272j),...,(-127.303987167-80.2030779503j),(-1294.74533348-137.928477418j),(118.231513568+61.7688141911j),(11006.5745471+5095.22134146j),(12538.4120959-9957.91739448j),(-1122.23583873-1697.77577789j),(-6828.50096709-587.104477543j),(-12066.1287332+1709.35739797j),(-27544.0632036+2421.36229569j),(-41289.9915709-5265.04432711j)


## Prepare submission to kaggle

In [85]:
# predict test sample
kaggle_proba = get_result(fitter, new_test, new_features)
kaggle_ids = test.event_id

[Parallel(n_jobs=16)]: Done   1 out of 2047 | elapsed:    0.5s remaining: 17.0min
[Parallel(n_jobs=16)]: Done 3000 out of 3000 | elapsed:  1.2min finished


In [86]:
from IPython.display import FileLink
def create_solution(ids, proba, filename='baseline.csv'):
    """saves predictions to file and provides a link for downloading """
    pandas.DataFrame({'event_id': ids, 'prediction': proba}).to_csv('datasets/{}'.format(filename), index=False)
    return FileLink('datasets/{}'.format(filename))
    
create_solution(kaggle_ids, kaggle_proba)