## About

In this notebook we prepare a simple solution for the [kaggle challenge on higgs.](https://inclass.kaggle.com/c/mlhep-2016-higgs-detection)

In [7]:
%matplotlib inline

In [8]:
import matplotlib.pyplot as plt

import pandas
import numpy as np

### Download data

In [None]:
!cd datasets; wget -O public_train_10000.root -nc --no-check-certificate https://2016.mlhep.yandex.net/data/higgs/public_train_10000.root

In [None]:
# you can download training sample with 100000 available events
# uncomment the below row
!cd datasets; wget -O public_train_100000.root -nc --no-check-certificate https://2016.mlhep.yandex.net/data/higgs/public_train_100000.root

In [None]:
!cd datasets; wget -O public_test.root -nc --no-check-certificate https://2016.mlhep.yandex.net/data/higgs/public_test.root

### Read the smallest part of training file and test file

In [9]:
import root_numpy
data = pandas.DataFrame(root_numpy.root2array('datasets/public_train_100000.root'))
test = pandas.DataFrame(root_numpy.root2array('datasets/public_test.root'))

### Define training features

Exclude `event_id`, `target` from the features set

In [10]:
features = list(set(data.columns) - {'event_id', 'target'})
features

['jet3_pt',
 'jet3_eta',
 'm_jjj',
 'mem_phi',
 'jet1_pt',
 'jet4_phi',
 'jet1_phi',
 'jet2_eta',
 'jet3_btag',
 'm_jlv',
 'm_wbb',
 'jet4_pt',
 'jet4_btag',
 'jet2_pt',
 'jet1_btag',
 'm_jj',
 'm_wwbb',
 'jet2_phi',
 'lepton_phi',
 'm_bb',
 'm_lv',
 'jet4_eta',
 'jet2_btag',
 'lepton_pt',
 'mem_pt',
 'lepton_eta',
 'jet3_phi',
 'jet1_eta']

### Prepare high-level features for training

In [11]:
high_level_features = ['m_jj', 'm_jjj', 'm_jlv', 'm_wwbb', 'm_bb', 'm_wbb', 'm_lv']

### Plot histograms for each high-level feature

In [None]:
hist_params = {'normed': True, 'bins': 60, 'alpha': 0.4}
# create the figure
plt.figure(figsize=(16, 25))
for n, feature in enumerate(high_level_features):
    # add sub plot on our figure
    plt.subplot(len(features) // 5 + 1, 3, n+1)
    # define range for histograms by cutting 1% of data from both ends
    min_value, max_value = numpy.percentile(data[feature], [1, 99])
    plt.hist(data.ix[data.target.values == 0, feature].values, range=(min_value, max_value), 
             label='class 0', **hist_params)
    plt.hist(data.ix[data.target.values == 1, feature].values, range=(min_value, max_value), 
             label='class 1', **hist_params)
    plt.legend(loc='best')
    plt.title(feature)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

def get_validated_trained(fitter, data, features, part):
    training_data, validation_data = train_test_split(data, random_state=3747824, train_size=part)
    fitter.fit(training_data[features], training_data.target)
    results = fitter.predict_proba(validation_data[features])
    print 'Validation:', roc_auc_score(validation_data.target, results[:, 1])

def get_full_trained(fitter, data, features):
    fitter.fit(data[features], data.target)

def get_result(fitter, test, features):
    return fitter.predict_proba(test[features])[:, 1]

In [16]:
def get_fitter():
    fitter = RandomForestClassifier(
        5000,
        max_depth=30,
        n_jobs=-1,
        verbose=True,
    )
    return fitter

fitter = get_fitter()
cur_features = features
"""for i in range(1, 5):
    var_name = 'jet%d_eta' % i
    new_var_name = 'jet%d_eta_norm' % i
    cur_features.append(new_var_name)
    data[new_var_name] = 2 * np.arctan(np.exp(-data[var_name]))
    data[new_var_name] = 2 * np.arctan(np.exp(-data[var_name]))"""
#get_full_trained(fitter, data, cur_features)
get_validated_trained(fitter, data, cur_features, 0.6)

[Parallel(n_jobs=-1)]: Done   1 out of 1847 | elapsed:    0.5s remaining: 14.1min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:  2.0min finished
[Parallel(n_jobs=16)]: Done   1 out of  34 | elapsed:    0.0s remaining:    0.6s
[Parallel(n_jobs=16)]: Done 5000 out of 5000 | elapsed:    4.7s finished


Validation: 0.796521417074


In [None]:
importances = fitter.feature_importances_
std = np.std([tree.feature_importances_ for tree in fitter.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print "Feature ranking:"

X = data[cur_features]

for f in range(X.shape[1]):
    print "%d. feature %d %s (%f)" % (f + 1, indices[f], cur_features[f], importances[indices[f]])

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

## Prepare submission to kaggle

In [14]:
# predict test sample
kaggle_proba = get_result(fitter, test, cur_features)
kaggle_ids = test.event_id

[Parallel(n_jobs=16)]: Done   1 out of 2000 | elapsed:    0.5s remaining: 17.2min
[Parallel(n_jobs=16)]: Done 2000 out of 2000 | elapsed:   38.4s finished


In [15]:
from IPython.display import FileLink
def create_solution(ids, proba, filename='baseline.csv'):
    """saves predictions to file and provides a link for downloading """
    pandas.DataFrame({'event_id': ids, 'prediction': proba}).to_csv('datasets/{}'.format(filename), index=False)
    return FileLink('datasets/{}'.format(filename))
    
create_solution(kaggle_ids, kaggle_proba)