In [1]:
import os

import numpy as np
import pandas as pd
from root_pandas import read_root
import root_numpy

import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

import xgboost as xgb

from itertools import islice

from tqdm import tqdm

import ROOT

from scripts.data_preparation import add_target_column, concat_df_chunks

from rep.metaml import GridOptimalSearchCV, RegressionParameterOptimizer

In [2]:
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 7)
plt.rcParams['font.size'] = 14

In [3]:
# Directory holding the input tuples; the trailing slash is required because
# the file names are appended by plain string concatenation.
data_dir = '/home/kheinicke/tank/flavourtagging/'
# One sweighted training tuple per file (the MD and MU 2012 samples).
filenames = [
    data_dir + tuple_name
    for tuple_name in (
        'Bu2JpsiK_MuonTrainingTuple_2012_MD_sweighted_kheinick.root',
        'Bu2JpsiK_MuonTrainingTuple_2012_MU_sweighted_kheinick.root',
    )
]

# Read Dataset
Since these files are very large and ROOT applies heavy compression, holding the whole dataset in RAM is not easily possible. Therefore, for now only the first `num_rows` events (currently 10,000, see the next cell) are read.

In [4]:
# Upper bound on the number of tree entries to load; passed as `stop` to read_root below.
num_rows = 10000

In [5]:
# Branch-name patterns to load: the opposite-side muon branches, the B
# candidate branches and the two sWeight columns.
branch_patterns = ['B_OS_Muon*', 'B_*', 'SigYield_sw', 'BkgYield_sw']

# Patterns that are excluded again although they match the patterns above.
ignored_patterns = [
    'B_ENDVERTEX_COV_',
    'B_OWNPV_COV_',
    'B_TOPPV_COV_',
    'B_OS_E*',
    'B_OS_K*',
    'B_All*',
    'B_Hlt*',
    'B_L0*',
]

# Read at most `num_rows` entries of the 'DecayTree' key from the input files,
# keeping only entries that pass the `where` selection.
# NOTE(review): flatten=True presumably expands array-valued branches into one
# row per element -- confirm against the installed root_pandas version.
full_data = read_root(
    filenames,
    key='DecayTree',
    columns=branch_patterns,
    ignore=ignored_patterns,
    stop=num_rows,
    where='(B_LOKI_MASS_JpsiConstr_NoPVConstr>0)',
    flatten=True,
)

In [6]:
# Add the target column: True where the B candidate ID and the opposite-side
# muon ID carry the same sign, False otherwise.
b_id_sign = np.sign(full_data['B_ID'])
muon_id_sign = np.sign(full_data['B_OS_Muon_ID'])
full_data['target'] = b_id_sign == muon_id_sign

In [7]:
# Treat +/-inf like missing values and drop every row that contains any of
# them, so that the classifiers only ever see finite inputs.
full_data = full_data.replace([-np.inf, np.inf], np.nan).dropna()
# Independent copy of the target column, taken after the cleaning step.
labels = full_data['target'].copy(deep=True)

In [8]:
# Layout of the (optional) per-feature histogram grid.
ncols = 3
# All columns that describe the opposite-side muon; these are the features
# used for training further down.
muon_columns = [f for f in full_data.columns if f.startswith('B_OS_Muon')]
# Number of grid rows needed to show every muon column. Computed as an integer
# ceiling: np.ceil would return a float (and `/` would floor first under
# Python 2), while subplot grid dimensions have to be integers.
nrows = -(-len(muon_columns) // ncols)

In [9]:
# plt.figure(figsize=(16, nrows * 4))
# for i, f in enumerate(muon_columns):
#     plt.subplot(nrows, ncols, i+1)
#     _, bins = np.histogram(full_data[f], 50)
#     full_data[labels==True][f].hist(bins=bins, alpha=0.5, label='True', normed=True)
#     full_data[labels==False][f].hist(bins=bins, alpha=0.5, label='False', normed=True)
#     plt.title(f[:24])
#     plt.legend(loc='best')
# plt.tight_layout()
# plt.show()

# Optimal Grid Search

In [10]:
# Hold out 30% of the events for testing; the fixed random_state keeps the
# split reproducible between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    full_data[muon_columns],
    labels,
    test_size=0.3,
    random_state=1,
)

In [11]:
def avg_dilution(labels, prediction, threshhold=0.5):
    """Return the average dilution ``1 - 2 * omega`` of thresholded predictions.

    ``omega`` is the fraction of entries whose thresholded prediction
    (``prediction > threshhold``) differs from the corresponding label.

    Parameters
    ----------
    labels : array-like of bool
        True tag decisions.
    prediction : array-like of float
        Predicted scores that are compared against ``threshhold``.
    threshhold : float, optional
        Scores strictly above this value count as a ``True`` prediction.

    Returns
    -------
    float
        1 if every entry is tagged correctly, -1 if every entry is wrong.
    """
    predicted_tags = prediction > threshhold
    n_wrong = np.sum(labels != predicted_tags)
    mistag_fraction = n_wrong / len(labels)
    return 1 - 2 * mistag_fraction


def tagging_power(efficiency, dilution):
    """Return the tagging power ``efficiency * dilution ** 2``.

    Parameters
    ----------
    efficiency : float
        Tagging efficiency factor.
    dilution : float
        Dilution, e.g. as returned by ``avg_dilution``.
    """
    squared_dilution = dilution ** 2
    return efficiency * squared_dilution


def tagging_power_scorer(estimator, X, y, efficiency=1.0):
    """Score a fitted estimator by the tagging power of its predictions.

    Keeps the ``scorer(estimator, X, y)`` call shape, so the extra argument
    is optional.

    Parameters
    ----------
    estimator : object
        Fitted classifier providing ``predict_proba``.
    X : array-like
        Feature matrix handed to ``estimator.predict_proba``.
    y : array-like of bool
        True labels belonging to ``X``.
    efficiency : float, optional
        Tagging efficiency that multiplies the squared dilution. It used to be
        read from an undefined global name, which raised a ``NameError`` on a
        fresh kernel; it now defaults to 1.0, the value ``tp_scorer`` uses.

    Returns
    -------
    float
        ``efficiency * dilution ** 2`` at the default 0.5 decision threshold.
    """
    # Column 1 of predict_proba is presumably the probability of the positive
    # (True) class -- verify against the estimator's ``classes_``.
    probas = estimator.predict_proba(X)[:, 1]
    dilution = avg_dilution(y, probas)
    return tagging_power(efficiency, dilution)


def tp_scorer(y_true, probas, sample_weight=None):
    """Return the tagging power of ``probas`` at a fixed efficiency of 1.

    The average dilution is printed as a side effect before the score is
    returned. ``sample_weight`` is accepted but not used in the calculation.
    """
    mean_dilution = avg_dilution(labels=y_true, prediction=probas)
    print('Dilution {}'.format(mean_dilution))
    return tagging_power(efficiency=1, dilution=mean_dilution)


def tagging_power_curve(y_true, probas, efficiency=1.0):
    """Scan the decision threshold and return the tagging power at each step.

    Parameters
    ----------
    y_true : array-like of bool
        True labels.
    probas : array-like of float
        Predicted scores that are compared against each threshold.
    efficiency : float, optional
        Tagging efficiency that multiplies the squared dilution. It used to be
        read from an undefined global name, which raised a ``NameError`` on a
        fresh kernel; it now defaults to 1.0, the value ``tp_scorer`` uses.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Tagging powers and the 100 equally spaced thresholds in [0, 1] they
        were evaluated at, in that order.
    """
    thresholds = np.linspace(0, 1, 100)
    powers = [
        tagging_power(efficiency, avg_dilution(y_true, probas, thresh))
        for thresh in thresholds
    ]
    return np.array(powers), thresholds

In [18]:
# Search space handed to the parameter optimizer: each key is a classifier
# parameter name, each value the candidate settings that may be tried.
grid = dict(
    max_depth=np.arange(1, 5),
    learning_rate=np.linspace(0.1, 0.5, 10),
    n_estimators=np.arange(5, 50, 5),
)

In [19]:
from rep.metaml import ClassificationFoldingScorer
from rep.report.metrics import RocAuc

In [20]:
# Figure of merit for each parameter set: ROC AUC wrapped in REP's folding scorer.
roc_auc_metric = RocAuc()
scorer = ClassificationFoldingScorer(roc_auc_metric)

# Parameter proposal strategy working on `grid`.
parameter_generator = RegressionParameterOptimizer(
    grid,
    n_attempts=20,
    n_evaluations=100,
)

In [21]:
# Classifier whose parameters are tuned by the search.
base_classifier = xgb.XGBClassifier(nthread=4)

# Combine classifier, parameter generator and scorer into one search object;
# the candidate evaluations are run with the 'threads-4' parallel profile.
grid_finder = GridOptimalSearchCV(
    base_classifier,
    parameter_generator,
    scorer,
    parallel_profile='threads-4',
)

In [22]:
# Run the parameter search on the training split only (test_data/test_labels are not passed in).
grid_finder.fit(train_data, train_labels)

Performing grid search in 4 threads
4 evaluations done
8 evaluations done
12 evaluations done
16 evaluations done
20 evaluations done
24 evaluations done
28 evaluations done
32 evaluations done
36 evaluations done
40 evaluations done
44 evaluations done
48 evaluations done
52 evaluations done
56 evaluations done
60 evaluations done
64 evaluations done
68 evaluations done
72 evaluations done
76 evaluations done
80 evaluations done
84 evaluations done
88 evaluations done
92 evaluations done
96 evaluations done
100 evaluations done


<rep.metaml.gridsearch.GridOptimalSearchCV at 0x7f3a3c0416a0>

In [23]:
# Display the best parameter combination found by the search.
grid_finder.params_generator.best_params_

OrderedDict([('n_estimators', 35),
             ('learning_rate', 0.41111111111111109),
             ('max_depth', 4)])