In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

import numpy
import pandas
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Ridge
from sklearn.ensemble import AdaBoostClassifier
from sklearn.grid_search import GridSearchCV

In [3]:
def train_test_split_group(group_column, *arrays, **kw_args):
   
    from sklearn import cross_validation
    allow_none = kw_args.pop('allow_none', None)

    assert len(arrays) > 0, "at least one array should be passed"
    length = len(arrays[0])
    for array in arrays:
        assert len(array) == length, "different size"

    initial_data = numpy.array(group_column)
    assert len(initial_data) == length, "group column must have the same length"
    group_ids = numpy.unique(initial_data)

    train_indices, test_indices = cross_validation.train_test_split(group_ids, **kw_args)
    train_indices = numpy.in1d(initial_data, train_indices)
    test_indices = numpy.in1d(initial_data, test_indices)

    result = []
    for array in arrays:
        if isinstance(array, pandas.DataFrame):
            result.append(array.iloc[train_indices, :])
            result.append(array.iloc[test_indices, :])
        elif (array is None) and allow_none:
            # specially for checking weights
            result.append(None)
            result.append(None)
        else:
            result.append(numpy.array(array)[train_indices])
            result.append(numpy.array(array)[test_indices])
    return result

In [4]:
data = pd.read_csv("training.csv", sep=",")

In [5]:
data.head()

Unnamed: 0,EventID,Label,Mass,Corrected_mass,Pt,Pt_sum,Pt_min,IP_chi2,IP_chi2_sum,Flight_distance,Pseudorapidity,Track_number_PV,Tracks_number,Tracks_number_passed,Vertex_chi2,Weight
0,0,1,3440.680014,5202.580014,15583.900014,15793.800014,6499.170014,23.863914,281.409014,727.555014,2.581344,0,2,2,1.119164,2.834853
1,0,1,1319.829991,2465.479991,3477.259991,3593.409991,404.050991,34.731791,135.844991,165.981991,2.581401,0,2,1,0.004716,2.834853
2,0,1,2732.810016,5804.080016,9356.570016,9562.870016,406.027016,150.119016,348.274016,414.395016,2.579956,0,2,1,0.016452,2.834853
3,0,1,1674.579998,4423.859998,6889.459998,6981.049998,404.992998,84.545298,167.266998,167.871998,2.585178,0,2,1,0.085944,2.834853
4,0,1,1844.839993,5744.339993,12385.899993,12299.399993,3167.219993,110.864993,251.260993,350.210993,2.590683,0,2,2,1.167173,2.834853


In [6]:
features = list(set(data.columns) - {'EventID', 'Label', 'Weight'})
features

['Corrected_mass',
 'Pt',
 'IP_chi2',
 'Vertex_chi2',
 'Tracks_number',
 'Tracks_number_passed',
 'Mass',
 'Pseudorapidity',
 'IP_chi2_sum',
 'Pt_sum',
 'Pt_min',
 'Flight_distance',
 'Track_number_PV']

In [7]:
training_data, validation_data = train_test_split_group(data.EventID, data, random_state=11, train_size=0.66)

In [8]:
def compute_mean(event_ids, values):
    number_of_sv_in_event = np.bincount(event_ids)
    return np.bincount(event_ids, weights=values) / number_of_sv_in_event

In [9]:
def compute_max(event_ids, values):
    max_values = numpy.zeros(numpy.amax(event_ids)+1)
    for i in range(len(event_ids)):
        if values[i] > max_values[event_ids[i]]:
            max_values[event_ids[i]] = values[i]
    max_values[max_values == 0] = numpy.nan
    return max_values

In [None]:
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [6, 8, 10],
              #'subsamples': [1.0, 0.8, 0.6],
              'min_samples_leaf': [20, 50, 100, 150],
              'max_features': [4, 6, 8, 10]
              }

gbclassifier = GradientBoostingClassifier(n_estimators=100, random_state=42)


gs_cv = GridSearchCV(gbclassifier, param_grid, n_jobs=4).fit(training_data[features], training_data.Label)


gs_cv.best_params_

In [18]:
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'algorithm': ['SAMME', 'SAMME.R'],
              'n_estimators': [50, 100, 150]
              }

abclassifier = AdaBoostClassifier()


gs_cv = GridSearchCV(abclassifier, param_grid, n_jobs=4).fit(training_data[features], training_data.Label)


gs_cv.best_params_

{'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 150}

In [10]:
gbclassifier = GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=0.05,
                                          max_depth=10, max_features=4, min_samples_leaf=100)
gbclassifier.fit(training_data[features], training_data.Label)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=10,
              max_features=4, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=100,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=42,
              subsample=1.0, verbose=0, warm_start=False)

In [11]:
# predict each SV
proba = gbclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.96290436009902136

In [22]:
gbclassifier = GradientBoostingClassifier(n_estimators=2000, random_state=42, learning_rate=0.05,
                                          max_depth=10, max_features=4, min_samples_leaf=100)
gbclassifier.fit(training_data[features], training_data.Label)

# predict each SV
proba = gbclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.95750652982042184

In [12]:
rfclassifier = RandomForestClassifier(max_depth=6, n_estimators=500)
rfclassifier.fit(training_data[features], training_data.Label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [13]:
# predict each SV
proba = rfclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.94322849922427576

In [14]:
abclassifier = AdaBoostClassifier(algorithm='SAMME.R', learning_rate=0.1, n_estimators=150)
abclassifier.fit(training_data[features], training_data.Label)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.1, n_estimators=150, random_state=None)

In [15]:
# predict each SV
proba = abclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.95273395244124004

In [12]:
import xgboost as xgb 

In [17]:
xgbclassifier = xgb.XGBClassifier()
xgbclassifier.fit(training_data[features], training_data.Label)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [18]:
# predict each SV
proba = xgbclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.95864837019200388

In [19]:
xgbclassifier = xgb.XGBClassifier(learning_rate=0.15, max_depth=8)
xgbclassifier.fit(training_data[features], training_data.Label)

# predict each SV
proba = xgbclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.96214855688736522

In [31]:
param_test1 = {
 'max_depth':[4,6,8,10],
 'min_child_weight':[1,3,5,7]
}

gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=150, min_child_weight=1,
gamma=0, subsample=0.8, colsample_bytree=0.8,
objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
param_grid = param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)

gsearch1.fit(training_data[features], training_data.Label)

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.92882, std: 0.00819, params: {'min_child_weight': 1, 'max_depth': 4},
  mean: 0.92863, std: 0.00763, params: {'min_child_weight': 3, 'max_depth': 4},
  mean: 0.92862, std: 0.00782, params: {'min_child_weight': 5, 'max_depth': 4},
  mean: 0.92869, std: 0.00773, params: {'min_child_weight': 7, 'max_depth': 4},
  mean: 0.92908, std: 0.00855, params: {'min_child_weight': 1, 'max_depth': 6},
  mean: 0.93008, std: 0.00799, params: {'min_child_weight': 3, 'max_depth': 6},
  mean: 0.92893, std: 0.00783, params: {'min_child_weight': 5, 'max_depth': 6},
  mean: 0.92934, std: 0.00744, params: {'min_child_weight': 7, 'max_depth': 6},
  mean: 0.92722, std: 0.00840, params: {'min_child_weight': 1, 'max_depth': 8},
  mean: 0.92824, std: 0.00773, params: {'min_child_weight': 3, 'max_depth': 8},
  mean: 0.92866, std: 0.00723, params: {'min_child_weight': 5, 'max_depth': 8},
  mean: 0.92833, std: 0.00780, params: {'min_child_weight': 7, 'max_depth': 8},
  mean: 0.92559, std: 0.00781, params: {

In [37]:
xgbclassifier = xgb.XGBClassifier(learning_rate=0.1, n_estimators=150, min_child_weight=3,
max_depth = 6, gamma=0, subsample=0.8, colsample_bytree=0.8,
objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
xgbclassifier.fit(training_data[features], training_data.Label)

# predict each SV
proba = xgbclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.96415923462390529

In [39]:
param_test3 = {
 'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5]
}

gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=150, max_depth=6,
 min_child_weight=3, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(training_data[features], training_data.Label)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.93008, std: 0.00799, params: {'gamma': 0},
  mean: 0.92990, std: 0.00752, params: {'gamma': 0.1},
  mean: 0.92959, std: 0.00805, params: {'gamma': 0.2},
  mean: 0.93006, std: 0.00732, params: {'gamma': 0.3},
  mean: 0.92939, std: 0.00740, params: {'gamma': 0.4},
  mean: 0.92968, std: 0.00749, params: {'gamma': 0.5}],
 {'gamma': 0},
 0.930076472945256)

In [46]:
xgbclassifier = xgb.XGBClassifier(learning_rate=0.1, n_estimators=150, min_child_weight=3,
max_depth = 6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
xgbclassifier.fit(training_data[features], training_data.Label)

# predict each SV
proba = xgbclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.96471006757655431

In [48]:
param_test4 = {
 'subsample':[0.5, 0.6, 0.7, 0.8, 0.9],
 'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9]
}

gsearch4 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=150, max_depth=6,
 min_child_weight=3, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch4.fit(training_data[features], training_data.Label)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.92891, std: 0.00722, params: {'subsample': 0.5, 'colsample_bytree': 0.5},
  mean: 0.92928, std: 0.00742, params: {'subsample': 0.6, 'colsample_bytree': 0.5},
  mean: 0.92904, std: 0.00778, params: {'subsample': 0.7, 'colsample_bytree': 0.5},
  mean: 0.92917, std: 0.00729, params: {'subsample': 0.8, 'colsample_bytree': 0.5},
  mean: 0.92941, std: 0.00677, params: {'subsample': 0.9, 'colsample_bytree': 0.5},
  mean: 0.92884, std: 0.00716, params: {'subsample': 0.5, 'colsample_bytree': 0.6},
  mean: 0.92879, std: 0.00740, params: {'subsample': 0.6, 'colsample_bytree': 0.6},
  mean: 0.92924, std: 0.00769, params: {'subsample': 0.7, 'colsample_bytree': 0.6},
  mean: 0.92942, std: 0.00706, params: {'subsample': 0.8, 'colsample_bytree': 0.6},
  mean: 0.92936, std: 0.00713, params: {'subsample': 0.9, 'colsample_bytree': 0.6},
  mean: 0.92870, std: 0.00764, params: {'subsample': 0.5, 'colsample_bytree': 0.7},
  mean: 0.92906, std: 0.00772, params: {'subsample': 0.6, 'colsample_bytree'

In [53]:
param_test5 = {
 'subsample':[0.75, 0.8, 0.85],
 'colsample_bytree': [0.75, 0.8, 0.85]
}

gsearch5 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=150, max_depth=6,
 min_child_weight=3, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch5.fit(training_data[features], training_data.Label)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

([mean: 0.92891, std: 0.00722, params: {'subsample': 0.5, 'colsample_bytree': 0.5},
  mean: 0.92928, std: 0.00742, params: {'subsample': 0.6, 'colsample_bytree': 0.5},
  mean: 0.92904, std: 0.00778, params: {'subsample': 0.7, 'colsample_bytree': 0.5},
  mean: 0.92917, std: 0.00729, params: {'subsample': 0.8, 'colsample_bytree': 0.5},
  mean: 0.92941, std: 0.00677, params: {'subsample': 0.9, 'colsample_bytree': 0.5},
  mean: 0.92884, std: 0.00716, params: {'subsample': 0.5, 'colsample_bytree': 0.6},
  mean: 0.92879, std: 0.00740, params: {'subsample': 0.6, 'colsample_bytree': 0.6},
  mean: 0.92924, std: 0.00769, params: {'subsample': 0.7, 'colsample_bytree': 0.6},
  mean: 0.92942, std: 0.00706, params: {'subsample': 0.8, 'colsample_bytree': 0.6},
  mean: 0.92936, std: 0.00713, params: {'subsample': 0.9, 'colsample_bytree': 0.6},
  mean: 0.92870, std: 0.00764, params: {'subsample': 0.5, 'colsample_bytree': 0.7},
  mean: 0.92906, std: 0.00772, params: {'subsample': 0.6, 'colsample_bytree'

In [58]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=150, max_depth=6,
min_child_weight=3, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch6.fit(training_data[features], training_data.Label)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

([mean: 0.92990, std: 0.00752, params: {'reg_alpha': 1e-05},
  mean: 0.92972, std: 0.00756, params: {'reg_alpha': 0.01},
  mean: 0.92953, std: 0.00763, params: {'reg_alpha': 0.1},
  mean: 0.92988, std: 0.00742, params: {'reg_alpha': 1},
  mean: 0.92343, std: 0.00864, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.9298981731919647)

In [59]:
xgbclassifier = xgb.XGBClassifier(learning_rate=0.1, n_estimators=150, min_child_weight=3,
max_depth = 6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27,
reg_alpha = 1e-05)
xgbclassifier.fit(training_data[features], training_data.Label)

# predict each SV
proba = xgbclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.96471006757655442

In [60]:
param_test6 = {
 'learning_rate':[0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
}

gsearch6 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=150, max_depth=6,
min_child_weight=3, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27,
reg_alpha = 1e-05),
param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch6.fit(training_data[features], training_data.Label)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

([mean: 0.92877, std: 0.00858, params: {'learning_rate': 0.05},
  mean: 0.92990, std: 0.00752, params: {'learning_rate': 0.1},
  mean: 0.92790, std: 0.00822, params: {'learning_rate': 0.15},
  mean: 0.92616, std: 0.00813, params: {'learning_rate': 0.2},
  mean: 0.92399, std: 0.00901, params: {'learning_rate': 0.25},
  mean: 0.92038, std: 0.00809, params: {'learning_rate': 0.3},
  mean: 0.91966, std: 0.00742, params: {'learning_rate': 0.35},
  mean: 0.91566, std: 0.00961, params: {'learning_rate': 0.4}],
 {'learning_rate': 0.1},
 0.9298981731919647)

In [62]:
param_test6 = {
 'n_estimators':[150, 300, 500, 700, 1000, 1500, 2000]
}

gsearch6 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=150, max_depth=6,
min_child_weight=3, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27,
reg_alpha = 1e-05),
param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch6.fit(training_data[features], training_data.Label)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

([mean: 0.92990, std: 0.00752, params: {'n_estimators': 150},
  mean: 0.92877, std: 0.00748, params: {'n_estimators': 300},
  mean: 0.92710, std: 0.00711, params: {'n_estimators': 500},
  mean: 0.92481, std: 0.00723, params: {'n_estimators': 700},
  mean: 0.92187, std: 0.00750, params: {'n_estimators': 1000},
  mean: 0.91816, std: 0.00817, params: {'n_estimators': 1500},
  mean: 0.91497, std: 0.00835, params: {'n_estimators': 2000}],
 {'n_estimators': 150},
 0.9298981731919647)

In [65]:
param_test7 = {
 'seed':[5, 15, 25, 35, 50]
}

gsearch7 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=150, max_depth=6,
min_child_weight=3, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1,
reg_alpha = 1e-05),
param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch7.fit(training_data[features], training_data.Label)
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_

([mean: 0.92983, std: 0.00804, params: {'seed': 5},
  mean: 0.92991, std: 0.00788, params: {'seed': 15},
  mean: 0.92967, std: 0.00796, params: {'seed': 25},
  mean: 0.92916, std: 0.00780, params: {'seed': 35},
  mean: 0.92962, std: 0.00829, params: {'seed': 50}],
 {'seed': 15},
 0.9299050154653736)

n_estimators = 1000

In [73]:
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}

optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'roc_auc', cv = 5, n_jobs = -1)

optimized_GBM.fit(training_data[features], training_data.Label)
optimized_GBM.grid_scores_

[mean: 0.92876, std: 0.00710, params: {'min_child_weight': 1, 'max_depth': 3},
 mean: 0.92878, std: 0.00717, params: {'min_child_weight': 3, 'max_depth': 3},
 mean: 0.92831, std: 0.00712, params: {'min_child_weight': 5, 'max_depth': 3},
 mean: 0.92486, std: 0.00811, params: {'min_child_weight': 1, 'max_depth': 5},
 mean: 0.92480, std: 0.00783, params: {'min_child_weight': 3, 'max_depth': 5},
 mean: 0.92499, std: 0.00749, params: {'min_child_weight': 5, 'max_depth': 5},
 mean: 0.91848, std: 0.00874, params: {'min_child_weight': 1, 'max_depth': 7},
 mean: 0.91932, std: 0.00812, params: {'min_child_weight': 3, 'max_depth': 7},
 mean: 0.91904, std: 0.00787, params: {'min_child_weight': 5, 'max_depth': 7}]

In [76]:
optimized_GBM.best_params_

{'max_depth': 3, 'min_child_weight': 3}

In [77]:
cv_params = {'min_child_weight': [2,3]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'max_depth': 3, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}

optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                            scoring = 'roc_auc', cv = 5, n_jobs = -1)

optimized_GBM.fit(training_data[features], training_data.Label)
optimized_GBM.grid_scores_
optimized_GBM.best_params_

{'min_child_weight': 2}

In [78]:
cv_params = {'learning_rate': [0.01, 0.1, 0.2], 'subsample':[0.7, 0.8, 0.9]}
ind_params = {'n_estimators': 1000, 'max_depth': 3, 'min_child_weight': 2,'seed': 0, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}

optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'roc_auc', cv = 5, n_jobs = -1)

optimized_GBM.fit(training_data[features], training_data.Label)
optimized_GBM.grid_scores_
optimized_GBM.best_params_

{'learning_rate': 0.1, 'subsample': 0.8}

In [13]:
xgbclassifier = xgb.XGBClassifier(n_estimators=1000, max_depth=3, min_child_weight=2,seed=0, colsample_bytree=0.8, 
             objective='binary:logistic', learning_rate=0.1, subsample=0.8)
xgbclassifier.fit(training_data[features], training_data.Label)

# predict each SV
proba = xgbclassifier.predict_proba(validation_data[features])
events_ids = np.unique(validation_data.EventID)

# compute number of SVs in each event
number_of_sv_in_event = np.bincount(validation_data.EventID)

# compute predictions for events (take the mean value of predictions for SVs forming an event)
events_proba = compute_max(numpy.array(validation_data.EventID), proba[:, 1])[events_ids]

# compute weights for events 
events_weights = compute_mean(validation_data.EventID, validation_data.Weight)[events_ids]

# compute labels for events 
events_labels = compute_mean(validation_data.EventID, validation_data.Label)[events_ids]

roc_auc_score(events_labels, events_proba, sample_weight=events_weights)

  app.launch_new_instance()


0.96275514032549481