In [1]:
import pandas as pd
import numpy as np
import pickle as pk
import os
import glob
import zlib

from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold


In [2]:
## Data input

# Unpickle the dataset file, which holds a (graph, pdgs, labels) triple.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
with open('dataset_nodes_pdgs_labels.p', 'rb') as fd:
    dataset = pk.load(fd)
graph, pdgs, labels = dataset

# Wrap the labels in a Series and read the pickled feature table.
labels = pd.Series(labels)
nodes = pd.read_pickle("./features.pkl")



In [3]:
## Features data

# Plain-text preview of the first rows of the feature table.
first_rows = nodes.head()
print(first_rows)

   feature1  feature2  feature3   feature4  feature5  feature6  feature7  \
0      17.0      53.0     128.0  12.028289  0.048538  0.249053    1270.0   
1      18.0      54.0     128.0  32.379967 -0.017928  2.538584    1329.0   
2      19.0      54.0     128.0  39.963924 -0.009562  2.760196    1398.0   
3      19.0      54.0     128.0  29.785406 -0.011091  2.890076    1408.0   
4      20.0      54.0     128.0  69.445038 -0.019203  3.075611    1486.0   

   feature8  feature9  
0    4378.0   12097.0  
1    4439.0   12077.0  
2    4432.0   12070.0  
3    4442.0   12080.0  
4    4402.0   12040.0  


In [4]:
## Features:

# Descriptive column names, listed in the positional order of the generic
# feature1..feature9 columns they replace.
feature_names = [
    "neighbour_3",         # feature1
    "neighbour_10",        # feature2
    "neighbour_30",        # feature3
    "charge_map",          # feature4
    "dot_product",         # feature5
    "angle",               # feature6
    "charge_distance_3",   # feature7
    "charge_distance_10",  # feature8
    "charge_distance_30",  # feature9
]

# Rename the columns in place (by position), then show summary statistics.
nodes.columns = feature_names

nodes.describe()

Unnamed: 0,neighbour_3,neighbour_10,neighbour_30,charge_map,dot_product,angle,charge_distance_3,charge_distance_10,charge_distance_30
count,7030533.0,7030533.0,7030533.0,7030533.0,7030533.0,7030533.0,7030533.0,7030533.0,7030533.0
mean,24.97649,111.3293,348.3956,57.05963,0.1804495,1.490602,1454.06,6613.205,20132.32
std,10.76896,59.71821,177.9324,52.1572,19.69706,1.217663,1110.965,4996.266,12931.29
min,0.0,0.0,0.0,0.005439834,-2123.361,0.0,0.0,0.0,0.0
25%,18.0,72.0,224.0,29.59888,-0.02227991,0.145109,871.0,3419.0,10685.0
50%,27.0,100.0,308.0,40.82213,0.0002128416,1.484194,1129.0,4317.0,14525.0
75%,31.0,133.0,451.0,66.5498,0.02362917,2.857099,1755.0,8651.0,28952.0
max,135.0,532.0,1754.0,3050.392,28492.25,3.141593,18888.0,46138.0,110183.0


In [5]:
## Create new labels

# Old labels
# 0 = track
# 1 = heavy track
# 2 = shower

# New labels
# 0 = track
# 1 = shower

# Vectorised relabelling: one comparison over the whole Series instead of a
# Python-level loop with per-element list appends. Anything that is not
# strictly below 2 (including NaN) is treated as a shower, exactly like the
# `else` branch of the element-wise version.
is_track = np.asarray(labels < 2, dtype=bool)

# Fresh Series with a default integer index: 0 = track, 1 = shower.
new_labels = pd.Series((~is_track).astype("int64"))
# Plain-list form of the same labels, kept for code that expects this name.
new_labels_list = new_labels.tolist()

# Class counts as plain Python ints so the ratios below are ordinary floats.
track_number = int(is_track.sum())
shower_number = int(is_track.size) - track_number

print("Total number of events: ", track_number+shower_number)
print("Total number of tracks: ", track_number)
print("Total number of showers: ", shower_number)
print("Ratio of tracks: ", track_number/(track_number+shower_number))
print("Ratio of showers: ", shower_number/(track_number+shower_number))

Total number of events:  7030533
Total number of tracks:  2315092
Total number of showers:  4715441
Ratio of tracks:  0.32929110780078835
Ratio of showers:  0.6707088921992116


In [6]:
## Split the data

from sklearn.model_selection import train_test_split

# Features and targets. The binary track/shower labels are the target here;
# the original multi-class `labels` Series is the alternative.
X, y = nodes, new_labels

# Unshuffled split: 60% of the rows go to training, the remainder to testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.6,
    test_size=None,
    shuffle=False,
    random_state=0,
)


In [7]:
## Preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# RobustScaler is the active scaler; MinMaxScaler() is the drop-in alternative.
scaler = RobustScaler()

# Learn the scaling statistics from the training split only ...
scaler.fit(X_train)

# ... and apply that same fitted transform to both splits.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [8]:
## Boosted Decision Tree - eXtreme Gradient Boosting

# import XGBoost classifier
from xgboost import XGBClassifier

# import cross_val_score for cross-validation
from sklearn.model_selection import cross_val_score

# Classifier built with the library's default hyper-parameters. (A second,
# fully spelled-out XGBClassifier(...) expression used to follow here; its
# result was never assigned, so it had no effect on `xgbc` and was removed.)
xgbc = XGBClassifier()

# Fit on the whole training split; this fitted model is the one used for
# prediction on the test split.
xgbc.fit(X_train, y_train)

# 5-fold cross-validation on the training split. cross_val_score fits clones
# of the estimator, so the fitted `xgbc` above is left as it is.
scores = cross_val_score(xgbc, X_train, y_train, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())



Mean cross-validation score: 0.72


In [9]:
# 10-fold cross-validation with shuffling on the training split. The fixed
# random_state makes the fold assignment (and therefore the reported score)
# reproducible from one run to the next.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
kf_cv_scores = cross_val_score(xgbc, X_train, y_train, cv=kfold)
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

## Predicting with confusion matrix
# Predict the held-out test split with the fitted classifier.
ypred = xgbc.predict(X_test)
# Rows are the true classes, columns the predicted classes
# (0 = track, 1 = shower).
cm = confusion_matrix(y_test, ypred)
print(cm)

K-fold CV average score: 0.73
[[ 472881  446035]
 [ 354903 1538395]]
