In [4]:
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier 
from sklearn.model_selection import train_test_split # Import train_test_split function 
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

from pprint import pprint
import matplotlib.pyplot as plt
## Importing required libraries
import numpy as np
%matplotlib inline
#%matplotlib notebook
import seaborn as sns

In [5]:
# Read the HIGGS table; header=None means the columns are labelled 0, 1, 2, ...
df = pd.read_csv('data/HIGGS.csv', header=None)

# NOTE: x_unscaled is the *same object* as df (no copy is made), so removing
# the target column below also removes it from df.
x_unscaled = df

# Column 0 is taken as the target: pop() returns it and deletes it from the
# frame in one step, leaving only the feature columns in x_unscaled.
y = x_unscaled.pop(0)


In [6]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle so the
# same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x_unscaled,
    y,
    test_size=0.3,
    random_state=13,
)

In [7]:
from xgboost import XGBClassifier

# Baseline: an XGBoost classifier with all-default hyper-parameters, fit on the
# current X_train / y_train split.
# The estimator is created and fitted in two separate statements on purpose:
# if the (long) fit is interrupted, `model` is left as a fresh, unfitted
# estimator rather than silently keeping a previously fitted one.
model = XGBClassifier()
model.fit(X=X_train, y=y_train, verbose=True)

# Show the resolved hyper-parameters of the estimator.
print(model)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)


In [8]:
# XGBClassifier.predict() already returns hard class labels (class
# probabilities are what predict_proba() returns), so rounding each element
# in a Python-level loop over the whole test set was redundant and slow.
y_pred = model.predict(X_test)

# Kept as a separate name because the accuracy cell reads `predictions`.
# It is now a NumPy array, matching how the later cells build `predictions`
# directly from `loaded_model.predict(X_test)`.
predictions = y_pred

In [10]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

# Reported as a percentage with two decimals.
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 74.30%


In [16]:
# Print the importance of every feature the fitted model actually used
# (features with zero importance are skipped).
# `i` is the feature's position within the matrix the model was fit on, not a
# column label from the CSV.
for i, w in enumerate(model.feature_importances_):
    if w > 0:
        # `!s` keeps the plain str() rendering of the NumPy scalar.
        print(f"The weight of Feature [{i}] is {w!s}")

The weight of Featire [0] is 0.03752176
The weight of Featire [1] is 0.013957256
The weight of Featire [2] is 0.001136717
The weight of Featire [3] is 0.048096567
The weight of Featire [4] is 0.0011503488
The weight of Featire [5] is 0.075860925
The weight of Featire [6] is 0.014675418
The weight of Featire [7] is 0.0012089922
The weight of Featire [8] is 0.024098396
The weight of Featire [9] is 0.033851933
The weight of Featire [10] is 0.008887056
The weight of Featire [11] is 0.0014490433
The weight of Featire [12] is 0.018606672
The weight of Featire [13] is 0.018180657
The weight of Featire [14] is 0.011617622
The weight of Featire [15] is 0.0014003875
The weight of Featire [16] is 0.024170693
The weight of Featire [17] is 0.017309818
The weight of Featire [18] is 0.010367594
The weight of Featire [19] is 0.0010497295
The weight of Featire [20] is 0.027526133
The weight of Featire [21] is 0.041171163
The weight of Featire [22] is 0.08832629
The weight of Featire [23] is 0.025354093

## High-level features vs low-level features

* Low-level features: the kinematic properties measured by the particle detectors in the accelerator. These are the first 21 features.
* High-level features: quantities derived by physicists to help discriminate between the two classes. These are the remaining 7 features.

In [44]:

# Position at which the feature matrix is cut: the first 21 columns form the
# low-level block, everything after them forms the high-level block.
n_low_level = 21

x_low_level_features = x_unscaled.iloc[:, :n_low_level]
x_high_level_features = x_unscaled.iloc[:, n_low_level:]

# Sanity check: print (rows, columns) for each subset, low-level first.
for subset in (x_low_level_features, x_high_level_features):
    print(subset.shape)


(11000000, 21)
(11000000, 7)


### Split and train only using `x_high_level_features` 

In [50]:
# Same 70/30 split and seed as before, restricted to the high-level features.
# NOTE: this rebinds X_train / X_test / y_train / y_test for all later cells.
X_train, X_test, y_train, y_test = train_test_split(
    x_high_level_features,
    y,
    test_size=0.3,
    random_state=13,
)

In [51]:
from joblib import dump, load

# Fit a default-parameter classifier on whatever X_train / y_train currently
# hold (presumably the high-level split from the preceding cell - verify the
# execution order before trusting the numbers).
model = XGBClassifier()
model.fit(X_train, y_train, verbose=True)

# Persist the fitted estimator to disk.
file = "high_level_features.joblib.dat"
dump(model, file)
print(f"Saved model to: {file}")

# Round-trip: read the estimator back, as a later session would.
# joblib.load is pickle-based, so only load files you produced yourself.
loaded_model = load(file)
print(f"Loaded model from:{file}")

# Score the reloaded estimator on the held-out rows.
predictions = loaded_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Show the hyper-parameters of the in-memory estimator.
print(model)



KeyboardInterrupt: 

In [None]:
# Print the importance of every feature the fitted model actually used
# (features with zero importance are skipped).
# `i` is the feature's position within the matrix the model was fit on, not a
# column label from the CSV; it reports on whichever `model` was fitted last.
for i, w in enumerate(model.feature_importances_):
    if w > 0:
        # `!s` keeps the plain str() rendering of the NumPy scalar.
        print(f"The weight of Feature [{i}] is {w!s}")

### Split and train only using `x_low_level_features` 

In [48]:
# Same 70/30 split and seed as before, restricted to the low-level features.
# NOTE: this rebinds X_train / X_test / y_train / y_test for all later cells.
X_train, X_test, y_train, y_test = train_test_split(
    x_low_level_features,
    y,
    test_size=0.3,
    random_state=13,
)

In [49]:
from joblib import dump, load

# Fit a default-parameter classifier on whatever X_train / y_train currently
# hold (presumably the low-level split from the preceding cell - verify the
# execution order before trusting the numbers).
model = XGBClassifier()
model.fit(X_train, y_train, verbose=True)

# Persist the fitted estimator to disk.
file = "low_level_features.joblib.dat"
dump(model, file)
print(f"Saved model to: {file}")

# Round-trip: read the estimator back, as a later session would.
# joblib.load is pickle-based, so only load files you produced yourself.
loaded_model = load(file)
print(f"Loaded model from:{file}")

# Score the reloaded estimator on the held-out rows.
predictions = loaded_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Show the hyper-parameters of the in-memory estimator.
print(model)



KeyboardInterrupt: 