In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.dpi']= 100


from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_curve, auc
import xgboost as xgb

In [2]:
#/content/SUSY.csv.gz
import os
#print(os.listdir('/content'))
if 'SUSY.csv' in os.listdir('/content'):
  print('file exists')
else:
  print('file downloading')
  !wget http://archive.ics.uci.edu/ml/machine-learning-databases/00279/SUSY.csv.gz
  !gzip -d SUSY.csv.gz
# Link to dataset: http://archive.ics.uci.edu/ml/datasets/SUSY
# 0: background
# 1: signal

file exists


In [None]:
# Load the dataset and attach readable column names.
# SUSY.csv ships without a header row, so header=None is required: with the
# default settings pandas would consume the first event as column names and
# that event would be silently lost once the names are overwritten.
df = pd.read_csv(
    'SUSY.csv',
    header=None,
    names=[
        "label",
        "lepton1-pT", "lepton1-eta", "lepton1-phi",
        "lepton2-pT", "lepton2-eta", "lepton2-phi",
        "missing-energy-magnitude", "missing-energy-phi",
        "MET-rel", "axial-MET", "MR", "MTR2", "R", "MT2", "SR",
        "MDeltaR", "dPhirb", "cos(thetar1)",
    ],
)
df

In [None]:
# Split the events by their 'label' value: rows equal to 1 go to `signal`,
# rows equal to 0 go to `background`.
signal = df[df['label'].eq(1)]
background = df[df['label'].eq(0)]

In [None]:
import numpy as np

In [None]:
# Overlay the signal and background distributions of every column, one
# figure per column, with log-scaled counts.
for col in df.columns:
  # Give both histograms the same range so their 50 bins share identical
  # edges. Without it each call derives its edges from its own subset, the
  # bin widths differ, and the overlaid bar heights are not comparable.
  col_range = (df[col].min(), df[col].max())
  plt.hist(signal[col], alpha=0.4, bins=50, range=col_range, color='b', label='signal')
  plt.hist(background[col], alpha=0.4, bins=50, range=col_range, color='r', label='background')
  plt.xlabel(col)
  plt.yscale('log')
  plt.legend()
  plt.show()

In [None]:
# Column subsets to train on: everything, the high-level block only, or the
# low-level block only. Every subset carries the 'label' column so the same
# "select label / drop label" X-y split works whichever subset is chosen
# (the high-level subset previously lacked it and raised a KeyError there).
df_all = df  # alias of the full frame, not a copy
df_high_level = df[["label", "MET-rel", "axial-MET", "MR", "MTR2", "R", "MT2", "SR", "MDeltaR", "dPhirb", "cos(thetar1)"]]
df_low_level = df[["label", "lepton1-pT", "lepton1-eta", "lepton1-phi", "lepton2-pT", "lepton2-eta", "lepton2-phi", "missing-energy-magnitude", "missing-energy-phi"]]

In [None]:
# Build the NumPy feature matrix and target array from the low-level subset.
# The target is selected with a list of columns, so y stays two-dimensional
# (one column); callers flatten it with .ravel() where a 1-D array is needed.
X = df_low_level.drop(columns='label').to_numpy()
y = df_low_level[['label']].to_numpy()

In [None]:
# Hold out 35% of the events for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=.35,
    random_state=1,
)

In [None]:
# Train a GPU-accelerated XGBoost classifier on the training split and score
# the validation split.
# NOTE: despite its name, `rand` holds an XGBoost model, not a random forest;
# the name is kept so any other cell that refers to it keeps working.

# XGBoost 2.0 deprecated tree_method='gpu_hist' in favour of
# tree_method='hist' combined with device='cuda'. Pick the spelling that
# matches the installed version so older installs behave exactly as before.
if int(xgb.__version__.split('.')[0]) >= 2:
  gpu_params = {'tree_method': 'hist', 'device': 'cuda'}
else:
  gpu_params = {'tree_method': 'gpu_hist'}

rand = xgb.XGBClassifier(
    max_depth=10,
    sampling_method='uniform',
    reg_lambda=2,
    reg_alpha=0.1,
    n_jobs=-1,
    random_state=1,
    **gpu_params,
)
# fit() expects a 1-D target, hence the ravel() on the column-shaped y_train.
rand.fit(X_train, y_train.ravel())
# One row per validation event, one probability column per class
# (column 0 -> label 0 / background, column 1 -> label 1 / signal).
y_pred_xgb = rand.predict_proba(X_valid)

In [None]:
# Distribution of the classifier output, P(signal), on the validation set,
# drawn separately for true background and true signal events.
# The two predict_proba columns are complementary (they sum to one per
# event), so histogramming column 0 against column 1 over all events only
# gives two mirror images and says nothing about class separation.
truth = y_valid.ravel()
signal_score = y_pred_xgb[:, 1]
# A fixed [0, 1] range gives both histograms identical bin edges.
plt.hist(signal_score[truth == 0], label='background', bins=50, range=(0.0, 1.0), histtype='step')
plt.hist(signal_score[truth == 1], label='signal', bins=50, range=(0.0, 1.0), histtype='step')
plt.legend()
plt.xlabel('BDT Output')
plt.ylabel('Counts')  # stray backtick removed from the label text
plt.yscale('log')
plt.show()

In [None]:
# ROC curve on the validation set, drawn as signal efficiency (true-positive
# rate) on x against background rejection (1 - false-positive rate) on y.
fpr_xgb, tpr_xgb, thresholds = roc_curve(y_valid.ravel(), y_pred_xgb[:, 1])
auc_xgb = auc(fpr_xgb, tpr_xgb)

plt.plot(
    tpr_xgb,
    1 - fpr_xgb,
    label=f'XGB, AUC={auc_xgb:.2f}',
)
plt.xlim(0.0, 1.0)
plt.xlabel('Signal Efficiency')
plt.ylabel('Background Rejection')
# Equal aspect ratio keeps the unit square of the ROC plane undistorted.
plt.gca().set_aspect('equal', adjustable='box')
plt.legend(loc='lower left', title_fontsize='x-small')
plt.show()

In [None]:
# Scan the score threshold for the cut that maximises S / sqrt(S + B).
# S and B scale the efficiencies by 100 and 1000 respectively -- presumably
# the expected signal and background yields before any cut; TODO confirm.
S = tpr_xgb * 100
B = fpr_xgb * 1000
# The tiny offset keeps the denominator non-zero where S and B are both zero.
metric = S / np.sqrt(S + B + 1e-9)
opt_index = metric.argmax()
print(thresholds[opt_index], metric[opt_index])

plt.plot(thresholds, metric)
plt.xlim(0.0, 1.0)
plt.xlabel('BDT Cut')
plt.ylabel('Significance')
plt.show()

In [None]:
# Write the ROC working points (false-positive rate, true-positive rate and
# the matching score thresholds) to a CSV file, without the index column.
save = pd.DataFrame(
    {
        "fpr_xgb": fpr_xgb,
        "tpr_xgb": tpr_xgb,
        "thresholds": thresholds,
    }
)
save.to_csv("submission_xgb.csv", index=False)