In [1]:
# XGBOOST example synthtetic data > 90%

In [2]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import glob
import xgboost as xgb
import seaborn as sns


In [3]:
def label_marker(row):
    if isinstance(row['marker'], str) and 'Natural' in row['marker']:
        return 0
    if isinstance(row['marker'], str) and 'Attack' in row['marker']:
        return 1
    return 

In [4]:
path = './synthetic_data/'

input_df = pd.DataFrame()
files = glob.glob(path+"*.csv")
for name in files:
    df = pd.read_csv(name) 
    input_df = pd.concat([input_df, df], ignore_index=True)

input_df = input_df.replace([np.inf, -np.inf], 0)
input_df = input_df.sample(frac=1) # we suffle the dataframe

In [5]:
# we delete one feature that we believe is wrong
input_df = input_df.drop(['relay3_log'], axis=1)

In [6]:
y_col = 'marker'
X_cols = input_df.loc[:, input_df.columns != y_col].columns

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(input_df[X_cols], input_df[y_col],test_size=0.2, 
                                                    random_state=42)

print(' X_train shape',X_train.shape, '\n', 
      'y_train shape', y_train.shape, '\n',
      'X_test shape',X_test.shape, '\n', 
      'y_test shape', y_test.shape)

 X_train shape (1485518, 127) 
 y_train shape (1485518,) 
 X_test shape (371380, 127) 
 y_test shape (371380,)


In [7]:
from collections import Counter
class_counter = Counter(y_train)
print(' Number of items class 0:', class_counter[0], '\n',
      'Number of items class 1:', class_counter[1])
estimate_imbalancing = class_counter[0]/class_counter[1]
print(' Imbalancing Factor: ',estimate_imbalancing)

 Number of items class 0: 367130 
 Number of items class 1: 1118388
 Imbalancing Factor:  0.32826711302338724


In [8]:
from numpy import nan
regressor = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=1, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.30000012,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=5000, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=0.3278456267435863,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [9]:
regressor.fit(X_train, y_train,verbose=10)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=1, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.30000012,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=5000, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=0.3278456267435863,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [10]:
y_pred = regressor.predict(X_test)
mean_squared_error(y_test, y_pred)

0.1606927

In [11]:
y_pred_round = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, y_pred_round)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 76.46%


In [12]:
#EOF
# 74.54
# 74.46
# 74.92
# 76.18
# 76.46