In [1]:
%run data_getter_and_processor.ipynb
import xgboost 
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import scale

# Obtain Training and testing data.
train_x, test_x, train_y, test_y = get_split_train_data(random_state=10)



In [5]:
from sklearn.preprocessing import PolynomialFeatures

# Xgboost without fixing class imbalance.
# Processing the data to fix imballance of classes.
worst_stress_levels = train_y.loc[:, "worst_stress_level"]
# balanced_train_x, worst_stress_levels = get_balanced_dataset_by_hybrid_approach(train_x, worst_stress_levels)
balanced_train_x, worst_stress_levels = train_x, worst_stress_levels


# # applying basis expansion.
# poly = PolynomialFeatures(degree=2 ,interaction_only=True)
# balanced_train_x = poly.fit_transform(balanced_train_x, worst_stress_levels)
# test_x = poly.fit_transform(test_x, test_y)

# Scaling the data.
balanced_train_x = scale(balanced_train_x)
test_x = scale(test_x)

# Xgboost code.
dtrain = xgboost.DMatrix(balanced_train_x, label=worst_stress_levels)
param = {'max_depth': 9, 'eta': 1, 'silent': 1, 'objective': 'multi:softmax','rate_drop': 0.1,'skip_drop': 0.5, 'lambda': 1.5}
param['nthread'] = 4
param['eval_metric'] = 'mlogloss'
param['num_class'] = 5
param['booster'] = 'dart'

evallist = [(test_x, 'eval'), (dtrain, 'train')]
num_round = 20

# Training Xgboost.
bst = xgboost.train(param, dtrain, num_round)
dtest = xgboost.DMatrix(test_x)
ypred = bst.predict(dtest)

In [6]:
# Evaluate.
from IPython.display import display

ypred_df = pd.DataFrame(ypred)

score = accuracy_score(test_y.loc[:,"worst_stress_level"], ypred_df, normalize=True)
f1 = f1_score(test_y.iloc[:,0], ypred_df, average=None)
p_score = precision_score(test_y.iloc[:,0], ypred_df, average=None)
r_score = recall_score(test_y.iloc[:,0], ypred_df, average=None)

print("Worst stress levels accuracy is "+ str(score * 100) + " %")
print("Worst stress levels f_1 score ", f1)
print("Worst stress levels precision score ", p_score)
print("Worst stress levels recall score ", r_score)
print(np.array(ypred))

Worst stress levels accuracy is 50.6024096386 %
Worst stress levels f_1 score  [ 0.42666667  0.35164835  0.62992126  0.38235294  0.2       ]
Worst stress levels precision score  [ 0.43243243  0.4         0.57971014  0.39393939  1.        ]
Worst stress levels recall score  [ 0.42105263  0.31372549  0.68965517  0.37142857  0.11111111]
[ 1.  2.  2.  3.  0.  2.  2.  2.  2.  2.  2.  2.  2.  1.  2.  0.  2.  3.
  2.  0.  1.  0.  2.  2.  3.  1.  2.  2.  2.  0.  2.  2.  2.  2.  3.  2.
  2.  2.  2.  0.  2.  2.  2.  1.  2.  2.  2.  0.  2.  0.  3.  2.  1.  2.
  2.  2.  3.  3.  2.  2.  1.  2.  2.  0.  2.  2.  3.  2.  1.  0.  1.  2.
  3.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  1.  2.  1.  2.  1.  0.
  3.  2.  0.  2.  2.  2.  0.  1.  3.  2.  2.  2.  3.  2.  0.  3.  2.  1.
  0.  1.  2.  0.  3.  2.  1.  0.  1.  1.  2.  2.  2.  2.  3.  2.  2.  2.
  0.  3.  1.  2.  3.  0.  3.  2.  3.  0.  3.  2.  0.  0.  2.  1.  2.  2.
  1.  3.  1.  2.  3.  1.  1.  4.  2.  1.  1.  2.  2.  0.  2.  2.  0.  2.
  1.  0