## ch02-01

### RMSE

In [8]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Toy regression example: ground-truth targets and the model's predictions.
y_true = [1.0, 1.5, 2.0, 1.2, 1.8]
y_pred = [0.8, 1.5, 1.8, 1.3, 3.0]

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print(rmse)

0.5531726674375732


### RMSLE

### Confusion Matrix 混同行列

In [9]:
from sklearn.metrics import confusion_matrix

# Binary labels (1 = positive, 0 = negative) and the corresponding predictions.
y_true = [1, 0, 1, 1, 0, 1, 1, 0]
y_pred = [0, 0, 1, 1, 0, 0, 1, 1]

# Build the boolean masks once and reuse them for all four counts.
true_arr, pred_arr = np.array(y_true), np.array(y_pred)
true_is_pos, true_is_neg = true_arr == 1, true_arr == 0
pred_is_pos, pred_is_neg = pred_arr == 1, pred_arr == 0

tp = np.sum(true_is_pos & pred_is_pos)  # true positives
tn = np.sum(true_is_neg & pred_is_neg)  # true negatives
fp = np.sum(true_is_neg & pred_is_pos)  # false positives
fn = np.sum(true_is_pos & pred_is_neg)  # false negatives

# Hand-built layout: [[TP, FP], [FN, TN]]
cm_1 = np.array([[tp, fp], [fn, tn]])
print(cm_1)

# scikit-learn's layout differs from the hand-built one: [[TN, FP], [FN, TP]]
cm_2 = confusion_matrix(y_true=y_true, y_pred=y_pred)
print(cm_2)

[[3 1]
 [2 2]]
[[2 1]
 [2 3]]


### Accuracy 正答率, Error Rate 誤答率


In [10]:
from sklearn.metrics import accuracy_score
# Binary labels and hard (0/1) predictions for eight records.
y_true = [1, 0, 1, 1, 0, 1, 1, 0]
y_pred = [0, 0, 1, 1, 0, 0, 1, 1]

# Fraction of records whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
print(accuracy)

0.625


### logloss

In [11]:
from sklearn.metrics import log_loss
# Binary labels and, for each record, the value passed to log_loss as its prediction.
y_true = [1, 0, 1, 1, 0, 1]
y_pred = [0.1, 0.2, 0.8, 0.8, 0.1, 0.3]

# Log loss of the predictions against the labels, as computed by scikit-learn.
logloss = log_loss(y_true=y_true, y_pred=y_pred)
print(logloss)

0.7135581778200728


### MAP@K

In [12]:
# Cutoff rank: each record carries at most K ranked predictions.
K = 3

# Relevant labels per record (one inner list per record).
y_true = [
    [1, 2],
    [1, 2],
    [4],
    [1, 2, 3, 4],
    [3, 4],
]

# Ranked predictions per record, best guess first.
y_pred = [
    [1, 2, 4],
    [4, 1, 2],
    [1, 4, 3],
    [1, 2, 3],
    [1, 2, 4],
]

def apk(y_true, y_pred, k=None):
    """Average precision at k (AP@k) for a single record.

    Args:
        y_true: Collection of relevant (correct) labels for the record.
        y_pred: Ranked predictions, best guess first. Must hold at most ``k``
            entries and no repeated label.
        k: Cutoff rank. When omitted, the module-level ``K`` is used, so the
            original two-argument call ``apk(y_true, y_pred)`` is unchanged.

    Returns:
        The sum of precision@i over every rank i at which the prediction is a
        hit, divided by ``min(len(y_true), k)``. Returns 0.0 when that
        denominator is zero (e.g. ``y_true`` is empty) instead of raising
        ZeroDivisionError.

    Raises:
        AssertionError: If ``y_pred`` has more than ``k`` entries or contains
            duplicate labels.
    """
    if k is None:
        k = K  # fall back to the notebook-wide cutoff

    assert len(y_pred) <= k, (
        "y_pred has %d entries but the cutoff is %d" % (len(y_pred), k)
    )
    assert len(np.unique(y_pred)) == len(y_pred), "y_pred contains duplicate labels"

    # Nothing can be scored when there are no relevant labels (or k == 0).
    denominator = min(len(y_true), k)
    if denominator == 0:
        return 0.0

    sum_precision = 0.0
    num_hits = 0
    for rank, label in enumerate(y_pred, start=1):
        if label in y_true:
            num_hits += 1
            # precision@rank, accumulated only at ranks where a hit occurs
            sum_precision += num_hits / rank

    return sum_precision / denominator

def mapk(Y_true, Y_pred):
    """Mean average precision at K over a batch of records.

    Args:
        Y_true: Sequence with one collection of relevant labels per record.
        Y_pred: Sequence with one ranked prediction list per record, paired
            positionally with ``Y_true``.

    Returns:
        The mean of ``apk`` evaluated on each (labels, predictions) pair.
    """
    per_record_ap = []
    for relevant, ranked in zip(Y_true, Y_pred):
        per_record_ap.append(apk(relevant, ranked))
    return np.mean(per_record_ap)

# MAP@K over all sample records.
map_at_k = mapk(y_true, y_pred)
print(map_at_k)

0.6499999999999999


## ch02-02

### XGBoostにおけるカスタム評価指標とカスタム目的関数

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

# Read the training table and separate the 'target' column from the features.
train = pd.read_csv('../dataset/insurance/train_preprocessed.csv')
y_train_total = train['target']
x_train_total = train.drop(columns=['target'])
x_test_total = pd.read_csv('../dataset/insurance/test_preprocessed.csv')

# Use the first of four shuffled folds as the validation split.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
train_idx, val_idx = next(kf.split(x_train_total))
x_train, y_train = x_train_total.iloc[train_idx], y_train_total.iloc[train_idx]
x_val, y_val = x_train_total.iloc[val_idx], y_train_total.iloc[val_idx]

In [16]:
import xgboost
from sklearn.metrics import log_loss

# Wrap the training and validation splits in XGBoost's DMatrix container.
dtrain = xgboost.DMatrix(data=x_train, label=y_train)
dvalid = xgboost.DMatrix(data=x_val, label=y_val)

def logregobj(y_preds, dtrain):
    """Custom objective: gradient and second derivative of the logistic loss.

    Args:
        y_preds: Scores produced by the booster; a sigmoid is applied to them
            here before the derivatives are computed.
        dtrain: Object exposing ``get_label()`` (the training DMatrix in this
            notebook) that supplies the true labels.

    Returns:
        Tuple ``(grad, hess)`` where ``grad = p - y`` and ``hess = p * (1 - p)``
        with ``p = sigmoid(y_preds)``.
    """
    labels = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-y_preds))  # sigmoid of the scores
    first_derivative = prob - labels
    second_derivative = prob * (1.0 - prob)  # second-derivative values
    return first_derivative, second_derivative

def evalerror(y_preds, dtrain):
    """Custom evaluation metric: misclassification rate.

    A score strictly greater than 0.0 counts as a positive prediction; the
    threshold is 0.0 rather than 0.5, so the scores are presumably raw margins
    rather than probabilities.

    Args:
        y_preds: Scores produced by the booster.
        dtrain: Object exposing ``get_label()`` that supplies the true labels.

    Returns:
        Tuple ``('custom-error', rate)`` where ``rate`` is the fraction of
        records whose thresholded prediction differs from the label.
    """
    labels = dtrain.get_label()
    predicted_positive = y_preds > 0.0
    n_wrong = np.count_nonzero(labels != predicted_positive)
    return 'custom-error', float(n_wrong) / len(labels)

num_round = 50
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

# Method 1 (usual): built-in logistic objective with logloss as the eval metric.
params = {'random_state': 71, 'objective': 'binary:logistic', 'eval_metric': 'logloss'}
bst = xgboost.train(params, dtrain, num_boost_round=num_round, evals=watchlist)

y_pred = bst.predict(dvalid)
logloss = log_loss(y_val, y_pred)
print(logloss)

# Method 2: same training run, but with the hand-written objective and metric.
params = {'random_state': 71}
bst = xgboost.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    obj=logregobj,
    feval=evalerror,
)

# A conversion is required here: the predictions are passed through a sigmoid
# before log loss is computed.
y_pred = bst.predict(dvalid)
y_pred = 1.0 / (1.0 + np.exp(-y_pred))
logloss = log_loss(y_val, y_pred)
print(logloss)

[0]	train-logloss:0.54088	eval-logloss:0.55003
[1]	train-logloss:0.45269	eval-logloss:0.47182
[2]	train-logloss:0.39482	eval-logloss:0.42026
[3]	train-logloss:0.35198	eval-logloss:0.38520
[4]	train-logloss:0.32021	eval-logloss:0.36150
[5]	train-logloss:0.29673	eval-logloss:0.34463
[6]	train-logloss:0.27610	eval-logloss:0.32900
[7]	train-logloss:0.25886	eval-logloss:0.31670
[8]	train-logloss:0.24363	eval-logloss:0.30775
[9]	train-logloss:0.23153	eval-logloss:0.30092
[10]	train-logloss:0.22016	eval-logloss:0.29413
[11]	train-logloss:0.20963	eval-logloss:0.28528
[12]	train-logloss:0.19951	eval-logloss:0.27912
[13]	train-logloss:0.19324	eval-logloss:0.27642
[14]	train-logloss:0.18547	eval-logloss:0.27154
[15]	train-logloss:0.17474	eval-logloss:0.26516
[16]	train-logloss:0.16900	eval-logloss:0.26089
[17]	train-logloss:0.16323	eval-logloss:0.25849
[18]	train-logloss:0.15950	eval-logloss:0.25691
[19]	train-logloss:0.15637	eval-logloss:0.25511
[20]	train-logloss:0.14722	eval-logloss:0.25034
[2