In [None]:
import time
import math
import shap
import pickle
import optuna
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
import tensorflow as tf
import matplotlib.pyplot as plt

from scipy.special import ndtri
from decimal import Decimal, ROUND_HALF_UP
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, validation_curve, cross_val_score, learning_curve

# Raise the pandas display limit to 200 columns for DataFrame cell output.
pd.set_option('display.max_columns', 200)

# データの読み込み

In [None]:
# Read the extracted dataset from a CSV in the working directory and display it.
df = pd.read_csv('データ抽出.csv')
df

In [None]:
# Summarise column dtypes and non-null counts of the raw frame.
df.info()

# 前処理

In [None]:
# Running tally that the preprocessing cells below increment with null counts and excluded-row counts.
cut = 0

## 治療法

In [None]:
# Frequency of each value in the treatment column.
df['治療法解析用'].value_counts()

In [None]:
# Count rows without a treatment value once, add that to the tally, and display it.
missing_count = df['治療法解析用'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Remove rows whose treatment value is missing.
df = df.dropna(subset=['治療法解析用'])

In [None]:
# One-hot encode the treatment column (empty prefix: new columns are named by the raw
# category values), discard the '無治療' indicator, and rename two indicators to the
# English names used when the feature matrix is selected later.
df = (
    pd.get_dummies(df, columns=['治療法解析用'], prefix='', prefix_sep='')
    .drop(columns='無治療')
    .rename(columns={'化学療法': 'MTA', '放射線治療': 'Radiation'})
)
df

## 前回治療からの期間

In [None]:
# Count missing Last_Treatment values once, add that to the tally, and display it.
missing_count = df['Last_Treatment'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Replace the spreadsheet error marker '#NUM!' with 0, then map 0 to the sentinel 10000 and cast to int.
# NOTE(review): no dropna is applied to this column although its nulls were added to `cut`;
# astype(int) raises if a NaN is present — confirm the column has no missing values.
# NOTE(review): replace(0, 10000) matches numeric 0 only; if the column was read as strings,
# a literal '0' would not be remapped — verify against the CSV.
df['Last_Treatment'] = df['Last_Treatment'].replace('#NUM!', 0).replace(0, 10000).astype(int)
df['Last_Treatment'].value_counts()

In [None]:
# Compress the range with log10(x + 1) and plot the resulting distribution.
df['Last_Treatment'] = np.log10(df['Last_Treatment'] + 1)
df['Last_Treatment'].hist()

## 年齢

In [None]:
# Count missing Age values once, add that to the tally, and display it.
missing_count = df['Age'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Remove rows whose Age is missing.
df = df.dropna(subset=['Age'])

## 性別

In [None]:
# Count missing Gender values once, add that to the tally, and display it.
missing_count = df['Gender'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Recode Gender from {1, 2} to {0, 1}. The replacements run in sequence (1 -> 0 first, then 2 -> 1),
# so the order matters: reversing it would collapse both codes to 0.
df['Gender'] = df['Gender'].replace(1,  0).replace(2, 1)
df['Gender'].value_counts()

## BMI

In [None]:
# Number of missing BMI values (not added to `cut`; they are imputed below rather than dropped).
df['BMI'].isnull().sum()

In [None]:
# Mean BMI over the non-missing rows; this is the value used for imputation in the next cell.
df['BMI'].mean()

In [None]:
# Mean-impute missing BMI values, then show the resulting value frequencies.
bmi_mean = df['BMI'].mean()
df['BMI'] = df['BMI'].fillna(bmi_mean)
df['BMI'].value_counts()

## 手術回数

In [None]:
# Count missing No_of_Admission values once, add that to the tally, and display it.
missing_count = df['No_of_Admission'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Remove rows with a missing admission count, then store the column as integers.
df = df.dropna(subset=['No_of_Admission'])
df['No_of_Admission'] = df['No_of_Admission'].astype(int)
df['No_of_Admission'].value_counts()

## 個数

In [None]:
# Frequency of each HCC_No value before cleaning.
df['HCC_No'].value_counts()

In [None]:
# Count missing HCC_No values once, add that to the tally, and display it.
missing_count = df['HCC_No'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Alternative kept for reference: impute missing HCC_No with 5 instead of dropping the rows.
#df['HCC_No'] = df['HCC_No'].fillna(5).astype(int)
# Current choice: drop rows with a missing HCC_No and store the column as integers.
df = df.dropna(subset=['HCC_No'])
df['HCC_No'] = df['HCC_No'].astype(int)
df['HCC_No'].value_counts()

In [None]:
# Running total of HCC_No within each run of consecutive rows that share the same Code.
# A run starts wherever Code differs from the row above; detecting run starts with shift()
# needs no sentinel "previous code" value, so a first Code of 0 is handled like any other.
# Rows are grouped by consecutive runs (not by Code globally), which keeps the result
# dependent on the current row order.
code_run = df['Code'].ne(df['Code'].shift()).cumsum()
df['No_Cumsum'] = df.groupby(code_run)['HCC_No'].cumsum()
df['No_Cumsum'].value_counts()

## サイズ

In [None]:
# Frequency of each HCC_size value before cleaning.
df['HCC_size'].value_counts()

In [None]:
# Count missing HCC_size values once, add that to the tally, and display it.
missing_count = df['HCC_size'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Replace the text value 'diffuse' with '1'. This replace is applied to the whole frame,
# not only to HCC_size, so any other column holding exactly 'diffuse' is changed too.
df = df.replace('diffuse', '1')
df = df.dropna(subset=['HCC_size'])
# Round to the nearest integer with ties going up (ROUND_HALF_UP), via Decimal so the
# rounding is done on the decimal string rather than with Python's banker's rounding.
df['HCC_size'] = df['HCC_size'].map(lambda x: int(Decimal(str(x)).quantize(Decimal('0'), rounding=ROUND_HALF_UP)))
df['HCC_size'].value_counts()

## サイズ*個数

In [None]:
# Interaction feature: HCC_No multiplied by HCC_size, row by row.
df['NoSize'] = df['HCC_No'] * df['HCC_size']
df['NoSize'].value_counts()

In [None]:
# Count missing NoSize values once, add that to the tally, and display it.
missing_count = df['NoSize'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Running total of NoSize within each run of consecutive rows that share the same Code,
# floor-divided by 10. A run starts wherever Code differs from the row above; detecting
# run starts with shift() needs no sentinel "previous code" value, so a first Code of 0
# is handled like any other.
code_run = df['Code'].ne(df['Code'].shift()).cumsum()
df['NoSize_Cumsum'] = df.groupby(code_run)['NoSize'].cumsum() // 10
df['NoSize_Cumsum'].value_counts()

## PS

In [None]:
# Frequency of each PS value before cleaning.
df['PS'].value_counts()

In [None]:
# Missing PS values are filled with 0.
df['PS'] = df['PS'].fillna(0).astype(int)
# Keep the integer-valued PS as PS_Raw (selected later for the GBDT features) before one-hot encoding.
df['PS_Raw'] = df['PS']
df = pd.get_dummies(df, columns=['PS'])
# Drop PS_0 so the remaining dummies are relative to PS == 0.
df = df.drop(columns='PS_0')
df['PS_Raw'].value_counts()

## ALBI

In [None]:
# Frequency of each ALBI_score value before cleaning.
df['ALBI_score'].value_counts()

In [None]:
# Count missing ALBI_score values once, add that to the tally, and display it.
missing_count = df['ALBI_score'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Drop rows with a missing ALBI_score, then rescale: multiply by -100 and round to the
# nearest integer with ties going up (ROUND_HALF_UP via Decimal).
df = df.dropna(subset=['ALBI_score'])
df['ALBI_score'] = df['ALBI_score'].map(lambda x: int(Decimal(str(x*(-100))).quantize(Decimal('0'), rounding=ROUND_HALF_UP)))
df['ALBI_score'].value_counts()

## ALBI_grade

In [None]:
# Frequency of each ALBI_grade value before recoding.
df['ALBI_grade'].value_counts()

In [None]:
# Count missing ALBI_grade values once, add that to the tally, and display it.
missing_count = df['ALBI_grade'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Recode string grades to integers: '3' -> 4, '2b' -> 3, '2a' -> 2 (other values are cast as-is).
# '3' must be remapped first; otherwise the '3' produced from '2b' would also become '4'.
# NOTE(review): ALBI_grade itself is never passed to dropna; astype(int) raises on NaN — confirm
# no missing grades remain after the ALBI_score rows were dropped.
df['ALBI_grade'] = df['ALBI_grade'].replace('3', '4').replace('2b', '3').replace('2a', '2').astype(int)
df = pd.get_dummies(df, columns=['ALBI_grade'])
# Drop ALBI_grade_1 so the remaining dummies are relative to grade 1.
df = df.drop(columns='ALBI_grade_1')
df

## AFP

In [None]:
# Missing AFP values are not added to `cut` (the line is disabled) because they are filled below.
#cut += df['AFP'].isnull().sum()
df['AFP'].isnull().sum()

In [None]:
# Missing AFP values are filled with 0.
df['AFP'] = df['AFP'].fillna(0).astype(float)
# Binary flag AFP_100, inserted as the first column: 0 when AFP < 100, otherwise 1.
# The two masks are complementary, so the -1 placeholder is overwritten on every row.
# df.insert raises if the column already exists, so this cell cannot be re-run on the same frame.
df.insert(loc=0, column='AFP_100', value= -1)
df.loc[df['AFP'] < 100, 'AFP_100'] = 0
df.loc[~(df['AFP'] < 100), 'AFP_100'] = 1
df['AFP_100'].value_counts()

## L3

In [None]:
# Missing L3 values are not added to `cut` (the line is disabled) because they are filled below.
#cut += df['L3'].isnull().sum()
df['L3'].isnull().sum()

In [None]:
# Missing L3 values are filled with 0.
df['L3'] = df['L3'].fillna(0).astype(float)
# Binary flag L3_10, inserted as the first column: 0 when L3 < 10, otherwise 1.
# The two masks are complementary, so the -1 placeholder is overwritten on every row.
# df.insert raises if the column already exists, so this cell cannot be re-run on the same frame.
df.insert(loc=0, column='L3_10', value= -1)
df.loc[df['L3'] < 10, 'L3_10'] = 0
df.loc[~(df['L3'] < 10), 'L3_10'] = 1
df['L3_10'].value_counts()

In [None]:
# Cast the flag to int. The preceding cell assigns 0 or 1 to every row, so fillna(0) only
# acts as a safeguard here.
df['L3_10'] = df['L3_10'].fillna(0).astype(int)
df['L3_10'].value_counts()

## PIVKA

In [None]:
# Missing PIVKA values are not added to `cut` (the line is disabled) because they are filled below.
#cut += df['PIVKA'].isnull().sum()
df['PIVKA'].isnull().sum()

In [None]:
# Missing PIVKA values are filled with 0.
df['PIVKA'] = df['PIVKA'].fillna(0).astype(float)
# Binary flag PIVKA_100, inserted as the first column: 0 when PIVKA < 100, otherwise 1.
# The two masks are complementary, so the -1 placeholder is overwritten on every row.
# df.insert raises if the column already exists, so this cell cannot be re-run on the same frame.
df.insert(loc=0, column='PIVKA_100', value= -1)
df.loc[df['PIVKA'] < 100, 'PIVKA_100'] = 0
df.loc[~(df['PIVKA'] < 100), 'PIVKA_100'] = 1
df['PIVKA_100'].value_counts()

## Vp_grade

In [None]:
# Frequency of each Vp_grade value before recoding.
df['Vp_grade'].value_counts()

In [None]:
# Count missing Vp_grade values once, add that to the tally, and display it.
missing_count = df['Vp_grade'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Collapse grades 2, 3 and 4 into 1; other values are left unchanged.
# NOTE(review): rows with a missing Vp_grade were counted in `cut` above but are not dropped here.
df['Vp_grade'] = df['Vp_grade'].replace(2,  1).replace(3, 1).replace(4, 1)
df['Vp_grade'].value_counts()

## Meta0or1

In [None]:
# Frequency of each Meta0or1 value before cleaning.
df['Meta0or1'].value_counts()

In [None]:
# Count missing Meta0or1 values once, add that to the tally, and display it.
missing_count = df['Meta0or1'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Drop rows with a missing Meta0or1, fold the value 2 into 1, and store the column as integers.
df = df.dropna(subset=['Meta0or1'])
df['Meta0or1'] = df['Meta0or1'].replace(2, 1).astype(int)
df['Meta0or1'].value_counts()

## etiology

In [None]:
# Frequency of each etiology code before recoding.
df['etiology_C1B2BC3Alc4NBNC5'].value_counts()

In [None]:
# Count missing etiology codes once, add that to the tally, and display it.
missing_count = df['etiology_C1B2BC3Alc4NBNC5'].isnull().sum()
cut += missing_count
missing_count

In [None]:
# Drop rows with a missing etiology code, rename the column, and map the numeric codes to labels
# (1 -> 'C', 2 -> 'B', 3 -> 'BC', 4 -> 'Alc', 5 -> 'NBNC'), matching the encoding spelled out in the
# original column name.
df = df.dropna(subset=['etiology_C1B2BC3Alc4NBNC5'])
df = df.rename(columns={'etiology_C1B2BC3Alc4NBNC5': 'etiology_class'})
df['etiology_class'] = df['etiology_class'].replace(1,  'C').replace(2, 'B').replace(3, 'BC').replace(4, 'Alc').replace(5, 'NBNC')
df['etiology_class'].value_counts()

In [None]:
# One-hot encode the etiology label.
df = pd.get_dummies(df, columns=['etiology_class'])
# Rows labelled 'BC' are flagged in both the B and the C indicator, so BC needs no column of its own.
df.loc[df['etiology_class_BC'] == 1, 'etiology_class_B'] = 1
df.loc[df['etiology_class_BC'] == 1, 'etiology_class_C'] = 1
# Drop the BC indicator (now redundant) and the NBNC indicator.
df = df.drop(columns=['etiology_class_BC', 'etiology_class_NBNC'])
df

## OS

In [None]:
# Turn the spreadsheet error markers '#VALUE!' and '#REF!' into NaN so they are counted as missing,
# then add the missing count to the tally and display it.
df['OS_day'] = df['OS_day'].replace('#VALUE!', np.nan).replace('#REF!', np.nan)
cut += df['OS_day'].isnull().sum()
df['OS_day'].isnull().sum()

In [None]:
# Drop rows with a missing OS_day and store the column as integers.
df = df.dropna(subset=['OS_day'])
df['OS_day'] = df['OS_day'].astype(int)
df['OS_day'].unique()

## 肝臓がんのみを抽出

In [None]:
# Frequency of each value of the '肝癌症例' flag before filtering.
df['肝癌症例'].value_counts()

In [None]:
# Count the rows whose '肝癌症例' flag is 0 once, add that to the tally, and display it.
excluded_rows = len(df[df['肝癌症例']==0])
cut += excluded_rows
excluded_rows

In [None]:
# Keep only rows whose '肝癌症例' flag equals 1.
df = df[df['肝癌症例'] == 1]

## dfとcutの確認

In [None]:
# The liver-cancer filter (value counts, `cut` update, and the `== 1` selection) is already
# applied by the three cells directly above. Repeating those statements here would add 0 to
# `cut` and re-select the same rows, so this cell only verifies the invariant.
assert (df['肝癌症例'] == 1).all(), "rows with 肝癌症例 != 1 should already have been removed"

In [None]:
# Display the frame after all preprocessing filters.
df

In [None]:
# Display the accumulated tally of missing values and excluded rows.
cut

In [None]:
# Number of remaining rows with No_of_Admission == 1.
len(df[df['No_of_Admission'] == 1])

## 3yearの作成

In [None]:
# Build the binary target '3year' (inserted as the first column, initialised to -1):
#    0 -> OS_day < 1095 and Death1Alive0 == 1
#    1 -> OS_day >= 1095
#   -1 -> neither condition matched (OS_day < 1095 with Death1Alive0 != 1); removed in the next cell
# 1095 is presumably 3 * 365 days — TODO confirm.
# df.insert raises if the column already exists, so this cell cannot be re-run on the same frame.
df['Death1Alive0'] = df['Death1Alive0'].astype(int)
df.insert(loc=0, column='3year', value= -1)
df.loc[(df['OS_day'] < 1095) & (df['Death1Alive0'] == 1), '3year'] = 0
df.loc[df['OS_day'] >= 1095, '3year'] = 1
df['3year'].value_counts()

In [None]:
# Remove rows whose 3-year outcome is unconfirmed (OS_day < 1095 and Death1Alive0 == 0),
# i.e. the rows still carrying the -1 placeholder.
df = df[df['3year'] != -1]

## 学習データ

In [None]:
# Columns used as model inputs. Both the integer PS_Raw and the PS_1..PS_4 dummies are kept
# here; the GBDT and NN sections each drop the representation they do not use.
feature_columns = [
    'Ablation', 'OPE', 'TAE', 'MTA', 'Radiation',
    'Last_Treatment', 'Age', 'Gender', 'BMI', 'No_of_Admission',
    'HCC_No', 'No_Cumsum', 'HCC_size', 'NoSize', 'NoSize_Cumsum',
    'PS_Raw', 'PS_1', 'PS_2', 'PS_3', 'PS_4',
    'ALBI_score', 'AFP_100', 'L3_10', 'PIVKA_100',
    'Vp_grade', 'Meta0or1',
    'etiology_class_C', 'etiology_class_B', 'etiology_class_Alc',
]
data = df.loc[:, feature_columns]
# Binary label built in the '3year' section.
target = df['3year']
data

In [None]:
# Check that every selected feature column has a numeric/boolean dtype.
data.dtypes

# TrainとValidの作成

In [None]:
# 80/20 train/validation split with a fixed seed. No `stratify` argument is passed, so the
# class ratio of the two parts is not forced to match.
x_train, x_valid, y_train, y_valid = train_test_split(data, target, train_size = 0.8, random_state = 5)

In [None]:
# Peek at the first training rows.
x_train.head()

# GBDT

In [None]:
# Baseline XGBoost score on the full dataset with stratified 5-fold cross-validation.
# The PS dummies are dropped here, leaving the integer PS_Raw as the PS feature.
ps_dummy_columns = ['PS_1', 'PS_2', 'PS_3', 'PS_4']
X = data.drop(columns=ps_dummy_columns)
Y = target.to_numpy().ravel()

model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    eta=0.05,
    max_depth=4,
    use_label_encoder=False,
)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = cross_validate(model, X, Y, scoring=['neg_log_loss', 'roc_auc'], cv=kfold)

print(results)
print()
# sklearn reports log loss negated (higher is better); flip the sign for display.
print('LogLoss :', -results['test_neg_log_loss'].mean())
print('AUC :', results['test_roc_auc'].mean())

In [None]:
# Values swept for each hyperparameter; one validation curve is drawn per key.
cv_params = {
    'subsample': [0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.0001, 0.001, 0.01, 0.03, 0.1, 0.3, 1.0, 1.5, 2.0],
    'reg_lambda': [0, 0.0001, 0.001, 0.01, 0.03, 0.1, 0.3, 1.0, 1.5, 2.0],
    'learning_rate': [0, 0.0001, 0.001, 0.01, 0.03, 0.1, 0.3, 1.0],
    'min_child_weight': [1, 3, 5, 7, 9, 11, 13, 15, 18],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'gamma': [0, 0.0001, 0.001, 0.01, 0.03, 0.1, 0.3, 1.0, 1.5, 2.0],
}
# x-axis scale used when plotting each parameter.
param_scales = {
    'subsample': 'linear',
    'colsample_bytree': 'linear',
    'reg_alpha': 'log',
    'reg_lambda': 'log',
    'learning_rate': 'log',
    'min_child_weight': 'linear',
    'max_depth': 'linear',
    'gamma': 'log',
}
# Extra arguments forwarded to fit(). eval_set is the full (X, Y), so early stopping is
# evaluated on all rows, including those of the held-out fold.
fit_params = dict(verbose=0, early_stopping_rounds=20, eval_metric='logloss', eval_set=[(X, Y)])
scoring = 'neg_log_loss'

# Line styling of the two curves: (legend label, matplotlib keyword arguments).
curve_styles = (
    ('training score', {'color': 'blue'}),
    ('validation score', {'color': 'green', 'linestyle': '--'}),
)

# Draw one validation curve per hyperparameter.
for param_name, param_values in cv_params.items():
    train_scores, valid_scores = validation_curve(
        estimator=model,
        X=X, y=Y,
        param_name=param_name,
        param_range=param_values,
        fit_params=fit_params,
        cv=kfold, scoring=scoring,
        n_jobs=-1,
    )
    # Mean across folds as the line, +/- one standard deviation as the shaded band.
    for fold_scores, (curve_label, line_kwargs) in zip((train_scores, valid_scores), curve_styles):
        center = fold_scores.mean(axis=1)
        spread = fold_scores.std(axis=1)
        plt.plot(param_values, center, marker='o', markersize=5, label=curve_label, **line_kwargs)
        plt.fill_between(param_values, center + spread, center - spread,
                         alpha=0.15, color=line_kwargs['color'])
    # Axis scale, labels (parameter name / score name) and legend.
    plt.xscale(param_scales[param_name])
    plt.xlabel(param_name)
    plt.ylabel(scoring)
    plt.legend(loc='lower right')
    plt.show()

In [None]:
start = time.time()

def bayes_objective(trial):
    """Optuna objective: mean cross-validated score for one sampled hyperparameter set.

    Samples the XGBoost hyperparameters from `trial`, applies them to the shared
    notebook-level `model` (mutating it), and returns the mean of `scoring`
    over the `kfold` splits of (X, Y).
    """
    # The suggest_* calls are kept in this order so a seeded sampler draws the same values.
    params = {}
    params['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.5, log=True)
    params['min_child_weight'] = trial.suggest_int('min_child_weight', 2, 8)
    params['max_depth'] = trial.suggest_int('max_depth', 4, 10)
    params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.2, 1.0)
    params['subsample'] = trial.suggest_float('subsample', 0.2, 1.0)
    params['reg_alpha'] = trial.suggest_float('reg_alpha', 0.0001, 0.1, log=True)
    params['reg_lambda'] = trial.suggest_float('reg_lambda', 0.0001, 0.1, log=True)
    params['gamma'] = trial.suggest_float('gamma', 0.0001, 0.1, log=True)

    model.set_params(**params)
    fold_scores = cross_val_score(model, X, Y, cv=kfold,
                                  scoring=scoring, fit_params=fit_params, n_jobs=-1)
    return fold_scores.mean()

# Run the optimisation with a seeded TPE sampler; the score is maximised.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(bayes_objective, n_trials=600)

# Keep and report the best trial and the elapsed wall-clock time.
best_trial = study.best_trial
best_params = best_trial.params
best_score = best_trial.value
print(f'最適パラメータ {best_params}\nスコア {best_score}')
print(f'所要時間{time.time() - start}秒')

In [None]:
# learning_curve and plt are imported in the notebook's first cell, so they are not re-imported here.
# Apply the tuned hyperparameters to the shared estimator.
model.set_params(**best_params)

# Cross-validated scores for ten training-set sizes from 10% to 100%.
train_sizes, train_scores, valid_scores = learning_curve(estimator=model,
                                                         X=X, y=Y,
                                                         train_sizes=np.linspace(0.1, 1.0, 10),
                                                         fit_params=fit_params,
                                                         cv=kfold, scoring=scoring, n_jobs=-1)
# Training score: mean across folds +/- one standard deviation.
train_mean = np.mean(train_scores, axis=1)
train_std  = np.std(train_scores, axis=1)
train_center = train_mean
train_high = train_mean + train_std
train_low = train_mean - train_std
# Validation score: mean across folds +/- one standard deviation.
valid_mean = np.mean(valid_scores, axis=1)
valid_std  = np.std(valid_scores, axis=1)
valid_center = valid_mean
valid_high = valid_mean + valid_std
valid_low = valid_mean - valid_std
# Plot the training curve.
plt.plot(train_sizes, train_center, color='blue', marker='o', markersize=5, label='training score')
plt.fill_between(train_sizes, train_high, train_low, alpha=0.15, color='blue')
# Plot the validation curve.
plt.plot(train_sizes, valid_center, color='green', linestyle='--', marker='o', markersize=5, label='validation score')
plt.fill_between(train_sizes, valid_high, valid_low, alpha=0.15, color='green')
# Annotate the validation score at the largest training size. It is stored under its own
# name so the Optuna `best_score` is not overwritten.
final_valid_score = valid_center[-1]
plt.text(np.amax(train_sizes), valid_low[-1], f'best_score={final_valid_score}',
                color='black', verticalalignment='top', horizontalalignment='right')
# Axis labels (number of training examples / score name) and legend.
plt.xlabel('training examples')
plt.ylabel(scoring)
plt.legend(loc='lower right')
plt.show()

In [None]:
# Values swept around the tuned model; one validation curve is drawn per key.
valid_curve_params = {
    'subsample': [0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.0001, 0.001, 0.01, 0.03, 0.1, 0.3, 1.0, 1.5, 2.0],
    'reg_lambda': [0, 0.0001, 0.001, 0.01, 0.03, 0.1, 0.3, 1.0, 1.5, 2.0],
    'learning_rate': [0, 0.0001, 0.001, 0.01, 0.03, 0.1, 0.3, 1.0],
    'min_child_weight': [1, 3, 5, 7, 9, 11, 13, 15, 18],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'gamma': [0, 0.0001, 0.001, 0.01, 0.03, 0.1, 0.3, 1.0, 1.5, 2.0],
}

# Line styling of the two curves: (legend label, matplotlib keyword arguments).
curve_styles = (
    ('training score', {'color': 'blue'}),
    ('validation score', {'color': 'green', 'linestyle': '--'}),
)

for param_name, param_values in valid_curve_params.items():
    # Make sure the tuned value itself is one of the evaluated points.
    tuned_value = best_params[param_name]
    if tuned_value not in param_values:
        param_values.append(tuned_value)
        param_values.sort()

    # Reset the estimator to the tuned hyperparameters before sweeping one of them.
    model.set_params(**best_params)
    train_scores, valid_scores = validation_curve(
        estimator=model,
        X=X, y=Y,
        param_name=param_name,
        param_range=param_values,
        fit_params=fit_params,
        cv=kfold, scoring=scoring,
        n_jobs=-1,
    )
    # Mean across folds as the line, +/- one standard deviation as the shaded band.
    for fold_scores, (curve_label, line_kwargs) in zip((train_scores, valid_scores), curve_styles):
        center = fold_scores.mean(axis=1)
        spread = fold_scores.std(axis=1)
        plt.plot(param_values, center, marker='o', markersize=5, label=curve_label, **line_kwargs)
        plt.fill_between(param_values, center + spread, center - spread,
                         alpha=0.15, color=line_kwargs['color'])
    # Mark the tuned value with a vertical line.
    plt.axvline(x=tuned_value, color='gray')
    # Axis scale, labels (parameter name / score name) and legend.
    plt.xscale(param_scales[param_name])
    plt.xlabel(param_name)
    plt.ylabel(scoring)
    plt.legend(loc='lower right')
    plt.show()

In [None]:
# GBDT inputs: drop the PS dummies (PS_Raw stays) and wrap both splits as DMatrix objects.
ps_dummy_columns = ['PS_1', 'PS_2', 'PS_3', 'PS_4']
x_train_gbdt = x_train.drop(columns=ps_dummy_columns)
x_valid_gbdt = x_valid.drop(columns=ps_dummy_columns)

dtrain = xgb.DMatrix(x_train_gbdt, label=y_train)
dvalid = xgb.DMatrix(x_valid_gbdt, label=y_valid)
x_train_gbdt.head()

In [None]:
# Hard-coded hyperparameters; this assignment replaces whatever `best_params` the Optuna study produced.
# NOTE(review): presumably the result of an earlier tuning run, pinned for reproducibility — confirm.
best_params = {'learning_rate': 0.08566665798184023, 'min_child_weight': 2, 'max_depth': 6, 'colsample_bytree': 0.6332864539990984, 
               'subsample': 0.8749273245384632, 'reg_alpha': 0.010240948422244883, 'reg_lambda': 0.0006712774237446539, 'gamma': 0.013301717822084034}

In [None]:
# Fixed booster settings, extended with the tuned values (tuned values win on key clashes).
params = {'objective': 'binary:logistic', 'eval_metric': 'logloss', 'booster': 'gbtree'}
params.update(best_params)

num_round = 1000
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
evals_result = {}
# Train for up to num_round rounds; early stopping watches the last watchlist entry ('eval')
# and per-round metrics are recorded in evals_result.
model_GBDT = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=20,
    evals_result=evals_result,
)

In [None]:
# Plot the per-round logloss recorded for the training set and for the validation set.
loss_curves = (('train', 'train logloss'), ('eval', 'eval logloss'))
for split_name, curve_label in loss_curves:
    plt.plot(evals_result[split_name]['logloss'], label=curve_label)
plt.xlabel('rounds')
plt.ylabel('logloss')
plt.grid()
plt.legend()
plt.show()

In [None]:
# XGBoost's built-in feature-importance bar chart, drawn on a large axes so all labels fit.
fig, ax = plt.subplots(figsize=(12, 12))
xgb.plot_importance(model_GBDT, height=0.8, ax=ax)
#plt.savefig('1021_Feature_Importance_3year.jpg', dpi=300)
plt.show()

In [None]:
# SHAP values of the trained booster on the training features, shown as a summary plot.
explainer = shap.TreeExplainer(model_GBDT)
shap_values = explainer.shap_values(x_train_gbdt)
# show=False keeps the figure open so a title can be added before it is displayed.
shap.summary_plot(shap_values, x_train_gbdt, max_display=100, show=False)
plt.title('SHAP_Value_3year.jpg')
#plt.savefig('results/0111_Feature_Importance_3year_SHAP.jpg', dpi=300)
plt.show()

In [None]:
# Display the validation labels; the Series keeps the row labels of the original frame.
y_valid

In [None]:
# Predicted probabilities of the booster on the validation set.
pred_GBDT = model_GBDT.predict(dvalid)

# Scatter of prediction (x) against true label (y); the axis labels follow that order.
plt.figure(figsize=(6, 6))
plt.xlabel("Predict")
plt.ylabel("Actual")
plt.xlim(0, 1)
plt.scatter(pred_GBDT, y_valid)
plt.show()

In [None]:
# Split the validation predictions by true label. Labels are matched by position:
# y_valid keeps the row labels of the original frame, so a label-based lookup such as
# y_valid.get(i) does not correspond to pred_GBDT[i].
valid_labels = y_valid.to_numpy()
gbdt_scores = np.ravel(pred_GBDT)
zero = gbdt_scores[valid_labels == 0]
one = gbdt_scores[valid_labels != 0]

# Overlaid, density-normalised histograms of the predicted probability for each class.
plt.figure(figsize=(10, 6))
bins = np.linspace(0, 1, 50)

plt.hist(zero, bins, alpha = 0.5, label='zero', density = True)
plt.hist(one, bins, alpha = 0.5, label='one', density = True)
plt.legend(loc='upper left')

plt.show()

In [None]:
# ROC curve points (false-positive rate, true-positive rate, thresholds) and the AUC
# of the GBDT predictions on the validation set.
fpr, tpr, thresholds = roc_curve(y_valid, pred_GBDT)
auc_GBDT = roc_auc_score(y_valid, pred_GBDT)

# Draw the chance diagonal first, then the model's ROC curve.
plt.figure(figsize=(6, 6))
plt.title('Training_Cohort_ROC_3year')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Logistic Regression')
#plt.savefig('results/0111_Valid_ROC_3year.jpg', dpi=300)
plt.show()

# Report the AUC.
print(auc_GBDT)

In [None]:
# First 20 true validation labels, for an eyeball comparison with the predictions below.
print(y_valid.values.tolist()[:20])

In [None]:
# First 20 predicted labels, obtained by rounding the predicted probabilities to 0/1.
print(np.round(pred_GBDT).astype(int).tolist()[:20])

In [None]:
# Confusion matrix of true labels against predictions rounded to 0/1
# (rows: true class, columns: predicted class).
true_labels = y_valid.values.tolist()
rounded_preds = np.round(pred_GBDT).astype(int).tolist()
cm = confusion_matrix(true_labels, rounded_preds)
cm

In [None]:
def roc_auc_ci(y_true, y_score, positive=1):
    """Return an approximate 95% confidence interval for the ROC AUC.

    The standard error follows the Hanley & McNeil (1982) formula and the
    interval is AUC +/- 1.96 * SE, clipped to [0, 1].

    Parameters
    ----------
    y_true : array-like
        True labels (list, ndarray or Series).
    y_score : array-like
        Predicted scores for the positive class.
    positive : label, default 1
        Label treated as the positive class, both for the AUC and for the class counts.

    Returns
    -------
    tuple
        (lower, upper) bounds of the interval.
    """
    # Binarise against `positive` so the AUC and the class counts use the same definition
    # of the positive class; np.asarray also makes plain lists work.
    is_positive = np.asarray(y_true).ravel() == positive
    AUC = roc_auc_score(is_positive, y_score)
    N1 = int(is_positive.sum())
    N2 = int(is_positive.size - N1)
    Q1 = AUC / (2 - AUC)
    Q2 = 2*AUC**2 / (1 + AUC)
    SE_AUC = math.sqrt((AUC*(1 - AUC) + (N1 - 1)*(Q1 - AUC**2) + (N2 - 1)*(Q2 - AUC**2)) / (N1*N2))
    # Clip to the valid AUC range.
    lower = max(AUC - 1.96*SE_AUC, 0)
    upper = min(AUC + 1.96*SE_AUC, 1)
    return (lower, upper)

# Confidence interval of the GBDT validation AUC.
roc_auc_ci(y_valid, pred_GBDT)

In [None]:
def _proportion_confidence_interval(r, n, z):
    """Wilson score interval for a proportion of r successes out of n trials.

    `z` is the standard-normal quantile for the desired confidence level.
    Returns (lower, upper).
    """
    z_squared = z**2
    centre = 2*r + z_squared
    margin = z*math.sqrt(z_squared + 4*r*(1 - r/n))
    denominator = 2*(n + z_squared)
    return ((centre - margin)/denominator, (centre + margin)/denominator)

def sensitivity_and_specificity_with_confidence_intervals(TP, FP, FN, TN, alpha):
    """Sensitivity and specificity with their Wilson score intervals.

    `alpha` is the confidence level (e.g. 0.95), not the significance level.
    Returns (sensitivity, specificity, sensitivity_ci, specificity_ci), where each
    interval is a (lower, upper) tuple.
    """
    # Two-sided normal quantile for the requested confidence level.
    z = -ndtri((1.0-alpha)/2)

    # Sensitivity is computed over the actual positives, specificity over the actual negatives.
    actual_positives = TP + FN
    actual_negatives = TN + FP

    sensitivity = TP/actual_positives
    specificity = TN/actual_negatives
    sensitivity_ci = _proportion_confidence_interval(TP, actual_positives, z)
    specificity_ci = _proportion_confidence_interval(TN, actual_negatives, z)

    return sensitivity, specificity, sensitivity_ci, specificity_ci

# Arguments are TP, FP, FN, TN read from the confusion matrix (rows: true class, columns: predicted class).
sensitivity_and_specificity_with_confidence_intervals(cm[1][1], cm[0][1], cm[1][0], cm[0][0], 0.95)

In [None]:
def viz_calibration_curve(y_test, y_pred, name):
    """Show a reliability diagram next to the distribution of predicted scores.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Predicted probabilities of the positive class.
    name : str
        Model name used in the legend and in the plot title.
    """
    frac_of_pos, mean_pred_value = calibration_curve(y_test, y_pred, n_bins=10)

    fig, ax = plt.subplots(1, 2, figsize=(15,6))
    # Left panel: fraction of positives per bin against the mean predicted value.
    ax[0].plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
    ax[0].plot(mean_pred_value, frac_of_pos, marker="o", label=f'{name}')
    ax[0].set_xlabel("Mean predicted value")
    ax[0].set_ylabel("Fraction of positives")
    ax[0].set_ylim([-0.05, 1.05])
    ax[0].legend(loc="lower right")
    ax[0].set_title(f'Calibration plot 3year ({name})')

    # Right panel: histogram + KDE of the predicted scores. sns.distplot is deprecated;
    # histplot with kde=True and stat='density' is its documented replacement.
    sns.histplot(np.ravel(y_pred), bins=100, kde=True, stat='density',
                 label='predicted score', ax=ax[1])
    ax[1].legend(loc='upper right')
    ax[1].set_xlim([-0.05, 1.05])
    #plt.savefig('results/0111_calibration_3year.jpg', dpi=300)
    plt.show()

# Reliability diagram and score distribution for the GBDT validation predictions.
viz_calibration_curve(y_valid, pred_GBDT, 'XGBoost')

# NN

In [None]:
# NN inputs: drop the integer PS_Raw (the PS_1..PS_4 dummies stay) and standardise.
scaler = StandardScaler()
x_train_nn = x_train.drop(['PS_Raw'], axis=1)
x_valid_nn = x_valid.drop(['PS_Raw'], axis=1)

# Fit the scaler on the training split only and reuse those statistics for the validation
# split, so both are on the same scale and no validation information enters the preprocessing.
x_train_nn = scaler.fit_transform(x_train_nn)
x_valid_nn = scaler.transform(x_valid_nn)

In [None]:
# Fully connected binary classifier: 16 -> 8 -> 8 ReLU units, 10% dropout after each
# hidden layer, and a single sigmoid output.
layers = tf.keras.layers
model_NN = tf.keras.models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(x_train_nn.shape[1],)),
    layers.Dropout(0.1),
    layers.Dense(8, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(8, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid'),
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

model_NN.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer stack and parameter counts.
model_NN.summary()

In [None]:
# Train for up to 1000 epochs in mini-batches of 32. Training stops once val_loss has not
# improved for 20 epochs, and the weights of the best epoch are restored.
# NOTE(review): the same (x_valid_nn, y_valid) split drives early stopping here and is used
# for the metrics reported in the cells below.
batch_size = 32
epochs = 1000
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

history_NN = model_NN.fit(x_train_nn, y_train, batch_size=batch_size, epochs=epochs, verbose=1, 
                          validation_data = (x_valid_nn, y_valid), callbacks=[early_stopping])

In [None]:
# Per-epoch training metrics as a DataFrame; show the last epochs.
hist = pd.DataFrame(history_NN.history)
hist['epoch'] = history_NN.epoch
hist.tail()

In [None]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss per epoch from a Keras History.

    Draws two figures (accuracy first, then loss) and shows them together.
    """
    records = pd.DataFrame(history.history)
    records['epoch'] = history.epoch

    # (y-axis label, training column, training legend, validation column, validation legend)
    panels = (
        ('Accuracy', 'accuracy', 'Train Accuracy', 'val_accuracy', 'Val Accuracy'),
        ('Loss', 'loss', 'Train Loss', 'val_loss', 'Val Loss'),
    )
    for y_label, train_col, train_label, val_col, val_label in panels:
        plt.figure()
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.plot(records['epoch'], records[train_col], label=train_label)
        plt.plot(records['epoch'], records[val_col], label=val_label)
        plt.legend()
    plt.show()
    
# Accuracy and loss curves of the NN training run.
plot_history(history_NN)

In [None]:
# Predicted probabilities of the NN on the scaled validation features.
pred_NN = model_NN.predict(x_valid_nn)

# Scatter of prediction (x) against true label (y).
plt.figure(figsize=(6, 6))
plt.ylabel("Actual")
plt.xlabel("Predict")
plt.xlim(0, 1)
plt.scatter(pred_NN, y_valid)
plt.show()

In [None]:
# Validation labels as a positional column vector, so they can be indexed like the NN predictions.
y_valid_array = np.array(y_valid).reshape(-1, 1)
y_valid_array.shape

In [None]:
# Split the NN predictions by true label (matched by position).
label_column = y_valid_array[:, 0]
zero = [row[0] for row, label in zip(pred_NN, label_column) if label == 0]
one = [row[0] for row, label in zip(pred_NN, label_column) if not label == 0]

# Overlaid, density-normalised histograms of the predicted probability for each class.
bins = np.linspace(0, 1, 50)
plt.figure(figsize=(10, 6))
for scores, class_label in ((zero, 'Dead'), (one, 'Alive')):
    plt.hist(scores, bins, alpha = 0.5, label=class_label, density = True)
plt.legend(loc='upper left')

plt.show()

In [None]:
# ROC curve points (false-positive rate, true-positive rate, thresholds) and the AUC
# of the NN predictions on the validation set.
fpr, tpr, thresholds = roc_curve(y_valid, pred_NN)
auc_NN = roc_auc_score(y_valid, pred_NN)

# Draw the chance diagonal first, then the model's ROC curve.
plt.figure(figsize=(6, 6))
plt.title('1021_Valid_ROC_3year')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Logistic Regression')
#plt.savefig("1021_Valid_ROC_3year.jpg", dpi=300)
plt.show()

# Report the AUC.
print(auc_NN)

# Total

In [None]:
# Reshape the GBDT predictions to a column vector so they can be averaged element-wise with pred_NN.
pred_GBDT = pred_GBDT.reshape(-1, 1)
pred_GBDT.shape

In [None]:
# Confirm the NN predictions have the same shape as the reshaped GBDT predictions.
pred_NN.shape

In [None]:
# Log loss of the unweighted average of the two models' predicted probabilities.
averaged_probs = (pred_GBDT+pred_NN)/2
logloss_total = log_loss(y_valid, averaged_probs)
logloss_total

In [None]:
# Ensemble prediction: unweighted mean of the NN and GBDT probabilities.
pred_total = (pred_NN+pred_GBDT)/2

In [None]:
# Scatter of ensemble prediction (x) against true label (y); the axis labels follow that order.
plt.figure(figsize=(6, 6))
plt.xlabel("Predict")
plt.ylabel("Actual")
plt.xlim(0, 1)
plt.scatter(pred_total, y_valid)
#plt.savefig("0716_Predict_Actual_Emsemble.jpg", dpi=300)
plt.show()

In [None]:
# First five ensemble predictions.
pred_total[0:5]

In [None]:
# Split the ensemble predictions by true label (matched by position).
label_column = y_valid_array[:, 0]
zero = [row[0] for row, label in zip(pred_total, label_column) if label == 0]
one = [row[0] for row, label in zip(pred_total, label_column) if not label == 0]

# Overlaid, density-normalised histograms of the predicted probability for each class.
bins = np.linspace(0, 1, 50)
plt.figure(figsize=(10, 6))
for scores, class_label in ((zero, 'Dead'), (one, 'Alive')):
    plt.hist(scores, bins, alpha = 0.5, label=class_label, density = True)
plt.legend(loc='upper left')

plt.show()

In [None]:
# ROC curve points (false-positive rate, true-positive rate, thresholds) and the AUC
# of the ensemble predictions on the validation set.
fpr, tpr, thresholds = roc_curve(y_valid, pred_total)
auc_total = roc_auc_score(y_valid, pred_total)

# Draw the chance diagonal first, then the ensemble's ROC curve.
plt.figure(figsize=(6, 6))
plt.title('1021_Valid_ROC_3year')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Logistic Regression')
#plt.savefig("1021_Valid_ROC_3year.jpg", dpi=300)
plt.show()

# Report the AUC.
print(auc_total)

# Save

In [None]:
# Model persistence is disabled; uncomment to write the trained booster (pickle) and the
# Keras model (HDF5) under models/.
#pickle.dump(model_GBDT, open('models/0111_model_GBDT_3year.pickle', 'wb'))
#model_NN.save('models/1022_model_NN_3year.h5')