## 11.2 ウォーターフォールチャート

In [1]:
import pandas as pd
import json
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from plotly import graph_objects as go
from plotly.graph_objs.layout import Template

# Load the Diabetes dataset as pandas objects (features and target).
df_X, df_y = load_diabetes(as_frame=True, return_X_y=True)

# Keep only the seven feature columns used in this section.
df_X = df_X.loc[:, ['age', 'sex', 'bmi', 'bp', 's1', 's3', 's5']]

df_X


Unnamed: 0,age,sex,bmi,bp,s1,s3,s5
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.043401,0.019907
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,0.074412,-0.068332
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.032356,0.002861
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,-0.036038,0.022688
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.008142,-0.031988
...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.028674,0.031193
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,-0.028674,-0.018114
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.024993,-0.046883
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,-0.028674,0.044529


In [2]:
# Split into training and test sets, then renumber the rows of every part
# from 0 so that positional and label-based indexing agree.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df_X, df_y, test_size=0.2, random_state=0)
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((353, 7), (89, 7), (353,), (89,))

In [3]:
# Show the test features (row labels were reset to start at 0 after the split).
X_test

Unnamed: 0,age,sex,bmi,bp,s1,s3,s5
0,0.019913,0.050680,0.104809,0.070072,-0.035968,-0.024993,0.003709
1,-0.012780,-0.044642,0.060618,0.052858,0.047965,-0.017629,0.070207
2,0.038076,0.050680,0.008883,0.042529,-0.042848,-0.039719,-0.018114
3,-0.012780,-0.044642,-0.023451,-0.040099,-0.016704,-0.017629,-0.038460
4,-0.023677,-0.044642,0.045529,0.090729,-0.018080,0.070730,-0.034522
...,...,...,...,...,...,...,...
84,-0.070900,0.050680,-0.089197,-0.074527,-0.042848,-0.032356,-0.012909
85,0.001751,-0.044642,-0.070875,-0.022885,-0.001569,0.026550,-0.022517
86,-0.074533,-0.044642,0.043373,-0.033213,0.012191,0.063367,-0.027129
87,-0.041840,0.050680,0.014272,-0.005670,-0.012577,-0.072854,0.035459


In [4]:
# Train a linear regression model on the training set.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

model.intercept_    # intercept (constant term) after fitting

152.480932725711

In [5]:
model.coef_     # coefficients after fitting, in the column order of X_train

array([ -26.62269923, -233.74864265,  574.32396316,  302.97781081,
       -191.37502339, -247.08884087,  598.19908899])

In [6]:
# Run inference on the test set
y_pred = model.predict(X_test)

y_pred.shape

(89,)

In [7]:
# Prediction for test sample 0
y_pred[0]

236.80637201611344

In [8]:
# Check that intercept + sum(coefficient * feature value) reproduces the
# prediction for test sample 0 (equal up to floating-point rounding).
model.intercept_ + (model.coef_ * X_test.iloc[0]).sum()

236.80637201611347

In [9]:
# Decompose the prediction for test sample 0 into additive terms:
# the intercept, followed by one (coefficient * feature value) term per feature.
factors = pd.concat([
    pd.Series({'intercept': model.intercept_}),
    model.coef_ * X_test.iloc[0],
])

factors

intercept    152.480933
age           -0.530144
sex          -11.846409
bmi           60.194142
bp            21.230352
s1             6.883335
s3             6.175407
s5             2.218757
dtype: float64

In [10]:
# Build the trace: waterfall chart of the prediction for test sample 0
trace = go.Waterfall(
    x=factors.index,    # categories on the x axis (term names)
    y=factors,          # values on the y axis (contribution of each term)
    orientation='v'     # chart direction (vertical bars)
)

# Load the custom template. JSON is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform default, which can
# mis-decode or fail on non-UTF-8 locales.
with open('custom_white.json', encoding='utf-8') as f:
    custom_white_dict = json.load(f)

# The file handle is not needed once the JSON is parsed, so the Template is
# built after the `with` block has closed it.
template = Template(custom_white_dict, _validate=False)

# Build the layout
layout = go.Layout(
    template=template,
    title='Diabetes dataset',
    xaxis={'title': 'Factor'},
    yaxis={'title': 'Value'}
)

# Build the figure
figure = go.Figure(trace, layout)

# figure.write_image('./figure/out_11_2_1.png', width=900, height=450, scale=2)
figure

In [11]:
# Append the total (= the model's prediction) as the last entry.
# Any existing 'total' entry is left out of the sum, so re-running this cell
# does not fold the previous total into the new one and double the value.
factors['total'] = factors.drop(labels='total', errors='ignore').sum()

factors

intercept    152.480933
age           -0.530144
sex          -11.846409
bmi           60.194142
bp            21.230352
s1             6.883335
s3             6.175407
s5             2.218757
total        236.806372
dtype: float64

In [12]:
# Flag the 'total' entry as an absolute value and every other entry as a
# relative (incremental) value.
measure = [
    'relative' if feature_name != 'total' else 'absolute'
    for feature_name in factors.index
]

measure

['relative',
 'relative',
 'relative',
 'relative',
 'relative',
 'relative',
 'relative',
 'relative',
 'absolute']

In [13]:
# Build the trace: waterfall chart for test sample 0, now including the total
trace = go.Waterfall(
    orientation='v',
    x=factors.index,
    y=factors,
    measure=measure,              # relative / absolute flag for each bar
    text=factors,                 # values shown as bar labels
    texttemplate='%{text:.2f}',   # label format (two decimal places)
    textposition='outside',       # put the labels outside the bars
)

# Build the layout
layout = go.Layout(
    template=template,
    title='Diabetes dataset',
    xaxis=dict(title='Factor'),
    yaxis=dict(title='Target', range=[0, 260]),
)

# Build the figure
figure = go.Figure(data=trace, layout=layout)

# figure.write_image('./figure/out_11_2_2.png', width=900, height=450, scale=2)
figure

In [14]:
# Additive terms of the prediction for test sample 85 (intercept plus one
# coefficient * feature value term per feature), with the total appended.
factors = pd.concat([
    pd.Series({'intercept': model.intercept_}),
    model.coef_ * X_test.iloc[85],
])
factors.loc['total'] = factors.sum()

factors

intercept    152.480933
age           -0.046604
sex           10.434922
bmi          -40.705026
bp            -6.933549
s1             0.300260
s3            -6.560276
s5           -13.469367
total         95.501292
dtype: float64

In [15]:
# Build the trace: horizontal waterfall chart for test sample 85, including the total
trace = go.Waterfall(
    x=factors,
    y=factors.index,
    orientation='h',    # chart direction (horizontal bars)
    # Derive the relative/absolute flags from the Series being plotted instead
    # of reusing a list built from a different sample's index, so the flags
    # always line up with the bars.
    measure=['absolute' if feature_name == 'total' else 'relative' for feature_name in factors.index],
    text=factors,
    texttemplate='%{text:.2f}',
    textposition='outside'
)

# Build the layout (axis titles are swapped relative to the vertical charts)
layout = go.Layout(
    template=template,
    title='Diabetes dataset',
    yaxis={'title': 'Factor'},
    xaxis={'title': 'Value'}
)

# Build the figure
figure = go.Figure(trace, layout)

# figure.write_image('./figure/out_11_2_3.png', width=900, height=450, scale=2)
figure