In [1]:
# 情報量基準とは
# モデルを選択する際に使われるモデルの良し悪しの指標のこと
# 報量規準を計算して，その値が最小のものが「良い」統計モデルとして採用される

In [3]:
# 赤池情報量基準(AIC)について
# データへの統計モデルの当てはまりの良さ(尤度の大きさ)と統計モデルの簡潔さ(パラメータ数の少なさ)のバランスをとる
# AIC = -2*logL + 2k (L：最大尤度、k：パラメータ数)


In [15]:
# 重回帰分析におけるAIC
# AICの値が最も小さくなるような説明変数の組合せ最適化を選ぶことが、重回帰分析におけるAICに基づく変数選択

# ワインデータセットで実践
import pandas as pd
import statsmodels.api as sm
from sklearn.datasets import load_wine

wine_data = load_wine()
wine_df = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names)
wine_df['class'] = wine_data.target
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [16]:
import warnings

warnings.simplefilter('ignore')

y_col = ['class']
y= wine_df[y_col]

# pattern1
x_pattern1_col = ['alcohol']
x = wine_df[x_pattern1_col]
model = sm.GLM(y, sm.add_constant(x), family=sm.families.NegativeBinomial())
result = model.fit()
print('Pattern1 AIC:{}'.format(result.aic.round()))

# pattern2
x_pattern2_col = ['malic_acid']
x = wine_df[x_pattern2_col]
model = sm.GLM(y, sm.add_constant(x), family=sm.families.NegativeBinomial())
result = model.fit()
print('Pattern2 AIC:{}'.format(result.aic.round()))

# pattern3
x_pattern3_col = ['ash','alcalinity_of_ash']
x = wine_df[x_pattern3_col]
model = sm.GLM(y, sm.add_constant(x), family=sm.families.NegativeBinomial())
result = model.fit()
print('Pattern3 AIC:{}'.format(result.aic.round()))

# pattern4
x_pattern4_col = ['magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins']
x = wine_df[x_pattern4_col]
model = sm.GLM(y, sm.add_constant(x), family=sm.families.NegativeBinomial())
result = model.fit()
print('Pattern4 AIC:{}'.format(result.aic.round()))

# pattern5
x_pattern5_col = ['color_intensity','hue','od280/od315_of_diluted_wines']
x = wine_df[x_pattern5_col]
model = sm.GLM(y, sm.add_constant(x), family=sm.families.NegativeBinomial())
result = model.fit()
print('Pattern5 AIC:{}'.format(result.aic.round()))

# AICに基づき、Pattern4のモデルを選択する

Pattern1 AIC:475.0
Pattern2 AIC:472.0
Pattern3 AIC:455.0
Pattern4 AIC:442.0
Pattern5 AIC:444.0


In [21]:
# ベイズ情報量基準(BIC)について
# BIC = -2 * logL + k*log(n) (L：最大尤度、k：パラメータ数、n：サンプル数)
# 上記の式より、n>=8以上の時により簡潔なモデルを選ぶ傾向にある
# AICより好まれている点として、サンプルサイズがn->∞の時、確率1で真のモデルを選択することができることが挙げられる

# ワインデータセットで実践
warnings.simplefilter('ignore')

y_col = ['class']
y= wine_df[y_col]

# pattern1
x_pattern1_col = ['alcohol']
x = wine_df[x_pattern1_col]
model = sm.OLS(y, sm.add_constant(x))
result = model.fit()
print('Pattern1 BIC:{}'.format(result.bic.round()))

# pattern2
x_pattern2_col = ['malic_acid']
x = wine_df[x_pattern2_col]
model = sm.OLS(y, sm.add_constant(x))
result = model.fit()
print('Pattern2 BIC:{}'.format(result.bic.round()))

# pattern3
x_pattern3_col = ['ash','alcalinity_of_ash']
x = wine_df[x_pattern3_col]
model = sm.OLS(y, sm.add_constant(x))
result = model.fit()
print('Pattern3 BIC:{}'.format(result.bic.round()))

# pattern4
x_pattern4_col = ['magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins']
x = wine_df[x_pattern4_col]
model = sm.OLS(y, sm.add_constant(x))
result = model.fit()
print('Pattern4 BIC:{}'.format(result.bic.round()))

# pattern5
x_pattern5_col = ['color_intensity','hue','od280/od315_of_diluted_wines']
x = wine_df[x_pattern5_col]
model = sm.OLS(y, sm.add_constant(x))
result = model.fit()
print('Pattern5 BIC:{}'.format(result.bic.round()))

# BICに基づき、Pattern4のモデルを選択する

Pattern1 BIC:403.0
Pattern2 BIC:386.0
Pattern3 BIC:348.0
Pattern4 BIC:213.0
Pattern5 BIC:224.0


In [27]:
# クロスバリデーション(交差検証)とは
# 各群のデータをK分割し、そのうちの1セットをテスト用として外しておき、
# K-1セットのデータで判別関数を求め、テスト用の1セットのデータで誤判別率を推定する、
# ということをK通り全ての場合を調べ、全ての場合の誤判別率の平均をとるという方法
# K = n(サンプルサイズ)とした場合は「Leave-one-out法」と呼ばれる

# irisデータで実践
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

iris = load_iris()

logreg = LogisticRegression()

kfold = KFold(n_splits=5) # K=5分割で実行する
scores = cross_val_score(logreg, iris.data, iris.target, cv=kfold)
print('Cross-Validation scores: {}'.format(scores))
print('Average score: {}'.format(np.mean(scores)))

lookfold = KFold(n_splits=len(iris.data)) # K=n分割で実行する
scores = cross_val_score(logreg, iris.data, iris.target, cv=lookfold)
print('Leave-one-out scores: {}'.format(scores))
print('Average score: {}'.format(np.mean(scores)))

Cross-Validation scores: [1.         1.         0.86666667 0.93333333 0.83333333]
Average score: 0.9266666666666665
Leave-one-out scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
Average score: 0.9666666666666667
