# 0. import

## 0-1 import library

In [1]:
# Core numerical / tabular libraries
import numpy as np
import pandas as pd

# Classification algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# Tooling
from IPython.core.display import display # rich-display replacement for print()
# NOTE(review): newer scikit-learn releases reportedly replace `Imputer` with
# `sklearn.impute.SimpleImputer` -- confirm against the installed version.
from sklearn.preprocessing import StandardScaler, Imputer # StandardScaler => standardization; Imputer => missing-value imputation (not one-hot encoding)
from sklearn.pipeline import Pipeline # Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFE, RFECV

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Model persistence (fitted pipelines are saved as .pkl files)
# NOTE(review): newer scikit-learn releases reportedly no longer ship
# `sklearn.externals.joblib`; the standalone `joblib` package is the
# replacement -- confirm against the installed version.
from sklearn.externals import joblib

## 0-2 アルゴリズム設定

In [2]:
# Base estimators. Every estimator that is given a seed here uses
# random_state=1 so that repeated runs produce the same fits.
knn = KNeighborsClassifier()
logistic = LogisticRegression(random_state=1)
random_forest = RandomForestClassifier(random_state=1)
gbc = GradientBoostingClassifier(random_state=1)
mlp = MLPClassifier(
    hidden_layer_sizes=[100],
    activation='logistic',
    solver='lbfgs',
    max_iter=2000,
    random_state=1,
)
# The linear SVM is wrapped in a 5-fold CalibratedClassifierCV; presumably
# this is what lets the notebook call predict_proba on it like on the other
# models.
svc = CalibratedClassifierCV(
    base_estimator=LinearSVC(penalty='l2', dual=False),
    cv=5,
)


def _scaled(estimator):
    """Return a Pipeline: StandardScaler step 'scl' followed by `estimator` as step 'est'."""
    return Pipeline([('scl', StandardScaler()), ('est', estimator)])


# One standardize-then-classify pipeline per algorithm
pipe_knn = _scaled(knn)
pipe_logistic = _scaled(logistic)
pipe_random_forest = _scaled(random_forest)
pipe_gbc = _scaled(gbc)
pipe_mlp = _scaled(mlp)
pipe_svc = _scaled(svc)

# Display name -> pipeline. The names are also used as the .pkl file stems
# when the fitted pipelines are dumped.
clfs_dic = {
    'KNN': pipe_knn,
    'Logistic': pipe_logistic,
    'RandomForest': pipe_random_forest,
    'GradientBoosting': pipe_gbc,
    'MLP': pipe_mlp,
    'LinearSVC': pipe_svc,
}

# 1. 学習用データ(モデリング)

## 1-1 import data

In [3]:
# Paths of the modeling (training) CSV and the scoring (unseen) CSV
model_csv = './data/model.csv'
score_csv = './data/score.csv'

# Categorical columns
dtype_objects = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]

# Quantitative columns (not referenced elsewhere in the visible notebook)
dtype_quantity = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

# Column name -> object, passed to read_csv so that every categorical column
# is read with object dtype
dtype_objects_dic = {column_name: object for column_name in dtype_objects}

# Load the modeling data. Row 0 of the file supplies the column names and the
# categorical columns are forced to object dtype.
dfm = pd.read_csv(model_csv, header=0, dtype=dtype_objects_dic)

# Split the frame by column position:
#   column 0  -> record ID
#   column 1  -> target y
#   column 2+ -> features X
IDm = dfm.iloc[:, [0]]
ym = dfm.iloc[:, [1]]
Xm = dfm.iloc[:, 2:]

# Name of the target column; reused as the column name of the scoring output
y_column = dfm.columns[1]

## 1-2 one-hot

In [4]:
# Categorical columns to one-hot encode
ohe_columns = dtype_objects

# Expand every categorical column into dummy columns. dummy_na=True also adds
# a separate NaN indicator column per categorical variable.
Xm_ohe = pd.get_dummies(
    Xm,
    columns=ohe_columns,
    dummy_na=True,
)

## 1-3 欠損を平均値で置き換え

In [5]:
# Learn the per-column (axis=0) means of the modeling data; NaN entries are
# replaced by them. The fitted imputer is reused later on the scoring data.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(Xm_ohe)

# transform() returns a bare array, so remember the column names and rebuild
# the DataFrame with them.
Xm_ohe_columns = Xm_ohe.columns.values
imputed_values = imp.transform(Xm_ohe)
Xm_ohe = pd.DataFrame(imputed_values, columns=Xm_ohe_columns)

## 1-4 RFE・RFECV

In [6]:
# Estimator whose fitted model ranks the features during elimination
rfe_estimator = random_forest

# Choose RFE or RFECV. RFE keeps a fixed number of features; RFECV would
# instead tune the number of retained features (n_features_to_select).
selector = RFE(rfe_estimator, n_features_to_select=10, step=0.05)
# selector = RFECV(rfe_estimator, step=0.05)

# ym is a one-column DataFrame; flatten it to the 1-D label array fit() expects.
# `.values` is used instead of the deprecated DataFrame.as_matrix(), which
# newer pandas releases no longer provide.
selector.fit(Xm_ohe, ym.values.ravel())

RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
  n_features_to_select=10, step=0.05, verbose=0)

## 1-5 モデリング(前処理済み)

In [7]:
# Narrow the modeling features down to the columns RFE kept.
# selector.support_ is used as a mask over the column-name array, and
# selecting by label keeps the column names on the result.
selected_columns = Xm_ohe_columns[selector.support_]
Xm_fin = Xm_ohe.loc[:, selected_columns]

# 2. 未知データ(スコアリング)

## 2-1 import data

In [8]:
# Load the scoring (unseen) data with the same dtype mapping as the modeling
# data; row 0 of the file supplies the column names.
dfs = pd.read_csv(score_csv, header=0, dtype=dtype_objects_dic)

# Split the frame by column position (this file is read without a target column):
#   column 0  -> record ID
#   column 1+ -> features X
IDs = dfs.iloc[:, [0]]
Xs = dfs.iloc[:, 1:]

## 2-2 one-hot

In [9]:
# Categorical columns to one-hot encode
ohe_columns = dtype_objects

# Expand every categorical column into dummy columns. dummy_na=True also adds
# a separate NaN indicator column per categorical variable.
Xs_ohe = pd.get_dummies(
    Xs,
    columns=ohe_columns,
    dummy_na=True,
)

# 3. 学習用データ(モデリング)と未知データ(スコアリング)

## 3-1 モデリングとスコアリングの比較

In [10]:
# Column-name sets of the one-hot encoded modeling and scoring frames
cols_model = set(Xm_ohe.columns)
cols_score = set(Xs_ohe.columns)

# Columns present in the modeling data but absent from the scoring data
diff1 = cols_model.difference(cols_score)
print('モデルのみに存在する項目: {}'.format(diff1))

# Columns present in the scoring data but absent from the modeling data
diff2 = cols_score.difference(cols_model)
print('スコアのみに存在する項目: {}'.format(diff2))

モデルのみに存在する項目: {'Dependents_3+'}
スコアのみに存在する項目: {'Gender_Unknown'}


## 3-2 モデリングのデータフレーム作成

In [11]:
# Empty float frame that carries only the modeling column layout
dfm_cols = pd.DataFrame(columns=Xm_ohe_columns, dtype=float)

## 3-3 モデリングのデータフレームにスコアリングを結合

In [12]:
# Stack the scoring rows under the empty modeling-layout frame: the result has
# the union of both column sets, with NaN where the scoring data lacks a column
colsmXs_ohe1 = pd.concat([dfm_cols, Xs_ohe], axis=0)

## 3-4 スコアリングのみのカラムを削除

In [13]:
# Columns that exist only in the scoring data are unknown to the fitted
# imputer and feature selector, so drop them
score_only_columns = list(set(Xs_ohe.columns.values) - set(Xm_ohe.columns.values))
colsmXs_ohe2 = colsmXs_ohe1.drop(score_only_columns, axis=1)

## 3-5 スコアリングにない項目を NaN から 0 へ

In [14]:
# Dummy columns seen only in the modeling data are all-NaN after the concat;
# for the scoring rows they are set to 0
model_only_columns = list(set(Xm_ohe.columns.values) - set(Xs_ohe.columns.values))
colsmXs_ohe2.loc[:, model_only_columns] = \
    colsmXs_ohe2.loc[:, model_only_columns].fillna(0, axis=1)

## 3-6 モデリングの並び順に整列

In [15]:
# Put the scoring columns in exactly the modeling column order, because the
# imputer's output is later relabelled by position with the modeling column
# names. reindex(columns=...) replaces the deprecated
# DataFrame.reindex_axis(..., axis=1), which newer pandas releases no longer
# provide.
colsmXs_ohe3 = colsmXs_ohe2.reindex(columns=Xm_ohe.columns.values)

## 3-7 欠損をモデリングの平均値で置き換え

In [16]:
# Fill the remaining NaNs using the imputer that was fitted on the modeling
# data (so the modeling means are used), then restore the column names
imputed_scoring_values = imp.transform(colsmXs_ohe3)
colsmXs_ohe4 = pd.DataFrame(imputed_scoring_values, columns=Xm_ohe_columns)

## 3-8 スコアリング(前処理済み)

In [17]:
# Keep the same RFE-selected columns that were kept for the modeling data
rfe_kept_columns = Xm_ohe_columns[selector.support_]
Xs_fin = colsmXs_ohe4.loc[:, rfe_kept_columns]

# 4. Classification(モデリング)

## 4-1 holdout

In [18]:
def main(est):
    """Fit every pipeline in ``clfs_dic`` on a hold-out split and rank them by one metric.

    The preprocessed modeling data (``Xm_fin`` / ``ym``) is split 80/20 with a
    fixed seed, each pipeline is fitted on the training part, dumped to
    ``'<name>.pkl'`` in the working directory, and scored on the test part.
    The ranking (best score first) is shown with ``display``.

    Parameters
    ----------
    est : str
        Metric to rank by: one of 'accuracy_score', 'precision_score',
        'recall_score', 'f1_score' or 'auc_score'.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``est`` is not one of the metric names above. This is raised
        before any model is fitted or written to disk.

    Notes
    -----
    * The pipelines in ``clfs_dic`` are refitted in place on every call and
      the .pkl files are overwritten on every call.
    * In the displayed table the column 'clf' holds the score and the column
      'est' holds the classifier name; the column names are kept as they are
      so the output layout does not change.
    * NOTE(review): ``Xm_fin`` was imputed and feature-selected using all
      modeling rows before this split, so the hold-out scores may be
      optimistic -- confirm whether that is acceptable.
    """
    print(est)

    Xmc = Xm_fin
    ymc = ym

    # Hold-out validation: 80% train / 20% test, fixed seed
    Xmc_train, Xmc_test, ymc_train, ymc_test = train_test_split(Xmc, ymc, test_size=0.20, random_state=1)

    # Metric name -> scorer evaluated on the hold-out test part
    ests_dic = {
        'accuracy_score': lambda clf: accuracy_score(ymc_test, clf.predict(Xmc_test)),
        'precision_score': lambda clf: precision_score(ymc_test, clf.predict(Xmc_test), average='macro'),
        'recall_score': lambda clf: recall_score(ymc_test, clf.predict(Xmc_test), average='macro'),
        'f1_score': lambda clf: f1_score(ymc_test, clf.predict(Xmc_test), average='macro'),
        'auc_score': lambda clf: roc_auc_score(ymc_test, clf.predict_proba(Xmc_test)[:,1])
    }

    # Resolve the metric up front so an unknown name fails before any
    # fitting or file writing happens
    scorer = ests_dic[est]

    # One-column DataFrame -> 1-D label array. `.values` is used instead of
    # the deprecated DataFrame.as_matrix(), which newer pandas releases no
    # longer provide.
    y_train = ymc_train.values.ravel()

    ls_est = []  # classifier names
    ls_clf = []  # corresponding scores
    for k, clf in clfs_dic.items():
        clf.fit(Xmc_train, y_train)

        # Persist the fitted pipeline; comment this line out to skip dumping
        joblib.dump(clf, '{}.pkl'.format(k), compress=True)

        es = scorer(clf)
        ls_est.append(k)
        ls_clf.append(es)

    est_dict = {'clf': ls_clf, 'est': ls_est}
    est_pd = pd.DataFrame(est_dict)
    est_pd_rank = est_pd.sort_values(by='clf', ascending=False)
    display(est_pd_rank.reset_index(drop=True))
    
# Run the hold-out comparison once per metric; a ruler line is printed before
# each ranking and once more after the last one
for metric_name in ('accuracy_score', 'precision_score', 'recall_score', 'f1_score', 'auc_score'):
    print('----------------------------------------------------------------------------------------')
    main(metric_name)
print('----------------------------------------------------------------------------------------')

----------------------------------------------------------------------------------------
accuracy_score


Unnamed: 0,clf,est
0,0.804878,Logistic
1,0.804878,LinearSVC
2,0.796748,GradientBoosting
3,0.788618,KNN
4,0.756098,RandomForest
5,0.715447,MLP


----------------------------------------------------------------------------------------
precision_score


Unnamed: 0,clf,est
0,0.862098,Logistic
1,0.862098,LinearSVC
2,0.805131,GradientBoosting
3,0.796919,KNN
4,0.727755,RandomForest
5,0.665865,MLP


----------------------------------------------------------------------------------------
recall_score


Unnamed: 0,clf,est
0,0.70696,GradientBoosting
1,0.699176,Logistic
2,0.699176,LinearSVC
3,0.694139,KNN
4,0.67033,RandomForest
5,0.647436,MLP


----------------------------------------------------------------------------------------
f1_score


Unnamed: 0,clf,est
0,0.727514,GradientBoosting
1,0.722556,Logistic
2,0.722556,LinearSVC
3,0.713441,KNN
4,0.683207,RandomForest
5,0.653521,MLP


----------------------------------------------------------------------------------------
auc_score


Unnamed: 0,clf,est
0,0.796703,RandomForest
1,0.775946,GradientBoosting
2,0.749695,Logistic
3,0.746642,LinearSVC
4,0.702839,MLP
5,0.681471,KNN


----------------------------------------------------------------------------------------


# 5. Classification(スコアリング)

## 5-1 ベストモデリングで predict

In [19]:
# (15) Reload the chosen model: the RandomForest pipeline that main() dumped
# under its clfs_dic name
best_model1 = joblib.load('./RandomForest.pkl')

# Preprocessed scoring features
Xsc = Xs_fin

# Class probabilities, two columns per record. The original author labels
# them [P(negative), P(positive)].
# Use best_model1.predict(Xsc) instead to get hard labels.
ysc1 = best_model1.predict_proba(Xsc)

print('----------------------------------------------------------------------------------------')
print(ysc1[:10])

print('----------------------------------------------------------------------------------------')
# Keep only the second probability column and name it after the target
# column. With predict() output, the whole array would be wrapped instead.
dfysc1 = pd.DataFrame(ysc1[:, [1]], columns=[y_column])

display(dfysc1.head())

# Attach the record IDs (joined on the row index)
dfysc1_fin = IDs.join(dfysc1)
display(dfysc1_fin.head())

print('----------------------------------------------------------------------------------------')

----------------------------------------------------------------------------------------
[[ 0.7  0.3]
 [ 1.   0. ]
 [ 0.9  0.1]
 [ 1.   0. ]
 [ 0.8  0.2]
 [ 1.   0. ]
 [ 0.8  0.2]
 [ 0.   1. ]
 [ 0.6  0.4]
 [ 1.   0. ]]
----------------------------------------------------------------------------------------


Unnamed: 0,Loan_Status
0,0.3
1,0.0
2,0.1
3,0.0
4,0.2


Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,0.3
1,LP001022,0.0
2,LP001031,0.1
3,LP001035,0.0
4,LP001051,0.2


----------------------------------------------------------------------------------------


## 5-2 1週目提出

In [20]:
# Write the submission file without the row-index column
dfysc1_fin.to_csv('Classification1_fin.csv', index=False)

## Q1. なぜKNNとRandomForestが小数点第1位までしか出ない？

In [21]:
print('----------------------------------------------------------------------------------------')
# (15) Score the unseen data with every fitted pipeline to compare how
# fine-grained their predicted probabilities are.
# NOTE(review): the coarse values look like vote fractions. The RandomForest
# repr in the RFE output shows n_estimators=10, which would give steps of 0.1,
# and the KNN values are all multiples of 0.2, which would match 5 neighbours
# -- confirm against the estimator parameters.
for k, best_model in clfs_dic.items():
    print('▼ ' + k)
    print('')
    best_model2 = best_model

    # Preprocessed scoring features
    Xsc = Xs_fin

    # Class probabilities, two columns per record. The original author labels
    # them [P(negative), P(positive)].
    # Use best_model2.predict(Xsc) instead to get hard labels.
    ysc2 = best_model2.predict_proba(Xsc)

    print(ysc2[:10])

    # Keep only the second probability column, named after the target column
    dfysc2 = pd.DataFrame(ysc2[:, [1]], columns=[y_column])

    display(dfysc2.head())

    # Attach the record IDs (joined on the row index)
    dfysc2_fin = IDs.join(dfysc2)
    display(dfysc2_fin.head())

    print('----------------------------------------------------------------------------------------')

----------------------------------------------------------------------------------------
▼ KNN

[[ 0.8  0.2]
 [ 0.8  0.2]
 [ 1.   0. ]
 [ 0.8  0.2]
 [ 0.4  0.6]
 [ 1.   0. ]
 [ 0.8  0.2]
 [ 0.   1. ]
 [ 0.6  0.4]
 [ 0.8  0.2]]


Unnamed: 0,Loan_Status
0,0.2
1,0.2
2,0.0
3,0.2
4,0.6


Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,0.2
1,LP001022,0.2
2,LP001031,0.0
3,LP001035,0.2
4,LP001051,0.6


----------------------------------------------------------------------------------------
▼ Logistic

[[ 0.79517185  0.20482815]
 [ 0.78121549  0.21878451]
 [ 0.77127914  0.22872086]
 [ 0.65639303  0.34360697]
 [ 0.69447002  0.30552998]
 [ 0.76825323  0.23174677]
 [ 0.84392647  0.15607353]
 [ 0.0841151   0.9158849 ]
 [ 0.8321105   0.1678895 ]
 [ 0.8182611   0.1817389 ]]


Unnamed: 0,Loan_Status
0,0.204828
1,0.218785
2,0.228721
3,0.343607
4,0.30553


Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,0.204828
1,LP001022,0.218785
2,LP001031,0.228721
3,LP001035,0.343607
4,LP001051,0.30553


----------------------------------------------------------------------------------------
▼ RandomForest

[[ 0.7  0.3]
 [ 1.   0. ]
 [ 0.9  0.1]
 [ 1.   0. ]
 [ 0.8  0.2]
 [ 1.   0. ]
 [ 0.8  0.2]
 [ 0.   1. ]
 [ 0.6  0.4]
 [ 1.   0. ]]


Unnamed: 0,Loan_Status
0,0.3
1,0.0
2,0.1
3,0.0
4,0.2


Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,0.3
1,LP001022,0.0
2,LP001031,0.1
3,LP001035,0.0
4,LP001051,0.2


----------------------------------------------------------------------------------------
▼ GradientBoosting

[[ 0.82953927  0.17046073]
 [ 0.90784382  0.09215618]
 [ 0.69338867  0.30661133]
 [ 0.89866603  0.10133397]
 [ 0.70129147  0.29870853]
 [ 0.89717046  0.10282954]
 [ 0.84725379  0.15274621]
 [ 0.05169657  0.94830343]
 [ 0.83331797  0.16668203]
 [ 0.94859088  0.05140912]]


Unnamed: 0,Loan_Status
0,0.170461
1,0.092156
2,0.306611
3,0.101334
4,0.298709


Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,0.170461
1,LP001022,0.092156
2,LP001031,0.306611
3,LP001035,0.101334
4,LP001051,0.298709


----------------------------------------------------------------------------------------
▼ MLP

[[  1.00000000e+00   4.00763133e-17]
 [  1.00000000e+00   8.59885357e-11]
 [  1.00000000e+00   3.44374455e-26]
 [  1.00000000e+00   1.65492430e-16]
 [  7.09379033e-01   2.90620967e-01]
 [  9.99999992e-01   7.87747012e-09]
 [  9.99999999e-01   7.70482389e-10]
 [  1.15083498e-11   1.00000000e+00]
 [  1.59663925e-07   9.99999840e-01]
 [  1.00000000e+00   1.77334322e-15]]


Unnamed: 0,Loan_Status
0,4.007631e-17
1,8.598854e-11
2,3.4437449999999995e-26
3,1.654924e-16
4,0.290621


Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,4.007631e-17
1,LP001022,8.598854e-11
2,LP001031,3.4437449999999995e-26
3,LP001035,1.654924e-16
4,LP001051,0.290621


----------------------------------------------------------------------------------------
▼ LinearSVC

[[ 0.79589469  0.20410531]
 [ 0.78383749  0.21616251]
 [ 0.77548522  0.22451478]
 [ 0.67167513  0.32832487]
 [ 0.71852477  0.28147523]
 [ 0.77185821  0.22814179]
 [ 0.8239782   0.1760218 ]
 [ 0.12318403  0.87681597]
 [ 0.82008695  0.17991305]
 [ 0.79945259  0.20054741]]


Unnamed: 0,Loan_Status
0,0.204105
1,0.216163
2,0.224515
3,0.328325
4,0.281475


Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,0.204105
1,LP001022,0.216163
2,LP001031,0.224515
3,LP001035,0.328325
4,LP001051,0.281475


----------------------------------------------------------------------------------------
