モデル用データの読み込み

In [1]:
# import sample data: Loan screening data for classification 
from IPython.core.display import display
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('./data/final_hr_analysis_train.csv',header=0)

X  = df.iloc[:,2:]           # ３列目以降を特徴量X
ID = df.iloc[:,[0]]          # 最初列をID情報としてセット
y  = df.iloc[:,1]          # ２列目を正解データ

# check the shape
print('--------------------------------------')
print('Raw shape: (%i,%i)' %df.shape)
print('X shape: (%i,%i)' %X.shape)

print('---------------------------------------')
print(y.value_counts())
print('---------------------------------------')
print(ID.join(X).join(y).dtypes)
display(ID.join(X).join(y).head())

# 表示列数のオプション変更
pd.options.display.max_columns = 50

print(X.isnull().sum())

--------------------------------------
Raw shape: (10499,11)
X shape: (10499,9)
---------------------------------------
0    7966
1    2533
Name: left, dtype: int64
---------------------------------------
index                      int64
satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
promotion_last_5years      int64
sales                     object
salary                    object
left                       int64
dtype: object


Unnamed: 0,index,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary,left
0,10438,0.53,0.52,2,135,4,0,0,technical,medium,0
1,9236,0.77,0.53,5,256,3,0,0,accounting,medium,0
2,818,0.89,0.79,3,149,2,0,0,support,medium,1
3,11503,0.64,0.63,3,156,6,1,0,support,low,0
4,11721,0.98,0.74,4,151,3,0,0,sales,medium,0


satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
sales                    0
salary                   0
dtype: int64


モデル用データの前処理：カテゴリ変数の数量化(欠損値なしのため、欠損補完は省略)

In [2]:
ohe_columns = ['sales',
               'salary']

X_ohe = pd.get_dummies(X,
                       dummy_na=True,
                       columns=ohe_columns)

print(X_ohe.shape)
display(X_ohe.head())

(10499, 22)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,sales_nan,salary_high,salary_low,salary_medium,salary_nan
0,0.53,0.52,2,135,4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.77,0.53,5,256,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
2,0.89,0.79,3,149,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,0.64,0.63,3,156,6,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
4,0.98,0.74,4,151,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0


モデル用データの前処理：次元圧縮（特徴選択）

In [3]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

selector = RFE(RandomForestClassifier(n_estimators=100,random_state=1),
               n_features_to_select=15,
               step=.05)

selector.fit(X_ohe,y)

X_fin = pd.DataFrame(selector.transform(X_ohe),
                     columns=X_ohe.columns.values[selector.support_])

print('X_fin shape:(%i,%i)' % X_fin.shape)
display(X_fin.head())

X_fin shape:(10499,15)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,sales_IT,sales_RandD,sales_accounting,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.53,0.52,2.0,135.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.77,0.53,5.0,256.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.89,0.79,3.0,149.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.64,0.63,3.0,156.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.98,0.74,4.0,151.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


アンダーサンプリング・オーバーサンプリング

In [4]:

'''from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler,SMOTE

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,f1_score

rus = RandomUnderSampler(random_state=0)
ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)


# holdout
X_train,X_valid,y_train,y_valid= train_test_split(X_fin,
                                                y,
                                                test_size=0.4,
                                                random_state=0)
# resampling
X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
X_train_over, y_train_over = ros.fit_sample(X_train, y_train)
X_train_smt, y_train_smt = smt.fit_sample(X_train, y_train)

# modeling
pipe_gb = Pipeline([('scl',StandardScaler()),
                    ('est',GradientBoostingClassifier(random_state=1))])
# evaluation
###############################################
pipe_gb.fit(X_train,
            y_train)
print('Original Train:', 
      f1_score(y_train,
               pipe_gb.predict(X_train)))
print('Original Test:', 
      f1_score(y_valid,
               pipe_gb.predict(X_valid)))
###############################################
pipe_gb.fit(X_train_under,
            y_train_under)
print('Undersampling Train:',
      f1_score(y_train_under,
               pipe_gb.predict(X_train_under)))
print('Undersampling Test:', 
      f1_score(y_valid,
               pipe_gb.predict(X_valid)))
###############################################
pipe_gb.fit(X_train_over,
            y_train_over)
print('Oversampling Train:',
      f1_score(y_train_over,
               pipe_gb.predict(X_train_over)))
print('Oversampling Test:',
      f1_score(y_valid,
               pipe_gb.predict(X_valid)))
###############################################
pipe_gb.fit(X_train_smt,
            y_train_smt)
print('SMOTE Train:',
      f1_score(y_train_smt,
               pipe_gb.predict(X_train_smt)))
print('SMOTE Test:',
      f1_score(y_valid,
               pipe_gb.predict(X_valid)))'''


"from collections import Counter\nfrom imblearn.under_sampling import RandomUnderSampler\nfrom imblearn.over_sampling import RandomOverSampler,SMOTE\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.metrics import accuracy_score,f1_score\n\nrus = RandomUnderSampler(random_state=0)\nros = RandomOverSampler(random_state=0)\nsmt = SMOTE(random_state=0)\n\n\n# holdout\nX_train,X_valid,y_train,y_valid= train_test_split(X_fin,\n                                                y,\n                                                test_size=0.4,\n                                                random_state=0)\n# resampling\nX_train_under, y_train_under = rus.fit_sample(X_train, y_train)\nX_train_over, y_train_over = ros.fit_sample(X_train, y_train)\nX_train_smt, y_train_smt = smt.fit_sample(X_train, y_train)\n\n# modeling\npipe_gb = 

スコア用データの読み込み

In [5]:
df_s = pd.read_csv('./data/final_hr_analysis_test.csv',header=0)

X_s = df_s.iloc[:,2:]           # ３列目以降を特徴量X
ID_s = df_s.iloc[:,[0]]          # 最初列をID情報としてセット

# check the shape
print('--------------------------------------')
print('Raw shape: (%i,%i)' %df_s.shape)
print('X shape: (%i,%i)' %X_s.shape)

print('---------------------------------------')
print(ID.join(X).dtypes)
display(ID.join(X).head())
# 表示列数のオプション変更
pd.options.display.max_columns = 50

--------------------------------------
Raw shape: (4500,11)
X shape: (4500,9)
---------------------------------------
index                      int64
satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
promotion_last_5years      int64
sales                     object
salary                    object
dtype: object


Unnamed: 0,index,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary
0,10438,0.53,0.52,2,135,4,0,0,technical,medium
1,9236,0.77,0.53,5,256,3,0,0,accounting,medium
2,818,0.89,0.79,3,149,2,0,0,support,medium
3,11503,0.64,0.63,3,156,6,1,0,support,low
4,11721,0.98,0.74,4,151,3,0,0,sales,medium


スコア用データの前処理：カテゴリ変数の数量化(欠損値なしのため、欠損補完は省略)

In [6]:
X_ohe_s = pd.get_dummies(X_s,
                       dummy_na=True,
                       columns=ohe_columns)

print(X_ohe_s.shape)
display(X_ohe_s.head())

(4500, 22)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,sales_nan,salary_high,salary_low,salary_medium,salary_nan
0,0.44,0.57,2,141,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
1,0.55,0.96,3,194,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
2,0.72,0.67,5,210,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
3,0.96,0.75,4,177,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0.96,0.54,3,198,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


スコア用データの前処理：one-hotエンコーディング後のデータ整合チェック

In [7]:
# Pythonの集合型変数を利用
cols_model = set(X_ohe.columns.values)
cols_score = set(X_ohe_s.columns.values)

# モデルにはあったスコアにはないデータ項目
diff1 = cols_model - cols_score
print('Modelのみ:%s' % diff1)

# スコアにはあるがモデルになかったデータ項目
diff2 = cols_score - cols_model
print('Scoreのみ:%s' % diff2)

Modelのみ:set()
Scoreのみ:set()


スコア用データの前処理：次元圧縮（特徴選択）

In [8]:
X_fin_s = X_ohe_s.loc[:, X_ohe.columns.values[selector.support_]]
print(X_fin_s.shape)
X_fin_s.head(3)

(4500, 15)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,sales_IT,sales_RandD,sales_accounting,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.44,0.57,2,141,3,0,0,0,0,0,0,0,0,0,1
1,0.55,0.96,3,194,3,0,0,0,0,0,0,0,0,0,1
2,0.72,0.67,5,210,2,0,0,0,0,0,0,0,0,0,1


In [9]:
import pandas as pd
import numpy as np
import scipy as scp
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score



X_train, X_test, y_train, y_test = train_test_split(X_fin, y, test_size=0.20, random_state=1)

pipelines={
    'knn':Pipeline([('scl', StandardScaler()), ('est', KNeighborsClassifier())]),
    'logistic':Pipeline([('scl', StandardScaler()), ('est', LogisticRegression(random_state=1))]),
    'rsvc':Pipeline([('scl', StandardScaler()), ('est', SVC(random_state=1))]),
    'lsvc':Pipeline([('scl',StandardScaler()),('est',LinearSVC(C=1.0, class_weight='balanced', random_state=1))]),
    'tree':Pipeline([('scl', StandardScaler()), ('est', DecisionTreeClassifier(random_state=1))]),
    'rf':Pipeline([('scl', StandardScaler()), ('est', RandomForestClassifier(random_state=1))]),
    'gb':Pipeline([('scl', StandardScaler()), ('est', GradientBoostingClassifier(random_state=1))]),
    'lgb':Pipeline([('scl', StandardScaler()), ('est', LGBMClassifier(random_state=1))]),
    'mlp':Pipeline([('scl', StandardScaler()), ('est', MLPClassifier(hidden_layer_sizes=(3,3),max_iter=1000))])
}


In [10]:
param_grid = {'knn':{'est__n_neighbors':[1,2,3,4,5]},
              'logistic':{'est__C':np.arange(2,4,1)},
              'rsvc':{'est__C':np.arange(0.1,1,0.05)},
              'lsvc':{'est__C':np.arange(4,7,1)},
              'tree':{'est__max_depth': [7,8,9]},
              'rf':{'est__max_depth': [2,3,6,None],
                    'est__n_estimators':[150,160,200],
                    'est__max_features': [3,4,5]},
              'gb':{'est__learning_rate':[0.01,0.1],
                    'est__n_estimators':[100,400,500],
                    'est__max_depth':[5,6,7]},
              'lgb':{'est__learning_rate': [0.15,0.20,0.25],
                     'est__num_iterations':[100,200,300],
                     'est__n_estimators': [40],
                     'est__num_leaves': list(range(24, 33, 2))},
              'mlp':{'est__learning_rate': ['constant'],
                     'est__activation': ['tanh']}}


GridSearchにおける評価指標選択（性能評価指標に連動）

In [11]:
import ipywidgets as widgets

def get_answer(x):
    return x

evaluation_choice_grid = get_answer(widgets.RadioButtons(options=['accuracy', 'f1', 'roc_auc']))
display(evaluation_choice_grid)

RadioButtons(options=('accuracy', 'f1', 'roc_auc'), value='accuracy')

In [12]:
print('選択した評価指標：',evaluation_choice_grid.value)

選択した評価指標： accuracy


In [13]:
from sklearn.model_selection import GridSearchCV
scores={}
pipelines_gs = {}

for pipe_name, pipeline in pipelines.items():
    print(pipe_name + ' Start')
    pipeline_gs = GridSearchCV(estimator=pipeline,
                  param_grid=param_grid[pipe_name],
                  scoring=evaluation_choice_grid.value,
                  cv=3,
                  return_train_score=False)
    

    
    pipeline_gs.fit(X_train, y_train)
    scores[(pipe_name, 'train_accuracy')] = accuracy_score(y_train, pipeline_gs.predict(X_train))
    scores[(pipe_name, 'test_accuracy')] = accuracy_score(y_test, pipeline_gs.predict(X_test))
    scores[(pipe_name, 'train_f1')] = f1_score(y_train, pipeline_gs.predict(X_train))
    scores[(pipe_name, 'test_f1')] = f1_score(y_test, pipeline_gs.predict(X_test))
    scores[(pipe_name, 'train_auc')] = roc_auc_score(y_train, pipeline_gs.predict(X_train))
    scores[(pipe_name, 'test_auc')] = roc_auc_score(y_test, pipeline_gs.predict(X_test))
    
    pipelines_gs[pipe_name] = pipeline_gs.best_estimator_
    
    print(pipe_name + ' End')
    print(pipeline_gs.best_params_)
    print('-'*80)

knn Start
knn End
{'est__n_neighbors': 2}
--------------------------------------------------------------------------------
logistic Start
logistic End
{'est__C': 3}
--------------------------------------------------------------------------------
rsvc Start


KeyboardInterrupt: 

性能評価指標を選択(GridSearchの評価指標と連動)

In [None]:
evaluation_choice = {'accuracy':'test_accuracy', 'f1':'test_f1', 'roc_auc':'test_auc'}


モデルのランキング

In [None]:
#選択した評価指標でソート
scores_ranking = pd.Series(scores).unstack().sort_values(evaluation_choice[evaluation_choice_grid.value], ascending=False)
display(scores_ranking)

ベストモデル・スコア、予測確率の表示

In [None]:
best_model = scores_ranking.index[0]
print('-'*80)
print('BestModel:',best_model)
print('-'*80)

clf = pipelines_gs[best_model]
print(clf)
prediction = clf.predict_proba(X_fin_s)
probability = pd.DataFrame(prediction)
print(probability)

学習済モデルの保存

In [None]:
import pickle

s = pickle.dumps(clf)
clf2 = pickle.loads(s)

#prediction2 = clf2.predict(X_fin_s)

提出用ファイルへの書き出し

In [None]:
df_s['left'] = probability[1]
df_s_submission = df_s.iloc[:,:2]

df_s_submission.to_csv('./data/aijc3207.csv',index=False)

print('完了')