# ケーススタディ１: ２値分類

UCIで提供されているデータセット[Bank Marketing](https://archive.ics.uci.edu/dataset/222/bank+marketing)を使う。
データセットをロードするためのPythonモジュールは[ここ](https://github.com/uci-ml-repo/ucimlrepo)で公開されている。
pipで簡単にインストールできる。

機械学習を使って、営業（定期預金の電話販促）の成功・失敗を予測する例である。

In [518]:
# Install the ucimlrepo package from the notebook; `| tail -1` keeps only the
# last line of pip's output.
!pip3 install ucimlrepo --break-system-packages | tail -1



In [519]:
from ucimlrepo import fetch_ucirepo

# Fetch the UCI "Bank Marketing" dataset (repository id 222).
bank_marketing = fetch_ucirepo(id=222)

# Features and targets (pandas DataFrames).
X = bank_marketing.data.features
y = bank_marketing.data.targets

# Print the dataset metadata first, then the per-variable information.
print(bank_marketing.metadata, bank_marketing.variables, sep='\n')


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'title': 'A data-driven approach to predict the success of bank telemarketing'

In [520]:
# Preview the first rows of the raw feature table.
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,


In [521]:
# Rename the feature columns to Japanese labels.
#
# An explicit old-name -> new-name mapping is used instead of assigning a
# positional list to `X.columns`: a change in the upstream column order can
# then no longer silently mislabel features, and the DataFrame held by
# `bank_marketing` is left untouched because `rename` returns a new frame.
# Names that are not present are ignored, so re-running this cell is a no-op.
X = X.rename(columns={
    'age': '年齢',
    'job': '職業',
    'marital': '婚姻',
    'education': '学歴',
    'default': '債務不履行',
    'balance': '平均残高',
    'housing': '住宅ローン',
    'loan': '個人ローン',
    'contact': '連絡手段',
    'day_of_week': '最終通話日',
    'month': '最終通話月',
    'duration': '最終通話秒数',
    'campaign': '通話回数_販促中',
    'pdays': '前回販促後_経過日数',
    'previous': '通話回数_販促前',
    'poutcome': '前回販促結果',
})

In [522]:
# Confirm that the column labels were renamed.
X.head()

Unnamed: 0,年齢,職業,婚姻,学歴,債務不履行,平均残高,住宅ローン,個人ローン,連絡手段,最終通話日,最終通話月,最終通話秒数,通話回数_販促中,前回販促後_経過日数,通話回数_販促前,前回販促結果
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,


In [523]:
# Number of rows and columns in the feature table.
X.shape

(45211, 16)

## 前処理

### 欠損値の処理

In [524]:
# Count missing values per column.
X.isnull().sum()

年齢                0
職業              288
婚姻                0
学歴             1857
債務不履行             0
平均残高              0
住宅ローン             0
個人ローン             0
連絡手段          13020
最終通話日             0
最終通話月             0
最終通話秒数            0
通話回数_販促中          0
前回販促後_経過日数        0
通話回数_販促前          0
前回販促結果        36959
dtype: int64

欠損値があるのはすべてカテゴリカル値の列（職業・学歴・連絡手段・前回販促結果）なので、欠損値は'unknown'というラベル値で埋める。

In [525]:
# Replace missing values in the four categorical columns with the label 'unknown'.
X = X.fillna(dict.fromkeys(['職業', '学歴', '連絡手段', '前回販促結果'], 'unknown'))

In [526]:
# Verify that no missing values remain after filling.
X.isnull().sum()

年齢            0
職業            0
婚姻            0
学歴            0
債務不履行         0
平均残高          0
住宅ローン         0
個人ローン         0
連絡手段          0
最終通話日         0
最終通話月         0
最終通話秒数        0
通話回数_販促中      0
前回販促後_経過日数    0
通話回数_販促前      0
前回販促結果        0
dtype: int64

### カテゴリカル値のOne Hotエンコーディング

In [527]:
# Category frequencies of the job column.
X['職業'].value_counts()

職業
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: count, dtype: int64

In [528]:
# Category frequencies of the marital-status column.
X['婚姻'].value_counts()

婚姻
married     27214
single      12790
divorced     5207
Name: count, dtype: int64

In [529]:
# Category frequencies of the education column.
X['学歴'].value_counts()

学歴
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [530]:
# Category frequencies of the contact-method column.
X['連絡手段'].value_counts()

連絡手段
cellular     29285
unknown      13020
telephone     2906
Name: count, dtype: int64

In [531]:
# Category frequencies of the previous-campaign-outcome column.
X['前回販促結果'].value_counts()

前回販促結果
unknown    36959
failure     4901
other       1840
success     1511
Name: count, dtype: int64

In [532]:
import pandas as pd

# One-hot encoding of a categorical column
def enc(df, column):
    """Return a new frame in which *column* is replaced by one-hot columns.

    The indicator columns are named ``<column>_<category>`` and are appended
    after the remaining columns of *df*; *df* itself is not modified.
    """
    # get_dummies would produce bool columns by default; dtype=int requests 0/1.
    indicators = pd.get_dummies(df[column], prefix=column, dtype=int)
    remaining = df.drop(columns=[column])
    return pd.concat([remaining, indicators], axis=1)

In [533]:
# One-hot encode every nominal categorical column, working on a copy of X.
X1 = (
    X.copy()
    .pipe(enc, '職業')
    .pipe(enc, '婚姻')
    .pipe(enc, '学歴')
    .pipe(enc, '連絡手段')
    .pipe(enc, '前回販促結果')
)

#### 今回の販促結果の確認

In [534]:
# Check the target for missing values.
y.isnull().sum()

y    0
dtype: int64

In [535]:
# Class balance of the target.
y.value_counts()

y  
no     39922
yes     5289
Name: count, dtype: int64

In [536]:
# Inspect the frame after one-hot encoding.
X1.head()

Unnamed: 0,年齢,債務不履行,平均残高,住宅ローン,個人ローン,最終通話日,最終通話月,最終通話秒数,通話回数_販促中,前回販促後_経過日数,...,学歴_secondary,学歴_tertiary,学歴_unknown,連絡手段_cellular,連絡手段_telephone,連絡手段_unknown,前回販促結果_failure,前回販促結果_other,前回販促結果_success,前回販促結果_unknown
0,58,no,2143,yes,no,5,may,261,1,-1,...,0,1,0,0,0,1,0,0,0,1
1,44,no,29,yes,no,5,may,151,1,-1,...,1,0,0,0,0,1,0,0,0,1
2,33,no,2,yes,yes,5,may,76,1,-1,...,1,0,0,0,0,1,0,0,0,1
3,47,no,1506,yes,no,5,may,92,1,-1,...,0,0,1,0,0,1,0,0,0,1
4,33,no,1,no,no,5,may,198,1,-1,...,0,0,1,0,0,1,0,0,0,1


### バイナリ値の処理

In [537]:
# Encode the three yes/no columns as 1/0.
X1[['債務不履行', '住宅ローン', '個人ローン']] = X1[['債務不履行', '住宅ローン', '個人ローン']].apply(
    lambda flags: flags.map({'yes': 1, 'no': 0})
)

In [538]:
# Inspect the frame after mapping the yes/no columns.
X1.head()

Unnamed: 0,年齢,債務不履行,平均残高,住宅ローン,個人ローン,最終通話日,最終通話月,最終通話秒数,通話回数_販促中,前回販促後_経過日数,...,学歴_secondary,学歴_tertiary,学歴_unknown,連絡手段_cellular,連絡手段_telephone,連絡手段_unknown,前回販促結果_failure,前回販促結果_other,前回販促結果_success,前回販促結果_unknown
0,58,0,2143,1,0,5,may,261,1,-1,...,0,1,0,0,0,1,0,0,0,1
1,44,0,29,1,0,5,may,151,1,-1,...,1,0,0,0,0,1,0,0,0,1
2,33,0,2,1,1,5,may,76,1,-1,...,1,0,0,0,0,1,0,0,0,1
3,47,0,1506,1,0,5,may,92,1,-1,...,0,0,1,0,0,1,0,0,0,1
4,33,0,1,0,0,5,may,198,1,-1,...,0,0,1,0,0,1,0,0,0,1


In [539]:
# Build a 1/0-encoded copy of the target ('yes' -> 1, 'no' -> 0); y is left as is.
y1 = y.assign(y=y['y'].map({'yes': 1, 'no': 0}))

### 月名の処理

月名は他のカテゴリカル値と違って順序があるので、単純にOne Hotエンコーディングすると順序の情報が失われる。
そこで、月名は1〜12の数値に変換する。

In [540]:
# Month labels present in the data and their frequencies.
X1['最終通話月'].value_counts()

最終通話月
may    13766
jul     6895
aug     6247
jun     5341
nov     3970
apr     2932
feb     2649
jan     1403
oct      738
sep      579
mar      477
dec      214
Name: count, dtype: int64

In [541]:
# Three-letter lowercase month name -> month number (1-12).
month_dict = dict(jan=1, feb=2, mar=3, apr=4, may=5, jun=6,
                  jul=7, aug=8, sep=9, oct=10, nov=11, dec=12)

def enc_month(df, column):
    """Return a copy of *df* with month names in *column* replaced by 1-12.

    Unlike a plain ``Series.map``, labels that are not keys of ``month_dict``
    are reported instead of being silently turned into NaN (this happens, for
    example, when the function is applied a second time to already-encoded
    data). Missing values in *column* are passed through unchanged.

    Args:
        df: Frame containing *column*; it is not modified.
        column: Name of the column holding month names such as ``'may'``.

    Returns:
        A new DataFrame with *column* converted to month numbers.

    Raises:
        ValueError: If *column* contains a non-missing value that is not a
            key of ``month_dict``.
    """
    original = df[column]
    mapped = original.map(month_dict)

    # Values that were present before mapping but are NaN afterwards had no
    # entry in month_dict.
    unmapped = mapped.isna() & original.notna()
    if unmapped.any():
        unknown = sorted({str(value) for value in original[unmapped].unique()})
        raise ValueError(
            f'unknown month label(s) in column {column!r}: {unknown}'
        )

    # Work on a copy so the caller's frame is untouched, as enc() does.
    result = df.copy()
    result[column] = mapped
    return result

In [542]:
# Convert the last-contact month column from month names to numbers.
X1 = enc_month(X1, '最終通話月')

In [543]:
# Verify that the month column now holds numbers.
X1['最終通話月'].value_counts()

最終通話月
5     13766
7      6895
8      6247
6      5341
11     3970
4      2932
2      2649
1      1403
10      738
9       579
3       477
12      214
Name: count, dtype: int64

In [544]:
# Inspect the frame after month encoding.
X1.head()

Unnamed: 0,年齢,債務不履行,平均残高,住宅ローン,個人ローン,最終通話日,最終通話月,最終通話秒数,通話回数_販促中,前回販促後_経過日数,...,学歴_secondary,学歴_tertiary,学歴_unknown,連絡手段_cellular,連絡手段_telephone,連絡手段_unknown,前回販促結果_failure,前回販促結果_other,前回販促結果_success,前回販促結果_unknown
0,58,0,2143,1,0,5,5,261,1,-1,...,0,1,0,0,0,1,0,0,0,1
1,44,0,29,1,0,5,5,151,1,-1,...,1,0,0,0,0,1,0,0,0,1
2,33,0,2,1,1,5,5,76,1,-1,...,1,0,0,0,0,1,0,0,0,1
3,47,0,1506,1,0,5,5,92,1,-1,...,0,0,1,0,0,1,0,0,0,1
4,33,0,1,0,0,5,5,198,1,-1,...,0,0,1,0,0,1,0,0,0,1


In [595]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Standardize the balance and call-duration columns.
#
# The scaler is fitted on the training rows only. Fitting it on the whole
# table (as fit_transform on X1 would do) lets the mean/std of the held-out
# test rows leak into preprocessing and makes the evaluation optimistic.
#
# NOTE: test_size and random_state must stay in sync with the train/test split
# performed later in this notebook. For an unstratified split the selected rows
# depend only on the number of rows and these two parameters, so the positions
# computed here are the rows that end up in the training set.
scaled_columns = ['平均残高', '最終通話秒数']
train_positions = train_test_split(
    list(range(len(X1))), test_size=0.4, random_state=123
)[0]

stdsc = StandardScaler()
stdsc.fit(X1.iloc[train_positions][scaled_columns])

# Apply the training-set statistics to every row (train and test alike).
X1[scaled_columns] = stdsc.transform(X1[scaled_columns])

In [596]:
## Data split: 60% for training, 40% held out for testing.
# NOTE(review): the target is imbalanced (39922 'no' vs 5289 'yes'). A
# stratified variant of this split (stratify=y) was previously tried here and
# is currently disabled; confirm that this is intentional.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X1,
    y1.to_numpy().flatten(),
    test_size=0.4,
    random_state=123,
)

In [597]:
# Show the shapes of the train/test features and labels.
display(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(27126, 37)

(27126,)

(18085, 37)

(18085,)

## アルゴリズムの選択

In [616]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Candidate classifiers, all created with the same random seed.
# Logistic regression converges slowly here, so its iteration limit is raised.
algorithm1 = LogisticRegression(max_iter=2000, random_state=123)
algorithm2 = DecisionTreeClassifier(random_state=123)
algorithm3 = RandomForestClassifier(random_state=123)
algorithm4 = XGBClassifier(random_state=123)

algorithms = [algorithm1, algorithm2, algorithm3, algorithm4]

In [617]:
from sklearn.model_selection import StratifiedKFold
# 3-fold stratified cross-validation splitter, shared by all candidate models.
stratifiedkfold = StratifiedKFold(n_splits=3)

In [618]:
from sklearn.model_selection import cross_val_score
# Score every candidate with cross-validated ROC AUC on the training split
# and print the mean score, the per-fold scores and the estimator class name.
for algorithm in algorithms:
    name = type(algorithm).__name__
    scores = cross_val_score(
        estimator=algorithm,
        X=x_train,
        y=y_train,
        scoring='roc_auc',
        cv=stratifiedkfold,
    )
    score = scores.mean()
    print(f'平均スコア: {score:.4f} / 個別スコア: {scores} / {name}')

平均スコア: 0.8930 / 個別スコア: [0.89191588 0.89311627 0.89399766] / LogisticRegression
平均スコア: 0.6958 / 個別スコア: [0.70314027 0.68899261 0.69523436] / DecisionTreeClassifier
平均スコア: 0.9229 / 個別スコア: [0.92182038 0.92161282 0.92533232] / RandomForestClassifier
平均スコア: 0.9261 / 個別スコア: [0.92685575 0.92713068 0.92421758] / XGBClassifier


XGBoost（XGBClassifier）が平均スコア0.9261で最良の結果になったので、以降はXGBoostを使う。

## 学習・予測・評価

In [619]:
# Training: fit a fresh XGBoost classifier on the training split,
# then predict labels for the held-out test split.
algorithm = XGBClassifier(random_state=123).fit(x_train, y_train)
y_pred = algorithm.predict(x_test)

In [627]:
# Evaluation: confusion matrix (rows = true class, columns = predicted class).
from sklearn.metrics import confusion_matrix

ans = confusion_matrix(y_true=y_test, y_pred=y_pred)
display(ans)

array([[15377,   604],
       [ 1112,   992]])

In [634]:
from sklearn.metrics import precision_recall_fscore_support

# Precision, recall and F-score of the positive class; the fourth return
# value (support) is not needed and is discarded.
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_pred,
    average='binary',
)
print(f'適合率: {precision:.4f} / 再現率: {recall:.4f} / F値: {fscore:.4f}')

適合率: 0.6216 / 再現率: 0.4715 / F値: 0.5362


## チューニング

## 重要度分析