## 여러 모델 사용하기
* IBM sample datasets
https://www.kaggle.com/blastchar/telco-customer-churn

* Demographic info: 
    * Gender, SeniorCitizen, Partner, Dependents
* Services subscribed: 
    * PhoneService, MultipleLine, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies
* Customer account info:
    * CustomerID, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Tenure

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from IPython.display import set_matplotlib_formats

set_matplotlib_formats('retina')

## 데이터 로드하기

In [11]:
df = pd.read_csv("data/telco_feature.csv")
df.shape

(7043, 50)

In [12]:
df = df.set_index("customerID")

In [13]:
df.head()

Unnamed: 0_level_0,Churn_bool,Dependents_bool,PhoneService_bool,gender_bool,Partner_bool,PaperlessBilling_bool,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,PaymentMethod_Bank transfer (automatic),...,tenure_group_0-20,tenure_group_20-40,tenure_group_40-60,tenure_group_60plus,InternetService_DSL,InternetService_Fiber optic,InternetService_No,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7590-VHVEG,False,False,False,False,True,True,1,0,0,0,...,1,0,0,0,1,0,0,1,0,0
5575-GNVDE,False,False,True,True,False,False,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
3668-QPYBK,True,False,True,True,False,True,1,0,0,0,...,1,0,0,0,1,0,0,1,0,0
7795-CFOCW,False,False,False,True,False,False,0,0,1,1,...,0,0,1,0,1,0,0,1,0,0
9237-HQITU,True,False,True,False,False,True,1,0,0,0,...,1,0,0,0,0,1,0,1,0,0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 7590-VHVEG to 3186-AJIEK
Data columns (total 49 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   Churn_bool                               7043 non-null   bool 
 1   Dependents_bool                          7043 non-null   bool 
 2   PhoneService_bool                        7043 non-null   bool 
 3   gender_bool                              7043 non-null   bool 
 4   Partner_bool                             7043 non-null   bool 
 5   PaperlessBilling_bool                    7043 non-null   bool 
 6   DeviceProtection_No                      7043 non-null   int64
 7   DeviceProtection_No internet service     7043 non-null   int64
 8   DeviceProtection_Yes                     7043 non-null   int64
 9   PaymentMethod_Bank transfer (automatic)  7043 non-null   int64
 10  PaymentMethod_Credit card (automatic)    7043 non-null   int64

## 전처리

In [16]:
df["Churn_bool"].value_counts()

False    5174
True     1869
Name: Churn_bool, dtype: int64

## 학습, 예측 데이터셋 나누기
### 학습, 예측에 사용할 컬럼

In [17]:
df.nunique()

Churn_bool                                 2
Dependents_bool                            2
PhoneService_bool                          2
gender_bool                                2
Partner_bool                               2
PaperlessBilling_bool                      2
DeviceProtection_No                        2
DeviceProtection_No internet service       2
DeviceProtection_Yes                       2
PaymentMethod_Bank transfer (automatic)    2
PaymentMethod_Credit card (automatic)      2
PaymentMethod_Electronic check             2
PaymentMethod_Mailed check                 2
StreamingMovies_No                         2
StreamingMovies_No internet service        2
StreamingMovies_Yes                        2
monthlycharges_group_0-30                  2
monthlycharges_group_30-70                 2
monthlycharges_group_70-99                 2
monthlycharges_group_99plus                2
totalcharges_group_0-2k                    2
totalcharges_group_2k-4k                   2
totalcharg

### 정답값이자 예측해야 될 값

In [18]:
# label_name 이라는 변수에 예측할 컬럼의 이름을 담습니다.
label_name = "Churn_bool"

### 문제(feature)와 답안(label)을 나누기

* X, y를 만들어 줍니다.
* X는 feature, 독립변수, 예) 시험의 문제
* y는 label, 종속변수, 예) 시험의 정답

In [20]:
# X, y를 만들어 줍니다.

X = df.drop(label_name, axis=1)
y = df[label_name]

### 학습, 예측 데이터셋 만들기
* X_train : 학습 세트 만들기, 행렬, 판다스의 데이터프레임, 2차원 리스트(배열) 구조,  예) 시험의 기출문제
* y_train : 정답 값을 만들기, 벡터, 판다스의 시리즈, 1차원 리스트(배열) 구조, 예) 기출문제의 정답
* X_test : 예측에 사용할 데이터세트를 만듭니다. 예) 실전 시험 문제
* y_test : 예측의 정답값 예) 실전 시험 문제의 정답

In [22]:
# train_test_split 으로 데이터셋을 나눕니다.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [23]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4718, 48), (2325, 48), (4718,), (2325,))

In [None]:
X_train.head(3)

In [None]:
X_test.head(3)

In [26]:
y_train.head(2)

customerID
2985-JUUBZ    False
5016-ETTFF     True
Name: Churn_bool, dtype: bool

## 머신러닝 모델로 예측하기

In [27]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

In [29]:
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(model, X_train, y_train, cv=5, verbose=2, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s finished


In [32]:
(y_train == y_pred).mean()

0.7223399745654938

## Grid Search

In [36]:
max_depth_list = np.arange(3, 15)
max_depth_list

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [39]:
max_features_list = np.arange(0.3, 1.1, step=0.1)
max_features_list

array([0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [49]:
result_list = []
for max_depth in max_depth_list:
    for max_features in max_features_list:
        
        result = []
        model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
        y_pred = cross_val_predict(model, X_train, y_train, cv=5, verbose=0, n_jobs=-1)
        accuracy = (y_train == y_pred).mean()
        result.append(max_depth)
        result.append(max_features)
        result.append(accuracy)
        result_list.append(result)

In [1]:
df_result = pd.DataFrame(result_list, columns=["max_depth", "max_feautures", "score"])
df_result.sort_values(by="score", ascending=False)

NameError: name 'pd' is not defined

In [51]:
# parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
parameters = {"max_depth": max_depth_list, "max_features" : max_features_list}

In [None]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(model, parameters)
clf.fit(X_train, y_train)

In [54]:
clf.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=0.8000000000000003,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort='deprecated', random_state=None,
                       splitter='best')

In [55]:
clf.best_score_

0.7859270808994014

In [57]:
best_model = clf.best_estimator_

In [67]:
score_list = [0.800847, 0.780720, 0.772246, 0.784730, 0.791092]
pd.Series(score_list).mean()

0.7859269999999999

In [65]:
df_cv_result = pd.DataFrame(clf.cv_results_)
df_cv_result.sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,0.012878,0.001207,0.003386,0.000477,3,0.8,"{'max_depth': 3, 'max_features': 0.80000000000...",0.800847,0.780720,0.772246,0.784730,0.791092,0.785927,0.009644,1
14,0.016708,0.002111,0.003781,0.001107,4,0.9,"{'max_depth': 4, 'max_features': 0.90000000000...",0.800847,0.780720,0.769068,0.786850,0.784730,0.784443,0.010249,2
18,0.013331,0.000246,0.003306,0.000314,5,0.5,"{'max_depth': 5, 'max_features': 0.5}",0.790254,0.792373,0.771186,0.790032,0.776246,0.784018,0.008601,3
27,0.014915,0.000434,0.003671,0.000705,6,0.6,"{'max_depth': 6, 'max_features': 0.60000000000...",0.791314,0.779661,0.773305,0.786850,0.783669,0.782960,0.006157,4
31,0.020359,0.005267,0.004316,0.001009,6,1,"{'max_depth': 6, 'max_features': 1.00000000000...",0.788136,0.779661,0.780720,0.784730,0.781548,0.782959,0.003092,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.016479,0.000303,0.003022,0.000080,14,0.6,"{'max_depth': 14, 'max_features': 0.6000000000...",0.712924,0.712924,0.725636,0.734889,0.727466,0.722767,0.008614,92
92,0.017135,0.000321,0.003093,0.000144,14,0.7,"{'max_depth': 14, 'max_features': 0.7000000000...",0.700212,0.715042,0.724576,0.737010,0.731707,0.721709,0.013030,93
94,0.019249,0.000463,0.003100,0.000191,14,0.9,"{'max_depth': 14, 'max_features': 0.9000000000...",0.727754,0.708686,0.721398,0.722163,0.721103,0.720221,0.006257,94
93,0.018057,0.000217,0.002948,0.000011,14,0.8,"{'max_depth': 14, 'max_features': 0.8000000000...",0.720339,0.708686,0.722458,0.737010,0.711559,0.720010,0.009947,95


## Random Search

In [94]:
max_depth = np.random.randint(1, 10)
max_features = np.random.uniform(0.1, 1.0)
max_depth, max_features

(6, 0.7020302643067419)

In [98]:
result_list = []
for i in range(10):
    max_depth = np.random.randint(1, 10)
    max_features = np.random.uniform(0.1, 1.0)
        
    result = []
    model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    y_pred = cross_val_predict(model, X_train, y_train, cv=5, verbose=0, n_jobs=-1)
    accuracy = (y_train == y_pred).mean()
    result.append(max_depth)
    result.append(max_features)
    result.append(accuracy)
    result_list.append(result)

In [99]:
result_list

[[6, 0.45835171925944496, 0.7787197965239508],
 [7, 0.8824991135673157, 0.7797795676133955],
 [9, 0.4589337888885663, 0.7664264518863926],
 [1, 0.28901886510250324, 0.7369648155998304],
 [2, 0.973198859065771, 0.7465027554048326],
 [2, 0.3126020001177864, 0.7399321746502755],
 [5, 0.4462623865137142, 0.7787197965239508],
 [8, 0.5858349132916518, 0.768122085629504],
 [2, 0.8297712186316641, 0.7454429843153879],
 [7, 0.2427731232688692, 0.7721492157693938]]

In [100]:
df_result = pd.DataFrame(result_list, columns=["max_depth", "max_feautures", "score"])
df_result.sort_values(by="score", ascending=False).head()

Unnamed: 0,max_depth,max_feautures,score
1,7,0.882499,0.77978
0,6,0.458352,0.77872
6,5,0.446262,0.77872
9,7,0.242773,0.772149
7,8,0.585835,0.768122


In [105]:
max_depth_list = np.random.randint(1, 10, 10)
max_features_list = np.random.uniform(0.1, 1.0, 10)
max_depth_list, max_features_list

(array([2, 1, 3, 9, 7, 3, 8, 7, 6, 4]),
 array([0.19565819, 0.89198132, 0.94247687, 0.29715637, 0.83376139,
        0.24678521, 0.94845863, 0.32424473, 0.77687148, 0.91418409]))

In [117]:
distributions = {"max_depth" : np.random.randint(3, 10, 100), 
                 "max_features" : np.random.uniform(0.5, 1.0, 100)}

In [118]:
from sklearn.model_selection import RandomizedSearchCV

clf = RandomizedSearchCV(model, distributions, random_state=42)

In [119]:
clf.fit(X_train, y_train)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=7,
                                                    max_features=0.2427731232688692,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=No

In [120]:
clf.best_score_

0.7825365763790284

In [121]:
df_cv_result = pd.DataFrame(clf.cv_results_)
df_cv_result.sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,0.013542,0.000219,0.003078,6.7e-05,0.871978,4,"{'max_features': 0.8719781319617559, 'max_dept...",0.790254,0.78072,0.769068,0.78685,0.78579,0.782537,0.007395,1
1,0.015309,0.001388,0.00384,0.000773,0.589878,6,"{'max_features': 0.5898776286086898, 'max_dept...",0.78072,0.779661,0.783898,0.78473,0.778367,0.781475,0.002449,2
2,0.011785,0.000513,0.003027,1.8e-05,0.52024,3,"{'max_features': 0.5202396298256813, 'max_dept...",0.762712,0.78072,0.757415,0.790032,0.777306,0.773637,0.011958,3
3,0.012626,0.000815,0.003089,3.6e-05,0.634668,3,"{'max_features': 0.6346675534983975, 'max_dept...",0.764831,0.78072,0.772246,0.757158,0.790032,0.772997,0.011558,4
0,0.019314,0.003591,0.003878,0.000585,0.558595,7,"{'max_features': 0.5585950487840451, 'max_dept...",0.78178,0.757415,0.754237,0.782609,0.780488,0.771306,0.012697,5
7,0.011565,5.6e-05,0.003027,8.5e-05,0.512405,3,"{'max_features': 0.512405035441384, 'max_depth...",0.797669,0.777542,0.744703,0.759279,0.777306,0.7713,0.018013,6
4,0.017124,0.000405,0.003232,0.000134,0.844308,8,"{'max_features': 0.844307985081804, 'max_depth...",0.775424,0.774364,0.76589,0.774125,0.759279,0.769816,0.006279,7
5,0.016128,0.000497,0.003191,0.000112,0.734611,8,"{'max_features': 0.7346106445121743, 'max_dept...",0.773305,0.772246,0.761653,0.770944,0.768823,0.769394,0.004149,8
8,0.01656,0.002034,0.003719,0.000673,0.568802,8,"{'max_features': 0.5688019398873644, 'max_dept...",0.771186,0.768008,0.757415,0.773065,0.773065,0.768548,0.005865,9
9,0.016779,0.001356,0.003158,0.000119,0.666573,9,"{'max_features': 0.6665734896211726, 'max_dept...",0.754237,0.764831,0.763771,0.767762,0.7614,0.7624,0.004565,10


## 학습 예측하기

In [58]:
best_model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=0.8000000000000003,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort='deprecated', random_state=None,
                       splitter='best')

In [59]:
y_test_pred = best_model.predict(X_test)

(y_test == y_test_pred).mean()

0.7974193548387096

## 모델 평가하기

In [None]:
# 피처의 중요도를 추출하기


In [None]:
# 피처의 중요도 시각화 하기


### 점수 측정하기