# 작업 2유형
- https://www.datamanim.com/dataset/03_dataq/typetwo.html#id3

# 1. 서비스 이탈예측 데이터(Classification)

> Attention
- 데이터 설명 : 고객의 신상정보 데이터를 통한 회사 서비스 이탈 예측 (종속변수 : Exited)
- 데이터 출처 : https://www.kaggle.com/shubh0799/churn-modelling 에서 변형
- X_train : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv
- y_train : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv
- X_test : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv
- y_test(평가용) : https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_test.csv

#### 0. 시험 환경 세팅

In [1]:
import pandas as pd

X_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv")
X_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv")
y_test = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_test.csv")

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6499, 12), (6499, 2), (3501, 12), (3501, 2))

#### 1. 라이브러리 및 데이터 호출

In [2]:
# 필요한 라이브러리 import

In [3]:
import numpy as np
import pandas as pd

In [4]:
import warnings
warnings.filterwarnings("ignore")

- CustomerId 컬럼 제거
- y_train의 Exited 컬럼을 label data 로 지정 (그냥 정답 데이터로 하라는거)

In [5]:
X_train.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,15799217,Zetticci,791,Germany,Female,35,7,52436.2,1,1,0,161051.75
1,15748986,Bischof,705,Germany,Male,42,8,166685.92,2,1,1,55313.51
2,15722004,Hsiung,543,France,Female,31,4,138317.94,1,0,0,61843.73
3,15780966,Pritchard,709,France,Female,32,2,0.0,2,0,0,109681.29
4,15636731,Ts'ai,714,Germany,Female,36,1,101609.01,2,1,1,447.73


In [6]:
X_train.drop(columns="CustomerId", inplace = True)
X_test.drop(columns="CustomerId", inplace = True)

In [7]:
y_train = y_train['Exited']
y_test = y_test['Exited']

In [8]:
y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: Exited, dtype: int64

#### 2. EDA
- X_train 의 기초 통계량, null 값 확인

In [9]:
# X_train 의 기초 통계량, null 값 확인

In [10]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Surname          6499 non-null   object 
 1   CreditScore      6499 non-null   int64  
 2   Geography        6499 non-null   object 
 3   Gender           6499 non-null   object 
 4   Age              6499 non-null   int64  
 5   Tenure           6499 non-null   int64  
 6   Balance          6499 non-null   float64
 7   NumOfProducts    6499 non-null   int64  
 8   HasCrCard        6499 non-null   int64  
 9   IsActiveMember   6499 non-null   int64  
 10  EstimatedSalary  6499 non-null   float64
dtypes: float64(2), int64(6), object(3)
memory usage: 558.6+ KB
None


In [11]:
X_train.isnull().sum()

Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [12]:
object_columns = X_train.dtypes[X_train.dtypes == X_train.dtypes[0]] # X_train.dtypes[0]이 object 형태이기 때문에

In [13]:
object_columns.index

Index(['Surname', 'Geography', 'Gender'], dtype='object')

In [14]:
for i in object_columns.index:
    print(i, X_train[i].nunique())

Surname 2289
Geography 3
Gender 4


In [15]:
del X_train['Surname']
del X_test['Surname']

#### 3. Preprocessing
- Gender : 띄어쓰기 제거 및 대문자 변경
- 수치형 컬럼 : 정규화(MinMaxScaler)
- 범주형 컬럼 : 인코딩(LabelEncoder)

In [16]:
# 띄어쓰기 제거
# 대문자 로 전부 변경

In [17]:
X_train['Gender'] = X_train['Gender'].str.upper().str.replace(' ','')
X_test['Gender'] = X_test['Gender'].str.upper().str.replace(' ','')

In [18]:
X_train['Geography'] = X_train['Geography'].str.upper().str.replace(' ','')
X_test['Geography'] = X_test['Geography'].str.upper().str.replace(' ','')

In [19]:
# 수치형 컬럼 정규화

In [20]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [21]:
int_col = X_train.select_dtypes(include= ['int', 'float']).columns
int_col

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary'],
      dtype='object')

In [22]:
# 방법1
ss = StandardScaler()

for i in int_col:
    X_train[i] = ss.fit_transform(X_train[[i]])
    X_test[i] = ss.fit_transform(X_test[[i]])

In [23]:
# 방법2
# scaler = MinMaxScaler()
# X_train[int_col] = scaler.fit_transform(X_train[int_col])
# X_test[int_col] = scaler.fit_transform(X_test[int_col])

In [24]:
# 범주형 컬럼 인코딩

In [25]:
from sklearn.preprocessing import LabelEncoder

In [26]:
obj_col = X_train.select_dtypes(include= 'object').columns
pre = LabelEncoder()
for i in obj_col:
    X_train[i] = pre.fit_transform(X_train[[i]])
    X_test[i] = pre.fit_transform(X_test[[i]])

In [27]:
X_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1.455346,1,0,-0.376792,0.677301,-0.391014,-0.897814,0.640843,-1.0292,1.047721
1,0.565183,1,1,0.289748,1.023136,1.439829,0.829508,0.640843,0.971629,-0.777233
2,-1.111636,0,0,-0.757672,-0.360202,0.985234,-0.897814,-1.560445,-1.0292,-0.664527
3,0.606586,0,0,-0.662452,-1.051871,-1.231301,0.829508,-1.560445,-1.0292,0.16111
4,0.658339,1,0,-0.281572,-1.397706,0.396976,0.829508,0.640843,0.971629,-1.724171


In [28]:
X_test.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1.563603,0,0,2.021461,-0.677503,0.273389,-0.937646,0.655856,-1.033405,-1.060819
1,-0.504296,0,0,1.639037,-0.677503,0.642267,2.474724,0.655856,-1.033405,-1.683428
2,1.553263,0,0,-0.655506,-0.331739,-0.010651,-0.937646,0.655856,0.967674,-1.088603
3,0.436598,2,0,-0.464294,1.742841,0.50857,0.768539,-1.524725,-1.033405,0.97044
4,-0.607691,0,0,2.212673,0.014024,-1.215823,-0.937646,0.655856,0.967674,0.02341


#### 4. Modeling
- 분류MODEL 을 이용해 서비스 이탈예측
- 각 MODEL 의 기본값을 이용해 예측해 보세요
    - logistic regressor
    - KNN
    - SVC
    - RandomForestClassifier
    - XGBClassifier
- accuracy_score, roc_auc_score 를 이용해 평가
    - accuracy : 0.85
    - auc : 0.85
- gridsearch를 활용한 best model 과 best_parameter, RandomForestClassifier의 feature_importances_ 는 어떤값을 나타 내는가?

In [29]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, roc_auc_score

In [30]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
xgb = XGBClassifier(eval_metric='mlogloss', use_label_encoder =False) # XGB 에서 발생하는 warnings 꺼준다.

models = [rf,gb,xgb]

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
params = {'n_estimators': [100,200,300,400,500], 'max_depth' : [1,2,3,4,5]}

best_models = [] # 안날라가게 저장

for model in models:
    gs = GridSearchCV(model, param_grid= params, cv = 5, scoring = 'roc_auc', n_jobs=4)
    gs.fit(X_train, y_train)

    print(f'model : {model}')
    print(f'params : {gs.best_params_}')
    print(f'score : {gs.best_score_}')

    best_models.append(gs.best_estimator_)

model : RandomForestClassifier()
params : {'max_depth': 5, 'n_estimators': 200}
score : 0.8467869258623975
model : GradientBoostingClassifier()
params : {'max_depth': 3, 'n_estimators': 100}
score : 0.8621624599840351


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Ind

model : XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='mlogloss', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, use_label_encoder=False,
              validate_parameters=None, verbosity=None)
params : {'max_depth': 2, 'n_estimators': 100}
score : 0.8569066574964852


In [33]:
len(best_models)

3

In [34]:
for i in best_models:

    model = i

    test_predict = model.predict(X_test)
    test_predict_proba = model.predict_proba(X_test)

    print(f'accuracy_score: {accuracy_score(y_test,test_predict)}')
    print(f'roc_auc_score: {roc_auc_score(y_test, test_predict_proba[:,1])}')
    print("---------------------------------")

accuracy_score: 0.8557554984290203
roc_auc_score: 0.8459275476345226
---------------------------------
accuracy_score: 0.8631819480148529
roc_auc_score: 0.8591340668583651
---------------------------------
accuracy_score: 0.8628963153384748
roc_auc_score: 0.8584992081873629
---------------------------------


In [179]:
rf_model = best_models[0]

In [180]:
rf_model.feature_importances_

array([0.0325387 , 0.03002397, 0.01420772, 0.42557122, 0.00744234,
       0.05187491, 0.32431066, 0.00102012, 0.09881885, 0.01419152])

In [178]:
X_train.columns[rf_model.feature_importances_ > 0.3]

Index(['Age', 'NumOfProducts'], dtype='object')

In [226]:
test_predict_proba = best_models[0].predict_proba(X_test)

test_predict_proba

# predict_proba 함수는 각 샘플에 대해 어느 클래스에 속할 확률을 0에서 1 사이의 값으로 돌려준다.
# (Claee1, Class2) 형태로 나타나는데 둘을 합치면 1이 됨.

array([[0.37817917, 0.62182083],
       [0.14903766, 0.85096234],
       [0.88524455, 0.11475545],
       ...,
       [0.34137702, 0.65862298],
       [0.92249508, 0.07750492],
       [0.9041138 , 0.0958862 ]])

In [227]:
test_predict_proba[:,1]

array([0.62182083, 0.85096234, 0.11475545, ..., 0.65862298, 0.07750492,
       0.0958862 ])

In [228]:
roc_auc_score(y_test, test_predict_proba[:,1])

0.8469980541732651

In [190]:
test_predict = best_models[0].predict(X_test)
test_predict

# predict 함수는 새로운 속성들을 넣었을 때 그 클래스에 속하는지 속하지 않는지를 나타내는 1 또는 0으로 구성된 벡터를 반환해준다.

array([1, 1, 0, ..., 1, 0, 0])

In [191]:
accuracy_score(y_test,test_predict)

0.8574692944872894