## 웹으로 시각화하기

In [1]:
# Web에 포함시킬 모델 구성하기
import pandas as pd

In [2]:
# Load the raw data from the CSV file in the working directory into a DataFrame.
df1 = pd.read_csv("01_Data.csv")


In [3]:
# Display the loaded DataFrame (notebook cell output) to inspect its columns and rows.
df1

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
0,1,66758234,렌탈,일반계약,영업방판,2019-05-06,60,CMS,DES-1,96900,개인,42.0,경기도,경기도,계약확정,0,없음,여자,9.0,새마을금고
1,2,66755948,렌탈,교체계약,영업방판,2020-02-20,60,카드이체,DES-1,102900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,2.0,현대카드
2,3,66756657,렌탈,일반계약,홈쇼핑/방송,2019-02-28,60,CMS,DES-1,96900,개인,48.0,경기도,경기도,계약확정,0,없음,여자,8.0,우리은행
3,4,66423450,멤버십,멤버십3유형,재계약,2019-05-13,12,CMS,DES-1,66900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,5.0,농협회원조합
4,5,66423204,멤버십,멤버십3유형,재계약,2019-05-10,12,CMS,DES-1,66900,개인,60.0,경기도,경기도,기간만료,12,있음,남자,8.0,농협회원조합
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51296,51298,66579515,렌탈,프로모션계약,대형마트A,2019-03-01,60,CMS,DES-3A,96900,개인,47.0,경기도,경기도,계약확정,0,없음,남자,,기업은행
51297,51299,66799558,렌탈,일반계약,대형마트A,2019-04-01,60,CMS,DES-1,96900,개인,42.0,경기도,경기도,계약확정,0,없음,여자,8.0,새마을금고
51298,51300,66799197,렌탈,프로모션계약,영업방판,2019-04-01,39,카드이체,ERA,120900,개인,65.0,서울특별시,서울특별시,계약확정,0,없음,여자,1.0,롯데카드
51299,51301,66792778,렌탈,일반계약,홈쇼핑/방송,2020-02-06,60,카드이체,DES-1,96900,개인,54.0,서울특별시,서울특별시,계약확정,0,없음,여자,2.0,롯데카드


In [4]:
# Build the binary target from the contract state:
#   0 <- '계약확정' (contract confirmed), '기간만료' (term expired)
#   1 <- '해약확정' (cancellation confirmed), '해약진행중' (cancellation in progress)
# Series.map is used instead of Series.replace: replacing strings with ints
# relies on pandas' implicit object -> int64 downcast, which is deprecated
# (FutureWarning in pandas 2.2) and would leave an object-dtype target later.
state_to_target = {'계약확정': 0,
                   '기간만료': 0,
                   '해약확정': 1,
                   '해약진행중': 1}
target = df1['State'].map(state_to_target)

# Fail fast on any state without a label (including missing values) instead of
# letting an unlabeled row flow into model training.
unmapped_states = df1.loc[target.isna(), 'State'].unique().tolist()
if unmapped_states:
    raise ValueError(f"Unmapped State values: {unmapped_states}")

# Every row is labeled at this point, so the cast to integer is safe.
df1['Target'] = target.astype('int64')

In [5]:
# Display the new Target column to check its values and dtype.
df1['Target']

0        0
1        0
2        0
3        0
4        0
        ..
51296    0
51297    0
51298    0
51299    0
51300    0
Name: Target, Length: 51301, dtype: int64

In [6]:
# List the column names available for feature selection.
df1.columns

Index(['Index', 'Member_ID', 'Sales_Type', 'Contract_Type', 'Channel',
       'Datetime', 'Term', 'Payment_Type', 'Product_Type', 'Amount_Month',
       'Customer_Type', 'Age', 'Address1', 'Address2', 'State',
       'Overdue_count', 'Overdue_Type', 'Gender', 'Credit_Rank', 'Bank',
       'Target'],
      dtype='object')

## - RandomForestClassifier()
## - MinMaxScaler()
## - OneHotEncoder()
## - SimpleImputer()
## - SMOTE
## - CV = 3 / hyperparameter: max_depth 5 ~ 10 / min_samples_split 5 ~ 10
## - model_web.sav

필요한 라이브러리 호출

In [10]:
# 훈련데이터와 검증데이터 분리
from sklearn.model_selection import train_test_split

# 컬럼 변환
from sklearn.compose import make_column_transformer

# 파이프라인 만들기
from imblearn.pipeline import make_pipeline

# 결측치 대체 방법
from sklearn.impute import SimpleImputer

# 전처리 방법
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# 데이터의 비율 맞춰주기
from imblearn.over_sampling import SMOTE

# 대표적인 배깅 모델
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# 알고리즘 분류 결과 확인
from sklearn.metrics import classification_report

# 모델 저장
import pickle

1. X, Y 데이터 분할

In [7]:
# Columns used as model inputs.
feature_columns = [
    'Term',
    'Product_Type',
    'Amount_Month',
    'Age',
    'Gender',
    'Credit_Rank',
]

# X holds the input features, Y the label column to predict.
X = df1[feature_columns]
Y = df1['Target']

2. 학습데이터, 검증데이터 분할

In [9]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
)

3. 파이프라인 만들기

In [11]:
# 1) Per-dtype preprocessing pipelines.
#    Numeric columns: impute missing values (SimpleImputer's default strategy),
#    then scale with MinMaxScaler.
numeric_pipe = make_pipeline(SimpleImputer(), MinMaxScaler())
#    Categorical columns: impute with the most frequent value, then one-hot
#    encode. handle_unknown='ignore' encodes a category that was not seen during
#    fit as all zeros instead of raising, so a rare category that lands only in
#    a validation fold, or new input to the saved model, cannot crash transform.
category_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# 2) Split the feature columns by dtype. select_dtypes reads only the dtypes,
#    whereas describe() computes summary statistics just to get column names.
#    Using include/exclude on the same selector makes the two lists a full
#    partition of X, so no column is silently dropped by the transformer.
numeric_list = X.select_dtypes(include='number').columns.tolist()
category_list = X.select_dtypes(exclude='number').columns.tolist()

# 3) Route each column list through its matching preprocessing pipeline.
preprocess_pipe = make_column_transformer((numeric_pipe, numeric_list),
                                          (category_pipe, category_list))

# 4) Full model: preprocess -> oversample with SMOTE -> random forest.
#    Both random steps are seeded (same seed as the train/test split) so the
#    search results and the saved model are reproducible between runs.
model_pipe = make_pipeline(preprocess_pipe,
                           SMOTE(random_state=1234),
                           RandomForestClassifier(random_state=1234))

4. 하이퍼파라미터를 사용해서 교차검증 진행하기

In [16]:
# Search space: every combination of max_depth and min_samples_split from 5 to
# 10. Keys are "<pipeline step name>__<parameter>"; make_pipeline names each
# step after its lower-cased class name.
hyper_list = {
    'randomforestclassifier__max_depth': range(5, 11),
    'randomforestclassifier__min_samples_split': range(5, 11),
}

# 5-fold cross-validated grid search scored by F1, using all available cores.
# NOTE(review): the summary list near the top of this notebook says "CV = 3",
# while this call uses cv=5 — confirm which one is intended.
grid_model = GridSearchCV(
    estimator=model_pipe,
    param_grid=hyper_list,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

grid_model.fit(X_train, Y_train)

5. 베스트모델 테스트하고 저장하기

In [17]:
# Take the pipeline that GridSearchCV refit with the best hyperparameters.
best_model = grid_model.best_estimator_

# Persist the fitted pipeline for the web app. The context manager guarantees
# the file is flushed and closed even if pickling fails; a bare open() left the
# handle open and the file possibly incomplete.
with open('model_web.sav', 'wb') as model_file:
    pickle.dump(best_model, model_file)

6. 예측 확률과 예측값 확인하기

In [19]:
# Predicted class probabilities for the training data: one row per sample,
# one column per class.
best_model.predict_proba(X_train)

array([[0.12751909, 0.87248091],
       [0.69960335, 0.30039665],
       [0.56281724, 0.43718276],
       ...,
       [0.71729959, 0.28270041],
       [0.53793411, 0.46206589],
       [0.59374673, 0.40625327]])

In [20]:
# Second-column probability for the first training sample.
# NOTE(review): presumably this is the probability of Target == 1 — confirm
# the column order against best_model.classes_.
best_model.predict_proba(X_train)[0][1]

0.872480906807947

In [22]:
# Predicted labels for the training and test sets, kept in separate variables
# so the two sets can be compared.
Y_train_pred = best_model.predict(X_train)
Y_test_pred = best_model.predict(X_test)