### 라이브러리 불러오기

In [2]:
import pandas as pd
import numpy as np

In [5]:
# 모델 라이브러리 선언
from sklearn import datasets, tree

# 분석 모델
from sklearn.tree import DecisionTreeClassifier

# 훈련/테스트 데이터 자동 분리
from sklearn.model_selection import train_test_split

# 모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### 데이터 불러오기

In [6]:
# Download the customer dataset and preview its first rows.
csData = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/hyokwan/python-lecture/master/dataset/customer.csv",
)
csData.head()

Unnamed: 0,balance,stock,label
0,30000000,22500000,normal
1,280000000,48000000,diamond
2,300000000,40666666,diamond
3,54000000,28000000,normal
4,768000000,32000000,vip


### 1. 타입통합 / 특정 숫자 컬럼 추가

In [9]:
# Inspect which distinct label values occur in the data.
labels = csData["label"].drop_duplicates()
labels

0     normal
1    diamond
4        vip
Name: label, dtype: object

In [10]:
# Integer code for each label so the label can take part in the correlation analysis.
labelDict = dict(normal=0, diamond=1, vip=2)

In [11]:
# Add the numeric labelCode column by translating each label through labelDict.
csData["labelCode"] = csData["label"].map(labelDict)

In [12]:
# Display the frame to check the newly added labelCode column.
csData

Unnamed: 0,balance,stock,label,labelCode
0,30000000,22500000,normal,0
1,280000000,48000000,diamond,1
2,300000000,40666666,diamond,1
3,54000000,28000000,normal,0
4,768000000,32000000,vip,2
...,...,...,...,...
19995,628000000,44666666,diamond,1
19996,276000000,20000000,normal,0
19997,652000000,41333333,diamond,1
19998,676000000,45333333,diamond,1


### 2. 특성 선정 / 데이터 분리

#### 2-1. 특성 선정

In [15]:
# Pairwise correlation between the numeric columns only. The string column
# `label` is excluded explicitly: pandas >= 2.0 no longer drops non-numeric
# columns silently in DataFrame.corr() and raises on them instead.
corrDf = csData.select_dtypes(include="number").corr()
corrDf

Unnamed: 0,balance,stock,labelCode
balance,1.0,0.565942,0.883144
stock,0.565942,1.0,0.824174
labelCode,0.883144,0.824174,1.0


In [14]:
# Threshold on the absolute correlation with labelCode used below to pick feature columns.
featuresStd = 0.5

In [16]:
# Rows whose absolute correlation with labelCode exceeds the threshold.
# The labelCode row is excluded by name instead of testing its self-correlation
# for equality with 1: that float comparison is fragile and would also hide a
# feature that happens to be perfectly correlated with the label.
corrDf[(corrDf["labelCode"].abs() > featuresStd) & (corrDf.index != "labelCode")]

Unnamed: 0,balance,stock,labelCode
balance,1.0,0.565942,0.883144
stock,0.565942,1.0,0.824174


In [17]:
# Names of the columns whose absolute correlation with labelCode exceeds the
# threshold. labelCode itself is skipped by name rather than by comparing its
# self-correlation with 1, which is a fragile float equality test and would
# also discard a feature that is perfectly correlated with the label.
features = [
    column
    for column, corr in corrDf["labelCode"].items()
    if column != "labelCode" and abs(corr) > featuresStd
]
features

['balance', 'stock']

In [18]:
# Target column name, kept in a list so the .loc selection below yields a one-column DataFrame.
label = ["label"]

#### 2-2. 데이터 분리

In [19]:
# Split the frame column-wise into the model inputs and the target.
featuresData = csData[features]
labelData = csData[label]

In [20]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
(
    trainingData_features,
    testData_features,
    trainingData_label,
    testData_label,
) = train_test_split(
    featuresData,
    labelData,
    test_size=0.3,
    random_state=1,
)

In [21]:
# Print the shape of each of the four splits, one per line.
print(
    *(
        part.shape
        for part in (
            trainingData_features,
            testData_features,
            trainingData_label,
            testData_label,
        )
    ),
    sep="\n",
)

(14000, 2)
(6000, 2)
(14000, 1)
(6000, 1)


### 3. 모델 선언 및 학습

In [22]:
# Declare the model: a decision-tree classifier with a fixed seed.
modelMethod = DecisionTreeClassifier(random_state=1)

In [23]:
# Train the model on the training features and their labels.
model_Dt = modelMethod.fit(
    X=trainingData_features,
    y=trainingData_label,
)

### 4. 모델 예측

In [24]:
# Predict a label for every row of the held-out test features.
predict = model_Dt.predict(X=testData_features)
predict

array(['diamond', 'diamond', 'diamond', ..., 'diamond', 'vip', 'diamond'],
      dtype=object)

### 5. 데이터 정리

In [25]:
# Wrap the predictions in a single-column DataFrame named PREDICT.
predictData = pd.DataFrame({"PREDICT": predict})

In [26]:
# Display the test labels; they still carry the row index of the original frame.
testData_label

Unnamed: 0,label
11456,diamond
16528,diamond
3253,diamond
18614,normal
1544,normal
...,...
12696,diamond
14288,diamond
18768,diamond
3950,vip


In [27]:
# Renumber the test labels from 0 so they line up row-for-row with the predictions.
testData_label = testData_label.reset_index(drop=True)
testData_label

Unnamed: 0,label
0,diamond
1,diamond
2,diamond
3,normal
4,normal
...,...
5995,diamond
5996,diamond
5997,diamond
5998,vip


In [29]:
# Put the true labels and the predictions side by side.
# concat(axis=1) aligns rows on the index, so the labels are renumbered from 0
# here as well: the merge then stays row-aligned with predictData even if the
# earlier index-reset cell was skipped, instead of silently producing NaN rows.
finalData = pd.concat(
    [testData_label.reset_index(drop=True), predictData],
    axis=1,
)
finalData

Unnamed: 0,label,PREDICT
0,diamond,diamond
1,diamond,diamond
2,diamond,diamond
3,normal,normal
4,normal,normal
...,...,...
5995,diamond,diamond
5996,diamond,diamond
5997,diamond,diamond
5998,vip,vip


### 6. 결과 검증

In [30]:
# Score the predictions against the true test labels.
ac_score = accuracy_score(y_true=testData_label, y_pred=predict)
cl_report = classification_report(y_true=testData_label, y_pred=predict)

In [31]:
# Report the results
### accuracy  : overall accuracy
### precision : precision (e.g. of everything the classifier predicted as "apple", the share it got right)
### recall    : recall (e.g. of all actual apples, the share the classifier labelled "apple")
### f1-score  : harmonic mean of precision and recall

print("Accuracy = ", ac_score)
print("result = \n", cl_report)

Accuracy =  0.9945
result = 
               precision    recall  f1-score   support

     diamond       1.00      1.00      1.00      3483
      normal       0.99      0.99      0.99      1803
         vip       0.99      0.99      0.99       714

    accuracy                           0.99      6000
   macro avg       0.99      0.99      0.99      6000
weighted avg       0.99      0.99      0.99      6000

