### 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 모델 라이브러리 선언
from sklearn.neighbors import KNeighborsClassifier

# 훈련/테스트 데이터 자동 분리
from sklearn.model_selection import train_test_split

# 모델 정확도 라이브러리 선언
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### 데이터 불러오기

In [3]:
# Load the customer dataset directly from the lecture repository.
csData = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/hyokwan/python-lecture/master/dataset/customer.csv"
)

In [4]:
# Preview the first five rows.
csData.head(n=5)

Unnamed: 0,balance,stock,label
0,30000000,22500000,normal
1,280000000,48000000,diamond
2,300000000,40666666,diamond
3,54000000,28000000,normal
4,768000000,32000000,vip


### 1. 타입통합 / 특성 숫자 컬럼 추가

In [5]:
# Inspect which distinct label values occur (the first occurrence of each is kept).
labels = csData["label"].drop_duplicates()
labels

0     normal
1    diamond
4        vip
Name: label, dtype: object

In [6]:
# Integer code for each text label, so the label can take part in the
# correlation analysis.
labelDict = dict(normal=0, diamond=1, vip=2)

In [7]:
# Encode each text label as its integer code; a label missing from labelDict
# would become NaN.
csData["labelCode"] = csData["label"].map(labelDict)

In [8]:
# Display the frame to confirm that the labelCode column was added.
csData

Unnamed: 0,balance,stock,label,labelCode
0,30000000,22500000,normal,0
1,280000000,48000000,diamond,1
2,300000000,40666666,diamond,1
3,54000000,28000000,normal,0
4,768000000,32000000,vip,2
...,...,...,...,...
19995,628000000,44666666,diamond,1
19996,276000000,20000000,normal,0
19997,652000000,41333333,diamond,1
19998,676000000,45333333,diamond,1


### 2. 특성선정 / 데이터 분리

#### 2-1. 특성 선정

In [13]:
# Pairwise correlation between the numeric columns only. The text column
# `label` is excluded explicitly: pandas 2.0+ no longer drops non-numeric
# columns silently in DataFrame.corr() and raises on string data instead.
corrDf = csData.select_dtypes(include="number").corr()
corrDf

Unnamed: 0,balance,stock,labelCode
balance,1.0,0.565942,0.883144
stock,0.565942,1.0,0.824174
labelCode,0.883144,0.824174,1.0


In [14]:
# Feature-selection threshold: a column qualifies as a feature when its absolute
# correlation with labelCode is greater than this value.
featuresStd = 0.5

In [15]:
# Rows whose absolute correlation with labelCode is above the threshold,
# leaving out the row whose correlation is exactly 1 (labelCode itself).
corrDf.loc[
    corrDf["labelCode"].abs().gt(featuresStd)
    & corrDf["labelCode"].abs().ne(1)
]

Unnamed: 0,balance,stock,labelCode
balance,1.0,0.565942,0.883144
stock,0.565942,1.0,0.824174


In [17]:
# Same selection as above, reduced to the list of row labels (column names).
corrDf.loc[
    corrDf["labelCode"].abs().gt(featuresStd)
    & corrDf["labelCode"].abs().ne(1)
].index.tolist()

['balance', 'stock']

In [18]:
# Keep every column whose absolute correlation with labelCode exceeds the
# featuresStd threshold. The target column is excluded by name rather than by
# testing its self-correlation against exactly 1, so the selection does not
# depend on floating-point equality and a genuinely perfectly-correlated
# column is not dropped by accident.
labelCorr = corrDf["labelCode"].drop(labels="labelCode").abs()
features = labelCorr[labelCorr > featuresStd].index.tolist()
features

['balance', 'stock']

In [19]:
# Target column, kept as a one-element list so that `.loc[:, label]` selects a
# single-column DataFrame rather than a Series.
label = ["label"]

In [20]:
# Split the frame into the feature columns and the label column.
featuresData = csData[features]
labelData = csData[label]

In [21]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
(
    traingData_features,
    testData_features,
    traingData_label,
    testData_label,
) = train_test_split(featuresData, labelData, test_size=0.3, random_state=1)

In [23]:
# Print the shape of each split, in the order train/test features, then
# train/test labels.
for split in (traingData_features, testData_features, traingData_label, testData_label):
    print(split.shape)

(14000, 2)
(6000, 2)
(14000, 1)
(6000, 1)


### 3. 모델 선언 및 학습

In [34]:
# Define the model: a k-nearest-neighbours classifier using the 3 closest samples.
# NOTE(review): the features are fed in unscaled; in the rows displayed earlier
# balance is considerably larger than stock, so it may dominate the distance —
# confirm whether scaling is wanted.
modelMethod = KNeighborsClassifier(n_neighbors = 3)

In [35]:
# Train the model on the training features / labels.
# traingData_label is a single-column DataFrame of shape (n, 1); scikit-learn
# expects a 1-D target and otherwise emits a DataConversionWarning, so the
# column is flattened before fitting.
model_KNN = modelMethod.fit(traingData_features, traingData_label.values.ravel())

  return self._fit(X, y)


### 4. 모델 예측

In [36]:
# Predict a label for every row of the held-out test features.
predict = model_KNN.predict(X=testData_features)
predict

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array(['diamond', 'diamond', 'diamond', ..., 'diamond', 'vip', 'diamond'],
      dtype=object)

### 5. 데이터 정리

In [37]:
# Wrap the predicted labels in a one-column DataFrame named PREDICT.
predictData = pd.DataFrame({"PREDICT": predict})

In [38]:
# Renumber the test answers from 0 (discarding the shuffled original index) so
# they line up row-by-row with the predictions.
testData_label = testData_label.reset_index(drop=True)

In [39]:
# Put the true labels and the predictions side by side.
finalResult = pd.concat(objs=[testData_label, predictData], axis="columns")

In [40]:
# Display the true label next to the predicted label for each test row.
finalResult

Unnamed: 0,label,PREDICT
0,diamond,diamond
1,diamond,diamond
2,diamond,diamond
3,normal,normal
4,normal,normal
...,...,...
5995,diamond,diamond
5996,diamond,diamond
5997,diamond,diamond
5998,vip,vip


### 6. 결과 검증

In [41]:
# Score the predictions against the true test labels.
ac_score = accuracy_score(y_true=testData_label, y_pred=predict)
cl_report = classification_report(y_true=testData_label, y_pred=predict)

In [42]:
# Report the results
### accuracy  : share of all predictions that are correct
### precision : (e.g. of the items the classifier predicted as "apple", the share it got right)
### recall    : (e.g. of the items that really are apples, the share the classifier labelled "apple")
### f1-score  : harmonic mean of precision and recall

print("Accuracy = ", ac_score)
print("result = \n", cl_report)

Accuracy =  0.9951666666666666
result = 
               precision    recall  f1-score   support

     diamond       1.00      1.00      1.00      3483
      normal       0.99      0.99      0.99      1803
         vip       0.99      0.99      0.99       714

    accuracy                           1.00      6000
   macro avg       0.99      0.99      0.99      6000
weighted avg       1.00      1.00      1.00      6000

