### 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np

In [18]:
# 모델 라이브러리 선언
from sklearn import svm

# 훈련/테스트 데이터 자동 분리
from sklearn.model_selection import train_test_split

# 모델 정확도 라이브러리 선언
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### 데이터 불러오기

In [3]:
csData = pd.read_csv("https://raw.githubusercontent.com/hyokwan/python-lecture/master/dataset/customer.csv")

In [4]:
csData.head()

Unnamed: 0,balance,stock,label
0,30000000,22500000,normal
1,280000000,48000000,diamond
2,300000000,40666666,diamond
3,54000000,28000000,normal
4,768000000,32000000,vip


### 1. 타입통합 / 특정 숫자 컬럼 추가

In [5]:
# 라벨 유형 확인
labels = csData.label.drop_duplicates()

In [6]:
print(labels)

0     normal
1    diamond
4        vip
Name: label, dtype: object


In [7]:
# 상관관계 분석을 위한 라벨코드 컬럼 추가
labelDict = {"normal" : 0,
             "diamond" : 1,
             "vip" : 2}

In [8]:
csData["labelCode"] = csData.label.map(labelDict)

In [9]:
csData

Unnamed: 0,balance,stock,label,labelCode
0,30000000,22500000,normal,0
1,280000000,48000000,diamond,1
2,300000000,40666666,diamond,1
3,54000000,28000000,normal,0
4,768000000,32000000,vip,2
...,...,...,...,...
19995,628000000,44666666,diamond,1
19996,276000000,20000000,normal,0
19997,652000000,41333333,diamond,1
19998,676000000,45333333,diamond,1


### 2. 특성 선정 / 데이터 분리

#### 2-1. 특성 선정

In [10]:
# minMaxNorm 정규화 (0 ~ 1 사이로 값 변경)
### 최소최대정규화 공식
### x' = (x - x_min) / (x_max - x_min)
def minMaxNorm(indata) :
    maxValue = max(indata)
    minValue = min(indata)
    deNormValue = maxValue - minValue
    # 전체 양수로 변경
    plusData = indata - minValue
    # 최대값 기준 각 데이터 정규화
    outData = indata
    if deNormValue != 0 :
        outData = plusData / deNormValue
    else :
        pass
    return outData

In [11]:
csData["balance_norm"] = minMaxNorm(csData.balance)
csData["stock_norm"] = minMaxNorm(csData.stock)

In [12]:
csData

Unnamed: 0,balance,stock,label,labelCode,balance_norm,stock_norm
0,30000000,22500000,normal,0,0.000000,0.080000
1,280000000,48000000,diamond,1,0.324675,0.488000
2,300000000,40666666,diamond,1,0.350649,0.370667
3,54000000,28000000,normal,0,0.031169,0.168000
4,768000000,32000000,vip,2,0.958442,0.232000
...,...,...,...,...,...,...
19995,628000000,44666666,diamond,1,0.776623,0.434667
19996,276000000,20000000,normal,0,0.319481,0.040000
19997,652000000,41333333,diamond,1,0.807792,0.381333
19998,676000000,45333333,diamond,1,0.838961,0.445333


In [25]:
csData.corr()

Unnamed: 0,balance,stock,labelCode,balance_norm,stock_norm
balance,1.0,0.565942,0.883144,1.0,0.565942
stock,0.565942,1.0,0.824174,0.565942,1.0
labelCode,0.883144,0.824174,1.0,0.883144,0.824174
balance_norm,1.0,0.565942,0.883144,1.0,0.565942
stock_norm,0.565942,1.0,0.824174,0.565942,1.0


#### 2-2. 데이터 분리

In [44]:
# features, label 컬럼 설정
features = ["balance_norm", "stock_norm"]
# label = ["labelCode"]
label = ["label"]

In [45]:
# features 데이터, label 데이터 분리
featuresData = csData.loc[:, features]
lableData = csData.loc[:, label]

In [46]:
# train_test_split 함수를 활용해 feature, label 데이터 분리 ( 7 : 3 )
trainingData_features,\
testData_features,\
trainingData_label,\
testData_label = \
                train_test_split(featuresData, lableData, test_size = 0.3,
                                 random_state = 1)

In [47]:
print(trainingData_features.shape)
print(testData_features.shape)
print(trainingData_label.shape)
print(testData_label.shape)

(14000, 2)
(6000, 2)
(14000, 1)
(6000, 1)


### 3. 모델 선언 및 학습

In [48]:
# 모델 정의
modelMethod = svm.SVC(random_state = 1)

In [49]:
# 머신러닝(훈련데이터 특성 / 답지)
model_SVM = modelMethod.fit(trainingData_features, trainingData_label)

  y = column_or_1d(y, warn=True)


### 4. 모델 예측

In [50]:
# 예측
predict = model_SVM.predict(testData_features)
predict

array(['diamond', 'diamond', 'diamond', ..., 'diamond', 'vip', 'diamond'],
      dtype=object)

### 5. 데이터 정리

In [51]:
# 예측결과를 데이터프레임으로 변환
predictData = pd.DataFrame(predict, columns=["PREDICT"])

In [52]:
# 테스트데이터 정답지 인덱스 초기화
testData_label.reset_index(drop=True, inplace=True)

In [53]:
# 예측결과 / 정답지 병합
finalResult = pd.concat( [testData_label, predictData], axis = 1)

In [54]:
finalResult

Unnamed: 0,label,PREDICT
0,diamond,diamond
1,diamond,diamond
2,diamond,diamond
3,normal,normal
4,normal,normal
...,...,...
5995,diamond,diamond
5996,diamond,diamond
5997,diamond,diamond
5998,vip,vip


### 6. 결과 검증

In [55]:
# 결과 테스트 하기
ac_score = accuracy_score(testData_label, predict)
cl_report = classification_report(testData_label, predict)

In [56]:
# 결과 리포트 하기
### accuracy : 정확도
### precision : 정밀도 (ex. 사과라고 분류기가 예측한 결과중에 분류기가 맞춘 비율
### recall : 재현율 (ex. 원래 사과들중에 사과라고 분류기가 맞춘 비율)
### f1-score : precision 과 recall의 조화평균

print("Accuracy = ", ac_score)
print("result = \n", cl_report)

Accuracy =  0.9953333333333333
result = 
               precision    recall  f1-score   support

     diamond       1.00      1.00      1.00      3483
      normal       0.99      0.99      0.99      1803
         vip       0.99      0.99      0.99       714

    accuracy                           1.00      6000
   macro avg       1.00      0.99      0.99      6000
weighted avg       1.00      1.00      1.00      6000

