### 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np

In [3]:
# 모델 라이브러리 선언
from sklearn import svm

# 훈련/테스트 데이터 자동 분리
from sklearn.model_selection import train_test_split

# 모델 정확도 라이브러리 선언
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### 데이터 불러오기

In [2]:
csData = pd.read_csv("https://raw.githubusercontent.com/hyokwan/python-lecture/master/dataset/pima-indians-diabetes.csv")
csData

Unnamed: 0,pregnancies,glucose,diastolic,trceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### 1. 타입통합 / 특정 숫자 컬럼 추가

In [4]:
csData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   trceps       768 non-null    int64  
 4   insulin      768 non-null    int64  
 5   bmi          768 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# 당뇨병 유무 유형 확인
csData.diabetes.drop_duplicates()

0    1
1    0
Name: diabetes, dtype: int64

In [6]:
sortKey = ["dpf"]

In [10]:
csData = csData.sort_values(sortKey).reset_index(drop=True)
csData

Unnamed: 0,pregnancies,glucose,diastolic,trceps,insulin,bmi,dpf,age,diabetes
0,0,102,52,0,0,25.1,0.078,21,0
1,6,87,80,0,0,23.2,0.084,32,0
2,2,90,70,17,0,27.3,0.085,22,0
3,6,92,62,32,126,32.0,0.085,46,0
4,2,125,60,20,140,33.8,0.088,31,0
...,...,...,...,...,...,...,...,...,...
763,0,180,66,39,0,42.0,1.893,25,1
764,3,173,82,48,465,38.4,2.137,25,1
765,0,137,40,35,168,43.1,2.288,33,1
766,4,197,70,39,744,36.7,2.329,31,0


### 2. 특성 선정 / 데이터 분리

#### 2-1. 특성 선정

In [11]:
corrDf = csData.corr()
corrDf

Unnamed: 0,pregnancies,glucose,diastolic,trceps,insulin,bmi,dpf,age,diabetes
pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
diastolic,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
trceps,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
bmi,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
dpf,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
diabetes,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [12]:
featuresStd = 0.2

In [13]:
corrDf[ ( abs(corrDf.diabetes) > featuresStd ) & ( abs(corrDf.diabetes) != 1 ) ]

Unnamed: 0,pregnancies,glucose,diastolic,trceps,insulin,bmi,dpf,age,diabetes
pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
bmi,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356


In [15]:
features = list( (corrDf[ ( abs(corrDf.diabetes) > featuresStd ) & ( abs(corrDf.diabetes) != 1 ) ]).index )
features

['pregnancies', 'glucose', 'bmi', 'age']

In [16]:
label = ["diabetes"]

### 2-2. 데이터 분리

In [17]:
featuresData = csData.loc[:, features]
labelData = csData.loc[:, label]

In [19]:
# train_test_split 함수를 활용해 feature, label 데이터 분리 ( 7 : 3 )
trainingData_features,\
testData_features,\
trainingData_label,\
testData_label = \
                train_test_split(featuresData, labelData, test_size = 0.3,
                                 random_state = 1)

In [20]:
print(trainingData_features.shape)
print(testData_features.shape)
print(trainingData_label.shape)
print(testData_label.shape)

(537, 4)
(231, 4)
(537, 1)
(231, 1)


### 3. 모델 선언 및 학습

In [21]:
# 모델 정의
modelMethod = svm.SVC(random_state=1)

In [22]:
# 머신러닝(훈련데이터 features / label)
model_SVM = modelMethod.fit(trainingData_features, trainingData_label)

  y = column_or_1d(y, warn=True)


### 4. 모델 예측

In [23]:
predict = model_SVM.predict(testData_features)
predict

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype=int64)

### 5. 데이터 정리

In [24]:
predictData = pd.DataFrame(predict, columns=["PREDICT"])

In [25]:
testData_label.reset_index(drop=True, inplace=True)

In [26]:
finalData = pd.concat( [testData_label, predictData], axis = 1 )
finalData

Unnamed: 0,diabetes,PREDICT
0,0,0
1,0,0
2,0,0
3,0,1
4,0,0
...,...,...
226,1,0
227,0,0
228,1,1
229,0,0


### 6. 결과 검증

In [27]:
# 결과 테스트 하기
ac_score = accuracy_score(testData_label, predict)
cl_report = classification_report(testData_label, predict)

In [28]:
# 결과 리포트 하기
### accuracy : 정확도
### precision : 정밀도 (ex. 사과라고 분류기가 예측한 결과중에 분류기가 맞춘 비율
### recall : 재현율 (ex. 원래 사과들중에 사과라고 분류기가 맞춘 비율)
### f1-score : precision 과 recall의 조화평균

print("Accuracy = ", ac_score)
print("result = \n", cl_report)

Accuracy =  0.7229437229437229
result = 
               precision    recall  f1-score   support

           0       0.76      0.87      0.81       156
           1       0.60      0.43      0.50        75

    accuracy                           0.72       231
   macro avg       0.68      0.65      0.65       231
weighted avg       0.71      0.72      0.71       231

