#### Chapter 01. Introduction

In [1]:
import numpy as np
import pandas as pd 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
import seaborn as sns
# Apply seaborn's "whitegrid" style and "pastel" palette to subsequent plots.
sns.set(style='whitegrid',  palette="pastel" )

# NOTE(review): the filter below silences every warning raised after this point,
# including deprecation notices from the libraries used later in the notebook.
import warnings
warnings.filterwarnings("ignore")

#### <font color="blue">1-7. Iris Classification</font>
- <font color="blue">species(품종) : setosa, versicolor, virginica </font>
- <font color="blue">측정항목(cm) : petal(꽃잎), sepal(꽃받침) </font>

In [None]:
###############################################################################
############ 1. Data loading 및 확인 
###############################################################################

In [9]:
# Load scikit-learn's bundled iris dataset; the returned object is accessed
# by key (e.g. iris_dataset['data']) in the cells that follow.
from sklearn.datasets import load_iris 
iris_dataset = load_iris()

In [10]:
# List the keys available on the loaded dataset object.
iris_dataset.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [31]:
# Preview every field of the dataset: (label, key, slice to show; None = show all).
preview_specs = [
    ('1.data:', 'data', slice(None, 5)),                  # measurements (numpy matrix)
    ('2.target:', 'target', slice(None, 5)),              # species encoded as 0, 1, 2
    ('3.target_names:', 'target_names', slice(None, 5)),  # species names
    ('4.DESCR:', 'DESCR', slice(None, 50)),               # dataset description (its "readme")
    ('5.Feature_names:', 'feature_names', None),          # column names
    ('6.filename:', 'filename', None),                    # path and file name of the dataset
]
for label, key, window in preview_specs:
    # Look each key up only when its turn comes, so fields print one by one in order.
    field = iris_dataset[key]
    print(label, field if window is None else field[window])

1.data: [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
2.target: [0 0 0 0 0]
3.target_names: ['setosa' 'versicolor' 'virginica']
4.DESCR: .. _iris_dataset:

Iris plants dataset
-----------
5.Feature_names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
6.filename: C:\Users\youngboo.choi\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\datasets\data\iris.csv


In [21]:
###############################################################################
############ 2. train / test set 구분 (sklearn's train_test_split)
###############################################################################

In [22]:
from sklearn.model_selection import train_test_split

# Split the measurements and species labels into train and test partitions.
# random_state is pinned so the shuffled split is the same on every run.
features = iris_dataset['data']
labels = iris_dataset['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=0,
)

- <font color="blue"> 데이터셋이 정렬되어 있으므로, 난수를 발생시켜 랜덤하게 섞어준다. </font>
- <font color="blue"> train / test 가 75:25 비율로 나누어진다 (train_test_split의 기본값 test_size = 0.25). </font>

In [26]:
# Report the shape of each partition, in the order: X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(112, 4)
(38, 4)
(112,)
(38,)


In [37]:
#### Wrap the training features in a DataFrame labelled with the dataset's
#### feature names, then preview the first rows.
feature_columns = iris_dataset['feature_names']
iris_df = pd.DataFrame(data=X_train, columns=feature_columns)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.9,3.0,4.2,1.5
1,5.8,2.6,4.0,1.2
2,6.8,3.0,5.5,2.1
3,4.7,3.2,1.3,0.2
4,6.9,3.1,5.1,2.3


In [None]:
###############################################################################
############ 3. kNN model 
###############################################################################

In [62]:
# k-nearest-neighbours classifier configured to consult a single neighbour
# (n_neighbors = 1) when predicting.
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors = 1)

In [63]:
# Fit the classifier on the training partition (features and species labels).
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [65]:
#### Sepal: 5 x 2.9 cm, petal: 1 x 0.2 cm --> which species is it?
sepal_length, sepal_width = 5, 2.9
petal_length, petal_width = 1, 0.2
# The sample is passed as a one-row 2-D array, in the same column order as the training features.
X_new = np.array([[sepal_length, sepal_width, petal_length, petal_width]])
prediction = knn.predict(X_new)
prediction

array([0])

- <font color="blue"> 예측결과 target = 0, 즉 setosa로 예측되었다. (0: 'setosa', 1:'versicolor', 2:'virginica') </font>

In [68]:
#### Evaluate the model on the held-out test set.
#### For a scikit-learn classifier, score() reports mean accuracy: the fraction of
#### test samples whose predicted species (target) equals the true species.
score = knn.score(X_test, y_test)
score

0.9736842105263158

#### <font color="blue"> 1-8. Summary</font>
- <font color="blue"> 1) 알고리즘과 이웃 수(n_neighbors)를 설정 </font>
- <font color="blue"> 2) 모델 피팅 </font>
- <font color="blue"> 3) 모델 평가 </font>

In [78]:
#### Summary: the whole kNN workflow (split, configure, fit, evaluate) in one cell
n = 3  # number of neighbours consulted for each prediction
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'], iris_dataset['target'], random_state=0
)
# fit() returns the estimator itself, so construction and training are chained.
knn_n = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
score_n = knn_n.score(X_test, y_test)
score_n

0.9736842105263158