#  아이리스 데이터 셋을 활용한 KNN모델 실습

## 필요한 라이브러리와 데이터셋 로드

In [1]:
import pandas as pd
import os
import glob

In [2]:
path = glob.glob('data/*.csv')
path

['data\\IRIS.csv']

In [3]:
iris = pd.read_csv(path[0])

## iris 데이터 살펴보기

In [4]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [6]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [7]:
# 결측치 확인
iris.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [8]:
# 타겟변수가 될 피쳐 살펴보기
iris['species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: species, dtype: int64

In [9]:
iris['species'].nunique()

3

In [10]:
iris['species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [11]:
# 유니크 / 엔유니크 메서드로도 피쳐 살펴볼 수 있음 

In [12]:
#현재 아이리스 데이터셋에는 인덱스는 있지만 꽃잎 별 id (pk)는 없음 . 이걸 지정해 줄것임
iris.sample(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,Iris-versicolor
67,5.8,2.7,4.1,1.0,Iris-versicolor
112,6.8,3.0,5.5,2.1,Iris-virginica


In [13]:
#id 라는 새로운 컬럼을 range, len메서드를 이용해 채워줄 것임 
iris['id'] = range(len(iris))
iris.sample(1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,id
16,5.4,3.9,1.3,0.4,Iris-setosa,16


In [14]:
# iris데이터 컬럼 순서를 바꾸고 싶다면?  -> 컬럼 나열 후 서브셋 지정 후 덮어씌우기 
iris = iris[['id','sepal_length','sepal_width','petal_length','petal_width','species']]

In [15]:
# 순서가 잘 바뀌었는지 확인
iris.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,4.9,3.0,1.4,0.2,Iris-setosa
2,2,4.7,3.2,1.3,0.2,Iris-setosa
3,3,4.6,3.1,1.5,0.2,Iris-setosa
4,4,5.0,3.6,1.4,0.2,Iris-setosa


# KNN (예측) 준비 

 - KNN 실습 (분류) 는 범주형 변수 예측에 적합
 
 - 데이터가 많으면 느리다 
 
 - 사용하는 파라미터 : n_neighbors

### knn 학습을 위한 train / test 데이터로 나누는 과정

In [16]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            150 non-null    int64  
 1   sepal_length  150 non-null    float64
 2   sepal_width   150 non-null    float64
 3   petal_length  150 non-null    float64
 4   petal_width   150 non-null    float64
 5   species       150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [17]:
#iris dataset 150개의 row 중 100개를 랜덤으로 추출해 train 데이터로 지정  / 나머지 50개는 test 데이터로 지정 
#replace = False -> 비복원추출 :  뽑아낸거는 다시 뽑아내지 않는다는 의미 
train = iris.sample(100, replace = False, random_state = 2022)

In [18]:
train.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
128,128,6.4,2.8,5.6,2.1,Iris-virginica
105,105,7.6,3.0,6.6,2.1,Iris-virginica
30,30,4.8,3.1,1.6,0.2,Iris-setosa
137,137,6.4,3.1,5.5,1.8,Iris-virginica
17,17,5.1,3.5,1.4,0.3,Iris-setosa


In [19]:
#'id' 컬럼 있으니까 인덱스 초기화 해도 무방 
train.reset_index(inplace = True)
train

Unnamed: 0,index,id,sepal_length,sepal_width,petal_length,petal_width,species
0,128,128,6.4,2.8,5.6,2.1,Iris-virginica
1,105,105,7.6,3.0,6.6,2.1,Iris-virginica
2,30,30,4.8,3.1,1.6,0.2,Iris-setosa
3,137,137,6.4,3.1,5.5,1.8,Iris-virginica
4,17,17,5.1,3.5,1.4,0.3,Iris-setosa
...,...,...,...,...,...,...,...
95,98,98,5.1,2.5,3.0,1.1,Iris-versicolor
96,108,108,6.7,2.5,5.8,1.8,Iris-virginica
97,2,2,4.7,3.2,1.3,0.2,Iris-setosa
98,46,46,5.1,3.8,1.6,0.2,Iris-setosa


In [20]:
# 컬럼으로 들어온 index를 drop 해주기 
# axis = 1의 의미는 index 컬럼의 모든 data를 삭제하겠다는 의미 
train = train.drop(['index'], axis = 1)

In [21]:
train

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
0,128,6.4,2.8,5.6,2.1,Iris-virginica
1,105,7.6,3.0,6.6,2.1,Iris-virginica
2,30,4.8,3.1,1.6,0.2,Iris-setosa
3,137,6.4,3.1,5.5,1.8,Iris-virginica
4,17,5.1,3.5,1.4,0.3,Iris-setosa
...,...,...,...,...,...,...
95,98,5.1,2.5,3.0,1.1,Iris-versicolor
96,108,6.7,2.5,5.8,1.8,Iris-virginica
97,2,4.7,3.2,1.3,0.2,Iris-setosa
98,46,5.1,3.8,1.6,0.2,Iris-setosa


In [22]:
# 기존 iris data set 중 train 데이타로 들어가지 않은 나머지 50개의 row를 test 데이터로 지정해 줄것
# ~column.isin()

In [23]:
test = iris.loc[~iris['id'].isin(train['id'])]

In [24]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 11 to 144
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            50 non-null     int64  
 1   sepal_length  50 non-null     float64
 2   sepal_width   50 non-null     float64
 3   petal_length  50 non-null     float64
 4   petal_width   50 non-null     float64
 5   species       50 non-null     object 
dtypes: float64(4), int64(1), object(1)
memory usage: 2.7+ KB


In [25]:
test.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
11,11,4.8,3.4,1.6,0.2,Iris-setosa
12,12,4.8,3.0,1.4,0.1,Iris-setosa
13,13,4.3,3.0,1.1,0.1,Iris-setosa
14,14,5.8,4.0,1.2,0.2,Iris-setosa
15,15,5.7,4.4,1.5,0.4,Iris-setosa


In [26]:
test.tail()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
130,130,7.4,2.8,6.1,1.9,Iris-virginica
134,134,6.1,2.6,5.6,1.4,Iris-virginica
138,138,6.0,3.0,4.8,1.8,Iris-virginica
143,143,6.8,3.2,5.9,2.3,Iris-virginica
144,144,6.7,3.3,5.7,2.5,Iris-virginica


In [27]:
# test 데이터프레임도 무사히 추출했으니 인덱스를 삭제해 보자 

In [28]:
test = test.reset_index().drop(['index'], axis = 1)

In [29]:
#기존 인덱스 컬럼으로 올리고 삭제까지 잘 되었는지 확인 
test.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
0,11,4.8,3.4,1.6,0.2,Iris-setosa
1,12,4.8,3.0,1.4,0.1,Iris-setosa
2,13,4.3,3.0,1.1,0.1,Iris-setosa
3,14,5.8,4.0,1.2,0.2,Iris-setosa
4,15,5.7,4.4,1.5,0.4,Iris-setosa


### knn 패키지 임포트 - 사이킷런 

In [30]:
from sklearn.neighbors import KNeighborsClassifier

In [31]:
# 모델을 정의하는 함수 KNeighborsClassifier를 KNN에 저장하고 가장 가까운  3개의 개체를 참조헤 데이터 분석을 할 것이므로 
# n_neighbors = 3 으로 지정 

knn = KNeighborsClassifier(n_neighbors = 3)
#n_neighbors가 3인 모델을 knn이라고 정의하는 문장 

### knn 학습시키기

- knn.fit(A,B)

A: input 변수 - 학습을 위해 사용되는 피쳐


B: target 변수 - 학습해서 알고자 하는 피쳐 

#### knn 모델을 train데이터에서 학습시킬것 
- 꽃잎 길이 . 넓이 피쳐 가지고 종을 예측하는거임 

In [32]:
train.columns

Index(['id', 'sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [33]:
knn.fit(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], train['species'])

KNeighborsClassifier(n_neighbors=3)

In [34]:
test.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species
0,11,4.8,3.4,1.6,0.2,Iris-setosa
1,12,4.8,3.0,1.4,0.1,Iris-setosa
2,13,4.3,3.0,1.1,0.1,Iris-setosa
3,14,5.8,4.0,1.2,0.2,Iris-setosa
4,15,5.7,4.4,1.5,0.4,Iris-setosa


#### knn.predict(A)

In [35]:
# knn.fit(A, B)로 피쳐 지정 했으니 본격적으로 학습을 진행해보자 

In [36]:
knn.predict(test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica'], dtype=object)

In [37]:
# 예측한 값은 새로운 컬럼으로 지정해서 저장 

test['pred'] = knn.predict(test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

In [38]:
test.head()

Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,species,pred
0,11,4.8,3.4,1.6,0.2,Iris-setosa,Iris-setosa
1,12,4.8,3.0,1.4,0.1,Iris-setosa,Iris-setosa
2,13,4.3,3.0,1.1,0.1,Iris-setosa,Iris-setosa
3,14,5.8,4.0,1.2,0.2,Iris-setosa,Iris-setosa
4,15,5.7,4.4,1.5,0.4,Iris-setosa,Iris-setosa


In [39]:
# 예측한 값과 실제 speices의 값이 얼마나 일치하는지를 봐보자 
# mean 활용해 평균치 출력 

In [40]:
(test['species'] == test['pred']).mean()

0.94

In [44]:
#일치하는 정도가 94%... 그렇다면 최적의 k를 찾는 여정을 떠나자 

In [42]:
# 위 작업을 반복해서 일치하는 정도가 가장 높은 개체군을 찾아야 하니까 for문을 활용해보자 

In [43]:
# 우리가 알고싶은건 k의 갯수니까 ~ range를 한 1~30정도로 셋팅해주고 시작

for k in range(1, 30): 
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], test['species'])
    predictions = knn.predict(test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])
    test['pred'] = predictions
    print((test['species'] == test['pred']).mean())

1.0
0.96
0.98
0.94
0.94
0.94
0.94
0.92
0.94
0.92
0.92
0.92
0.92
0.92
0.92
0.92
0.92
0.92
0.92
0.9
0.92
0.88
0.88
0.86
0.84
0.86
0.86
0.72
0.72


In [None]:
# 결과 
# k = 3일 떄 최적의 정도 0.98 
# 만약 중복값이 있다면 k가 가장 적은 수를 택하면 된다 