## KNN으로 물의 음용 가능 여부(Potability)를 판단하기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Fix NumPy's global random seed so that NumPy-based randomness is repeatable across runs.
np.random.seed(2021)

## 1. Data

### 1.1 Data Load

In [2]:
# Load the water-quality table; the relative path means the CSV must be in the working directory.
water = pd.read_csv('water_potability.csv')

In [3]:
# Separate the target column from the remaining feature columns.
label = water['Potability']
data = water.drop(columns=['Potability'])

### 1.2 Data EDA

- describe()의 count 행을 보면 column마다 count가 다른 것을 확인할 수 있음 (ph, Sulfate, Trihalomethanes는 전체 row 수인 3276보다 작음).

In [4]:
data.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739


- 결측치가 있다는 것을 알 수 있음.

In [5]:
data.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
dtype: int64

### 1.3 Data Preprocess

- 빈 데이터를 제거하는 전처리.
    - row를 제거하는 방법과 column을 제거하는 방법.

#### 1.3.1 row를 제거하는 방법

In [6]:
# Number of missing values in each row (summed across the columns).
na_cnt = data.isna().sum(axis='columns')
na_cnt

0       1
1       1
2       1
3       0
4       0
       ..
3271    0
3272    2
3273    1
3274    1
3275    1
Length: 3276, dtype: int64

In [7]:
# A row whose missing-value count is greater than zero contains at least one NaN.
drop_idx = na_cnt[na_cnt.gt(0)].index

In [9]:
# Index labels of the rows selected for removal
drop_idx

Int64Index([   0,    1,    2,    8,   11,   13,   14,   16,   18,   20,
            ...
            3247, 3252, 3258, 3259, 3260, 3266, 3272, 3273, 3274, 3275],
           dtype='int64', length=1265)

In [8]:
# Remove the flagged rows; `drop` returns a new frame, so `data` itself keeps all rows.
drop_row = data.drop(index=drop_idx)

In [11]:
drop_row.shape

(2011, 9)

In [13]:
data.shape

(3276, 9)

#### 1.3.2 column을 제거하는 방법

In [14]:
# Count missing values per column; this rebinds `na_cnt`, replacing the per-row counts.
na_cnt = data.isna().sum(axis='index')
# Keep only the names of columns that have at least one missing value.
drop_cols = na_cnt[na_cnt.gt(0)].index

In [16]:
# Columns that contain missing values
drop_cols

Index(['ph', 'Sulfate', 'Trihalomethanes'], dtype='object')

In [15]:
# For this exercise the columns with missing values are removed (rather than the rows).
data = data.drop(columns=drop_cols)

### 1.4 Data Split

- 데이터를 Train, Test로 나누기.

In [17]:
from sklearn.model_selection import train_test_split

# Use 70% of the rows for training and the remainder for testing;
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): no `stratify=` is passed, so the class ratio may differ between the two splits.
train_data, test_data, train_label, test_label = train_test_split(
    data, label, train_size=0.7, random_state=2021
)

In [18]:
# Report each split's size and its share of the full dataset
# (the label lengths are used as the row counts of the matching splits).
print(f"train_data size: {len(train_label)}, {len(train_label)/len(data):.2f}")
print(f"test_data size: {len(test_label)}, {len(test_label)/len(data):.2f}")

train_data size: 2293, 0.70
test_data size: 983, 0.30


## 2. KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# Base estimator with default settings; n_neighbors and p are tuned by the grid search below.
knn = KNeighborsClassifier()

### 2.1 Best Hyper Parameter

- n_neighbors
    - 몇 개의 이웃으로 예측할 것인지
    
- p
    - 거리를 어떤 방식으로 계산할지.
        - 1: manhattan distance
        - 2: euclidean distance

In [20]:
from sklearn.model_selection import GridSearchCV

#### 2.1.1 탐색 범위 선정

In [25]:
# Search space for the grid search:
#   n_neighbors - odd neighbour counts from 1 to 11
#   p           - Minkowski power (1 = Manhattan distance, 2 = Euclidean distance)
params = {
    'n_neighbors': list(range(1, 12, 2)),
    'p': [1, 2],
}

In [26]:
params

{'n_neighbors': [1, 3, 5, 7, 9, 11], 'p': [1, 2]}

#### 2.1.2 탐색

In [27]:
# cv=3: score every candidate with 3-fold cross-validation.
# n_jobs=-1: run the search on all available processors.
grid_cv = GridSearchCV(knn, param_grid=params, cv=3, n_jobs=-1)

In [28]:
# Cross-validate every combination in `params`, using the training split only.
grid_cv.fit(train_data, train_label)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11], 'p': [1, 2]})

#### 2.1.3 결과

In [29]:
# best_score_ is the mean cross-validated score of the best parameter combination.
print(f"Best score of parameter search is: {grid_cv.best_score_:.4f}")

Best score of parameter search is: 0.5652


In [31]:
grid_cv.best_params_

{'n_neighbors': 11, 'p': 1}

In [30]:
# Print the selected hyper-parameters, one per line.
print('Best parameter of best score is')
print(f"\t n_neighbors: {grid_cv.best_params_['n_neighbors']}")
print(f"\t p: {grid_cv.best_params_['p']}")

Best parameter of best score is
	 n_neighbors: 11
	 p: 1


#### 2.1.4 예측

In [32]:
# The search object refits the best configuration after the search (refit defaults to True),
# so predicting through it is the same as calling best_estimator_ directly.
train_pred = grid_cv.predict(train_data)
test_pred = grid_cv.predict(test_data)

#### 2.1.5 평가

In [33]:
from sklearn.metrics import accuracy_score

# Accuracy of the predictions against the true labels, computed separately for each split.
train_acc = accuracy_score(train_label, train_pred)
test_acc = accuracy_score(test_label, test_pred)

In [34]:
# Show both accuracies together so the gap between train and test is visible.
print(f"train accuracy is {train_acc:.4f}")
print(f"test accuracy is {test_acc:.4f}")

train accuracy is 0.6520
test accuracy is 0.5595


## 3. Scaling을 할 경우

### 3.1 Data Scaling

- KNN은 거리를 기반으로 하는 알고리즘이기 때문에 각 feature 값의 크기(scale)에 영향을 받음.
- Scaling을 진행해 크기를 맞추기.

In [35]:
data.describe()

Unnamed: 0,Hardness,Solids,Chloramines,Conductivity,Organic_carbon,Turbidity
count,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0
mean,196.369496,22014.092526,7.122277,426.205111,14.28497,3.966786
std,32.879761,8768.570828,1.583085,80.824064,3.308162,0.780382
min,47.432,320.942611,0.352,181.483754,2.2,1.45
25%,176.850538,15666.690297,6.127421,365.734414,12.065801,3.439711
50%,196.967627,20927.833607,7.130299,421.884968,14.218338,3.955028
75%,216.667456,27332.762127,8.114887,481.792304,16.557652,4.50032
max,323.124,61227.196008,13.127,753.34262,28.3,6.739


- Solids의 값이 다른 column에 비해 굉장히 크기 때문에, 거리 계산이 Solids에 크게 좌우될 수밖에 없음.

In [36]:
from sklearn.preprocessing import StandardScaler

# StandardScaler standardizes each feature by removing its mean and scaling to unit variance.
scaler = StandardScaler()

In [37]:
# Fit on the training split only, so test-set statistics never influence the scaling.
scaler.fit(train_data)

StandardScaler()

In [38]:
# Apply the scaling learned from the training split to both splits.
scaled_train_data = scaler.transform(train_data)
scaled_test_data = scaler.transform(test_data)

### 3.2 탐색

In [39]:
# A fresh estimator and search object for the scaled-feature run; the same `params` grid is reused.
# NOTE(review): `cv` is omitted here, so scikit-learn's default fold count is used instead of the
# cv=3 of the unscaled search; the two best_score_ values are therefore not directly comparable.
scaling_knn = KNeighborsClassifier()
scaling_grid_cv = GridSearchCV(scaling_knn, param_grid=params, n_jobs=-1)

In [47]:
# Run the search on the scaled training features.
# NOTE(review): the scaler was fitted on the whole training split before cross-validation, so
# every validation fold has already influenced the scaling; wrapping the scaler and the
# classifier in a Pipeline would avoid that.
scaling_grid_cv.fit(scaled_train_data, train_label)

GridSearchCV(estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11], 'p': [1, 2]})

In [48]:
scaling_grid_cv.best_score_

0.587011825593896

In [49]:
scaling_grid_cv.best_params_

{'n_neighbors': 9, 'p': 1}

### 3.3 평가

In [53]:
# The search object refits the best configuration after the search (refit defaults to True),
# so predicting through it is the same as calling best_estimator_ directly.
scaling_train_pred = scaling_grid_cv.predict(scaled_train_data)
scaling_test_pred = scaling_grid_cv.predict(scaled_test_data)

In [54]:
# Accuracy of the scaled-feature model against the true labels, per split.
scaling_train_acc = accuracy_score(train_label, scaling_train_pred)
scaling_test_acc = accuracy_score(test_label, scaling_test_pred)

In [55]:
# Show train and test accuracy of the scaled-feature model together.
print(f"Scaled data train accuracy is {scaling_train_acc:.4f}")
print(f"Scaled data test accuracy is {scaling_test_acc:.4f}")

Scaled data train accuracy is 0.6829
Scaled data test accuracy is 0.5799


## 4. 마무리

In [56]:
# Final comparison: test accuracy without scaling vs. with scaling.
print(f"test accuracy is {test_acc:.4f}")
print(f"Scaled data test accuracy is {scaling_test_acc:.4f}")

test accuracy is 0.5595
Scaled data test accuracy is 0.5799
