In [1]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris, load_wine
from sklearn.svm import SVC
import pandas as pd
import numpy as np

In [2]:
# Load iris into a DataFrame with short snake_case feature names, attach the
# class id as a 'label' column, and drop rows that are exact duplicates
# (features and label together). The index is left as-is, not reset.
iris = load_iris()
iris_df = (
    pd.DataFrame(
        iris.data,
        columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    )
    .assign(label=iris.target)
    .drop_duplicates()
)
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [3]:
# Experiment 1: iris with duplicate rows removed, split WITHOUT stratifying
# on the class label, then check k-NN accuracy on the held-out rows.

X1 = iris_df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y1 = iris_df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.3, random_state=42
)

# Shapes of the feature matrices (train, test)
print(X_train.shape, X_test.shape)
# Shapes of the label vectors (train, test)
print(y_train.shape, y_test.shape)

# Per-class sample counts in each split
print(np.unique(y_train, return_counts=True))  # class distribution of the training labels
print(np.unique(y_test, return_counts=True))  # class distribution of the test labels

# k-NN with k=8 and Minkowski distance with p=2
knn = KNeighborsClassifier(n_neighbors=8, metric="minkowski", p=2)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, y_pred)
print(f"정확도 : {knn_acc:.2f}")


(104, 4) (45, 4)
(104,) (45,)
(array([0, 1, 2]), array([31, 37, 36], dtype=int64))
(array([0, 1, 2]), array([19, 13, 13], dtype=int64))
정확도 : 0.98


In [4]:
# Experiment 2: iris with duplicate rows removed, split WITH stratification
# on the class label, then check k-NN accuracy on the held-out rows.

X2 = iris_df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y2 = iris_df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.3, random_state=42, stratify=y2
)

# Shapes of the feature matrices (train, test)
print(X_train.shape, X_test.shape)

# Shapes of the label vectors (train, test)
print(y_train.shape, y_test.shape)

# Per-class sample counts in each split
print(np.unique(y_train, return_counts=True))  # class distribution of the training labels
print(np.unique(y_test, return_counts=True))  # class distribution of the test labels

# k-NN with k=8 and Minkowski distance with p=2
knn = KNeighborsClassifier(n_neighbors=8, metric="minkowski", p=2)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, y_pred)
print(f"정확도 : {knn_acc:.2f}")


(104, 4) (45, 4)
(104,) (45,)
(array([0, 1, 2]), array([35, 35, 34], dtype=int64))
(array([0, 1, 2]), array([15, 15, 15], dtype=int64))
정확도 : 0.91


In [5]:
# Experiment 3: classify each wine into one of three classes from its
# features and print the accuracy. This cell loads the dataset and lists
# the names of its feature columns.
wine = load_wine()

print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [6]:
print(wine.data) # input data (features)

[[1.423e+01 1.710e+00 2.430e+00 ... 1.040e+00 3.920e+00 1.065e+03]
 [1.320e+01 1.780e+00 2.140e+00 ... 1.050e+00 3.400e+00 1.050e+03]
 [1.316e+01 2.360e+00 2.670e+00 ... 1.030e+00 3.170e+00 1.185e+03]
 ...
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]
 [1.317e+01 2.590e+00 2.370e+00 ... 6.000e-01 1.620e+00 8.400e+02]
 [1.413e+01 4.100e+00 2.740e+00 ... 6.100e-01 1.600e+00 5.600e+02]]


In [7]:
print(wine.target) # ground-truth class labels

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [8]:
# Wine features as a DataFrame (one column per feature name) with the class
# id appended as a 'target' column.
wine_df = (
    pd.DataFrame(wine.data, columns=wine.feature_names)
    .assign(target=wine.target)
)

wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [9]:
# Rows that exactly repeat an earlier row; an empty result means the wine data has no duplicates.
wine_df[wine_df.duplicated()]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target


In [10]:
# Features are the columns from 'alcohol' through 'proline'; 'target' is the label.
X3 = wine_df.loc[:, 'alcohol':'proline']
y3 = wine_df['target']

# 70/30 train/test split with a fixed seed (no stratification)
X_train, X_test, y_train, y_test = train_test_split(
    X3, y3, test_size=0.3, random_state=42
)

# Shapes of the feature matrices, then of the label vectors (train, test)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# Per-class sample counts in the training and test labels
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Linear-kernel support vector classifier; fit() returns the fitted estimator
svm = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train, y_train)

y_pred = svm.predict(X_test)
svm_acc = accuracy_score(y_test, y_pred)
print(f"정확도: {svm_acc:.2f}")

(124, 13) (54, 13)
(124,) (54,)
(array([0, 1, 2]), array([40, 50, 34], dtype=int64))
(array([0, 1, 2]), array([19, 21, 14], dtype=int64))
정확도: 0.98
