# SVMで分類
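説明変数は `island`（one-hot で3列）と数値4変数（`bill_length_mm`, `bill_depth_mm`, `flipper_length_mm`, `body_mass_g`）の計7列を使う（`sex` は使わない）。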

<img src='./スクリーンショット 2023-10-19 15.28.25.png'>

<img src='./スクリーンショット 2023-10-19 15.17.13.png'>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Load the penguins example dataset through seaborn.
data = sns.load_dataset("penguins")

# Quick overview: dimensions, leading rows, column dtypes, and the number of
# missing values in each column.
print(data.shape)
display(data.head(5))
print(data.dtypes)
print('欠損値:\n', data.isna().sum())


(344, 7)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object
欠損値:
 species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64


In [3]:
# Drop every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

# Re-inspect the frame to confirm that no missing values remain.
print(data.shape)
display(data.head(5))
print(data.dtypes)
print(data.isna().sum())


(333, 7)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


species               object
island                object
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
sex                   object
dtype: object
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [4]:
# Encode the target: map each species name to an integer class label
# (label encoding, not one-hot). One explicit mapping replaces three chained
# `replace` calls, which depend on implicit object->int downcasting that
# pandas 2.2+ deprecates.
species_codes = {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}
species_encoded = data['species'].map(species_codes)

# `map` yields NaN for any value absent from the mapping (an unexpected species
# name, or this cell being re-run on already-encoded data). Fail loudly instead
# of handing NaN labels to the classifier.
if species_encoded.isna().any():
    unknown = sorted(data.loc[species_encoded.isna(), 'species'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'species': {unknown}")

# `assign` builds a new frame rather than writing a column into the result of
# `dropna()`, which can raise SettingWithCopyWarning on some pandas versions.
# One-hot encode `island`; `dtype=int` keeps the indicator columns as 0/1 on
# every pandas version (pandas 2.0+ would otherwise produce booleans).
data = pd.get_dummies(
    data.assign(species=species_encoded.astype(int)),
    columns=['island'],
    dtype=int,
)


In [5]:
# Inspect the encoded frame: its dimensions, then the first ten rows.
print(data.shape)
display(data.head(n=10))


(333, 9)


Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,island_Biscoe,island_Dream,island_Torgersen
0,0,39.1,18.7,181.0,3750.0,Male,0,0,1
1,0,39.5,17.4,186.0,3800.0,Female,0,0,1
2,0,40.3,18.0,195.0,3250.0,Female,0,0,1
4,0,36.7,19.3,193.0,3450.0,Female,0,0,1
5,0,39.3,20.6,190.0,3650.0,Male,0,0,1
6,0,38.9,17.8,181.0,3625.0,Female,0,0,1
7,0,39.2,19.6,195.0,4675.0,Male,0,0,1
12,0,41.1,17.6,182.0,3200.0,Female,0,0,1
13,0,38.6,21.2,191.0,3800.0,Male,0,0,1
14,0,34.6,21.1,198.0,4400.0,Male,0,0,1


# 訓練データとテストデータに分割

In [7]:
# Target column and the feature columns: the three one-hot island indicators
# followed by the four numeric body measurements.
target = 'species'
features = [
    'island_Torgersen',
    'island_Dream',
    'island_Biscoe',
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Feature matrix and label vector.
X = data.loc[:, features]
y = data.loc[:, target]


In [8]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


## 標準化

In [9]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test set does not influence the fit.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)


# SVCで学習

In [19]:
from sklearn.svm import SVC

# Linear-kernel SVM; C controls how strictly the soft margin penalizes
# misclassified training points.
# NOTE(review): per the scikit-learn SVC documentation, random_state is only
# used when probability=True, so it likely has no effect here — confirm.
svm = SVC(kernel='linear', C=1.0, random_state=1)
svm.fit(X_train_std, y_train)  # train on the standardized training features


## テストデータの予測

In [20]:
# Predict class labels for the standardized test features with the fitted SVM.
y_pred_svm = svm.predict(X_test_std)


## モデルの評価

In [22]:
# Evaluate classification quality on the held-out test set.
from sklearn.metrics import confusion_matrix as CM
from sklearn.metrics import classification_report as CR

# Confusion matrix: rows are the true labels, columns the predicted labels.
print(CM(y_test, y_pred_svm))

# Per-class precision / recall / F1 together with overall accuracy.
print(CR(y_test, y_pred_svm))


[[48  0  0]
 [ 0 18  0]
 [ 0  0 34]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       1.00      1.00      1.00        18
           2       1.00      1.00      1.00        34

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

