# 3차과제_와인데이터분류: Using SVM

## **1. 패키지 Import**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## **2. 데이터 불러오기**

In [2]:
# Read the semicolon-separated wine CSV into a DataFrame.
wine_data = pd.read_csv('wine.csv', sep=';')

# Print the first five rows as a quick check that the columns parsed correctly.
print(wine_data.head(n=5))

FileNotFoundError: data-02_1var.csv not found.

## **3. 학습/검증 데이터 분리 및 특징 선택**

In [None]:
# Import from the public package path: `_split` is a private module and its
# location is not part of scikit-learn's supported API.
from sklearn.model_selection import train_test_split

In [None]:
# Two predictor columns are used as model inputs; the 'quality' column is the
# classification target.
features = ['alcohol', 'volatile acidity']
X = wine_data.loc[:, features]
y = wine_data.loc[:, 'quality']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Print the shape of each of the four resulting pieces on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

## **4. 스케일링**

### **표준화: StandardScaler**

* 각 특징을 `평균이 0`, `분산이 1`이 되도록 변환 (분포의 형태 자체를 `가우시안 정규 분포`로 바꾸는 것은 아니며, 특징이 정규 분포에 가까울 때 특히 효과적)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the mean/variance from the training split only, then apply those same
# statistics to the test split. Calling fit_transform on the test data would
# re-fit the scaler on it, leaking test information and putting the two splits
# on different scales.
ss_scaler = StandardScaler()
X_train_ss = ss_scaler.fit_transform(X_train)
X_test_ss = ss_scaler.transform(X_test)

* 데이터 선택: X_train/X_test, X_train_ss/X_test_ss

In [None]:
# From here on, train and evaluate on the standardized features.
X_train, X_test = X_train_ss, X_test_ss

In [None]:
from sklearn.svm import SVC

In [None]:
# Baseline: an SVC created with all-default hyperparameters and fitted on the
# training split.
model = SVC()
model.fit(X_train, y_train)

* **모델의 하이퍼 파라미터 불러오기**

In [None]:
# Show the hyperparameter settings currently held by the model.
model.get_params()

* **하이퍼 파라미터 튜닝**

In [None]:
# Hand-picked hyperparameters; the trailing comments list alternatives to try.
model = SVC(
    kernel='rbf',   # e.g. 'linear', 'poly', 'rbf'
    C=1,            # e.g. 0.1, 1, 10, 100
    gamma=10,       # e.g. 0.1, 1, 10, 100
    # degree=3,     # e.g. 2 to 5
)

In [None]:
# Train the manually configured SVC on the training split.
model.fit(X_train, y_train)

* **GridSearch를 통한 하이퍼파라미터 튜닝**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for the search: a reduced set of C and gamma values and a
# single kernel.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[0.1, 1, 10, 100],
    kernel=['rbf'],
)

# Grid search over param_grid with 2-fold cross-validation; refit=True keeps an
# estimator refitted with the best combination found.
grid_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, refit=True)

In [None]:
# Run the grid search on the training split.
grid_model.fit(X_train, y_train)

In [None]:
# Keep the estimator trained with the best hyperparameters found by the grid
# search, and print that winning combination.
best_model = grid_model.best_estimator_
print(grid_model.best_params_)

## **6. 모델 평가하기**

* 모델 선택:  best_model

In [None]:
# Evaluate the grid-search winner in the cells that follow.
model = best_model

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predict labels for the test split and print the resulting accuracy score.
y_pred = model.predict(X_test)
print(f'분류 정확도: {accuracy_score(y_test, y_pred)}')

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Collect every class that occurs in either the ground truth or the
# predictions. Deriving the names from y_test alone breaks the report when the
# model predicts a class that is absent from the test split, because the number
# of target names then no longer matches the number of reported classes.
quality_labels = sorted(set(y_test) | set(y_pred))

# Pass the labels explicitly so each display name is tied to its own class.
print(classification_report(
    y_test,
    y_pred,
    labels=quality_labels,
    target_names=[f'Quality {label}' for label in quality_labels],
))


## **7. 시각화**

> **주의**
> * 2차원으로 시각화하기 때문에, 특징 2개만 사용하여 학습한 모델에만 사용할 것

In [None]:
# Sanity check before plotting: shapes of both splits and the set of distinct
# target values.
print("X_train shape:", np.shape(X_train))
print("X_test shape:", np.shape(X_test))
print("Unique classes in y:", np.unique(np.asarray(y)))


In [None]:
from matplotlib.colors import ListedColormap

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Text used to annotate the decision-boundary plot.
title, xlabel, ylabel = 'SVM Decision Boundary', 'Alcohol', 'Volatile Acidity'

# Make figures 10 x 10 by default.
plt.rcParams.update({'figure.figsize': (10, 10)})

def visualize_boundary(model, X, y, title, xlabel, ylabel):
    """Draw labelled samples over the model's decision regions on the current axes.

    Class labels are converted to consecutive indices (0, 1, ...) before
    plotting, so the filled regions and the scatter points share one colour
    scale regardless of the actual label values (e.g. scores that do not start
    at 0).

    Args:
        model: Fitted classifier whose ``predict`` accepts two-column input.
        X: Array-like of shape (n_samples, 2), in the same feature space the
            model was trained on.
        y: Array-like of n_samples class labels used to colour the points.
        title: Plot title.
        xlabel: Label for the x axis (first column of ``X``).
        ylabel: Label for the y axis (second column of ``X``).

    Raises:
        ValueError: If ``X`` is not two-dimensional with exactly two columns.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2 or X.shape[1] != 2:
        raise ValueError("visualize_boundary requires X with exactly two feature columns")

    ax = plt.gca()

    # Plot the samples first: the grid below is built from the axis limits that
    # result from them. Colours are finalised once all classes are known.
    points = ax.scatter(
        X[:, 0], X[:, 1],
        c=np.searchsorted(np.unique(y), y),
        s=25, cmap='rainbow', edgecolor='k', zorder=3,
    )
    ax.axis('tight')
    x_min, x_max = ax.get_xlim()
    y_min, y_max = ax.get_ylim()

    # Evaluate the model on a 200 x 200 grid covering the visible area.
    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, num=200),
        np.linspace(y_min, y_max, num=200),
    )
    predicted = model.predict(np.c_[xx.ravel(), yy.ravel()])

    # Every class seen in the data or predicted on the grid gets one index.
    classes = np.union1d(y, predicted)
    n_classes = len(classes)
    color_min, color_max = -0.5, n_classes - 0.5

    # Re-colour the points with the final class indices and colour range.
    points.set_array(np.searchsorted(classes, y).astype(float))
    points.set_clim(color_min, color_max)

    # One filled band per class index, centred on the index itself.
    Z = np.searchsorted(classes, predicted).reshape(xx.shape)
    ax.contourf(
        xx, yy, Z,
        alpha=0.3,
        levels=np.arange(n_classes + 1) - 0.5,
        cmap='rainbow',
        vmin=color_min, vmax=color_max,
        zorder=1,
    )

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)



In [None]:
# Plot the decision regions together with all samples (training rows followed
# by test rows).
visualize_boundary(
    model,
    np.concatenate([X_train, X_test], axis=0),
    np.concatenate([y_train, y_test]),
    title,
    xlabel,
    ylabel,
)
