In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC

# Load Data

In [3]:
df = pd.read_csv("data/ionosphere.csv")
df.head()

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,...,var25,var26,var27,var28,var29,var30,var31,var32,var33,class
0,1,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,0.85243,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,0.50874,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,0.73082,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,0.52798,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 34 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   var1    351 non-null    int64  
 1   var2    351 non-null    float64
 2   var3    351 non-null    float64
 3   var4    351 non-null    float64
 4   var5    351 non-null    float64
 5   var6    351 non-null    float64
 6   var7    351 non-null    float64
 7   var8    351 non-null    float64
 8   var9    351 non-null    float64
 9   var10   351 non-null    float64
 10  var11   351 non-null    float64
 11  var12   351 non-null    float64
 12  var13   351 non-null    float64
 13  var14   351 non-null    float64
 14  var15   351 non-null    float64
 15  var16   351 non-null    float64
 16  var17   351 non-null    float64
 17  var18   351 non-null    float64
 18  var19   351 non-null    float64
 19  var20   351 non-null    float64
 20  var21   351 non-null    float64
 21  var22   351 non-null    float64
 22  va

In [5]:
df["class"] = df["class"].apply(lambda x: 1 if x=="g" else 0)
df["class"] = df["class"].astype("category")
df.head()

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8,var9,var10,...,var25,var26,var27,var28,var29,var30,var31,var32,var33,class
0,1,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,0.85243,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,1
1,1,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,0.50874,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,0
2,1,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,0.73082,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,1
3,1,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,0
4,1,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,0.52798,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 34 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   var1    351 non-null    int64   
 1   var2    351 non-null    float64 
 2   var3    351 non-null    float64 
 3   var4    351 non-null    float64 
 4   var5    351 non-null    float64 
 5   var6    351 non-null    float64 
 6   var7    351 non-null    float64 
 7   var8    351 non-null    float64 
 8   var9    351 non-null    float64 
 9   var10   351 non-null    float64 
 10  var11   351 non-null    float64 
 11  var12   351 non-null    float64 
 12  var13   351 non-null    float64 
 13  var14   351 non-null    float64 
 14  var15   351 non-null    float64 
 15  var16   351 non-null    float64 
 16  var17   351 non-null    float64 
 17  var18   351 non-null    float64 
 18  var19   351 non-null    float64 
 19  var20   351 non-null    float64 
 20  var21   351 non-null    float64 
 21  var22   351 non-

In [7]:
X = df.drop(["class"], axis=1)
y = df["class"]
print(X.shape, y.shape)

(351, 33) (351,)


# Split Data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(245, 33) (245,)
(106, 33) (106,)


# (a)

## Linear SVM

In [9]:
# =================train model================
# train
linear_svm = LinearSVC(random_state=42) # default C=1.0
linear_svm.fit(X_train, y_train)

# predict
linear_svm_y_train = linear_svm.predict(X_train)
linear_svm_y_test = linear_svm.predict(X_test)

# ==================Accuracy==================
linear_svm_train_acc = accuracy_score(y_train, linear_svm_y_train)
linear_svm_test_acc = accuracy_score(y_test, linear_svm_y_test)
# ============================================
print(f"Accuracy of Linear SVM train : {round((linear_svm_train_acc)*100, 2)}")
print(f"Accuracy of Linear SVM test  : {round((linear_svm_test_acc)*100, 2)}")

Accuracy of Linear SVM train : 92.65
Accuracy of Linear SVM test  : 87.74


### Hyper-parameter Tuning using CV

In [10]:
# 하이퍼파라미터 그리드 정의
param_grid = {'C': [0.1, 0.5, 1, 5, 10, 50, 100]}

# 그리드 서치 객체 생성 및 학습
linear_svm_grid_search = GridSearchCV(LinearSVC(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0)
linear_svm_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LinearSVC(random_state=42),
             param_grid={'C': [0.1, 0.5, 1, 5, 10, 50, 100]},
             scoring='neg_mean_squared_error')

In [11]:
linear_svm_grid_search.best_params_

{'C': 5}

### Best Model

In [12]:
# =================train model================
# train
best_linear_svm = LinearSVC(random_state=42, **linear_svm_grid_search.best_params_)
best_linear_svm.fit(X_train, y_train)

# predict
best_linear_svm_y_train = best_linear_svm.predict(X_train)
best_linear_svm_y_test = best_linear_svm.predict(X_test)

# ==================Accuracy==================
best_linear_svm_train_acc = accuracy_score(y_train, best_linear_svm_y_train)
best_linear_svm_test_acc = accuracy_score(y_test, best_linear_svm_y_test)
# ============================================
print(f"Accuracy of Best Linear SVM train : {round((best_linear_svm_train_acc)*100, 2)}")
print(f"Accuracy of Best Linear SVM test  : {round((best_linear_svm_test_acc)*100, 2)}")

Accuracy of Best Linear SVM train : 94.69
Accuracy of Best Linear SVM test  : 88.68


## Radial SVM

In [13]:
# =================train model================
# train
radial_svm = SVC(random_state=42) # default C=1.0  / kernel="rbf"
radial_svm.fit(X_train, y_train)

# predict
radial_svm_y_train = radial_svm.predict(X_train)
radial_svm_y_test = radial_svm.predict(X_test)

# ==================Accuracy==================
radial_svm_train_acc = accuracy_score(y_train, radial_svm_y_train)
radial_svm_test_acc = accuracy_score(y_test, radial_svm_y_test)
# ============================================
print(f"Accuracy of Radial SVM train : {round((radial_svm_train_acc)*100, 2)}")
print(f"Accuracy of Radial SVM test  : {round((radial_svm_test_acc)*100, 2)}")

Accuracy of Radial SVM train : 96.33
Accuracy of Radial SVM test  : 96.23


### 하이퍼파라미터 그리드 정의

In [14]:
param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2.5, 3, 3.5, 4, 5, 10, 25, 50, 100]}

# 그리드 서치 객체 생성 및 학습
radial_svm_grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0)
radial_svm_grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(random_state=42),
             param_grid={'C': [0.01, 0.05, 0.1, 0.5, 1, 2.5, 3, 3.5, 4, 5, 10,
                               25, 50, 100]},
             scoring='neg_mean_squared_error')

In [15]:
radial_svm_grid_search.best_params_

{'C': 2.5}

### Best Model

In [16]:
# =================train model================
# train
best_radial_svm = SVC(random_state=42, **radial_svm_grid_search.best_params_)
best_radial_svm.fit(X_train, y_train)

# predict
best_radial_svm_y_train = best_radial_svm.predict(X_train)
best_radial_svm_y_test = best_radial_svm.predict(X_test)

# ==================Accuracy==================
best_radial_svm_train_acc = accuracy_score(y_train, best_radial_svm_y_train)
best_radial_svm_test_acc = accuracy_score(y_test, best_radial_svm_y_test)
# ============================================
print(f"Accuracy of Best Radial SVM train : {round((best_radial_svm_train_acc)*100, 2)}")
print(f"Accuracy of Best Radial SVM test  : {round((best_radial_svm_test_acc)*100, 2)}")

Accuracy of Best Radial SVM train : 97.55
Accuracy of Best Radial SVM test  : 95.28


# (b)

In [17]:
# =================train model================
# train
boosting = GradientBoostingClassifier(random_state=42)
boosting.fit(X_train, y_train)

# predict
boosting_y_train = boosting.predict(X_train)
boosting_y_test = boosting.predict(X_test)

# ==================Accuracy==================
boosting_train_acc = accuracy_score(y_train, boosting_y_train)
boosting_test_acc = accuracy_score(y_test, boosting_y_test)
# ============================================
print(f"Accuracy of Boosting train : {round((boosting_train_acc)*100, 2)}")
print(f"Accuracy of Boosting SVM test  : {round((boosting_test_acc)*100, 2)}")

Accuracy of Boosting train : 100.0
Accuracy of Boosting SVM test  : 95.28


# (c)

In [19]:
print("=================Accuracy===============")
print("Model\t\t\t| Train\t| Test")
print("------------------------|-------|-------")
print(f"Linear SVM\t\t| {round(linear_svm_train_acc * 100, 2)}\t| {round(linear_svm_test_acc * 100, 2)}")
print(f"Linear SVM (Tuned)\t| {round(best_linear_svm_train_acc * 100, 2)}\t| {round(best_linear_svm_test_acc * 100, 2)}")
print("........................|.......|.......")
print(f"Radial SVM\t\t| {round(radial_svm_train_acc * 100, 2)}\t| {round(radial_svm_test_acc * 100, 2)}")
print(f"Radial SVM (Tuned)\t| {round(best_radial_svm_test_acc * 100, 2)}\t| {round(best_radial_svm_test_acc * 100, 2)}")
print("........................|.......|.......")
print(f"Gradient Boosting\t| {round(boosting_train_acc * 100, 2)}\t| {round(boosting_test_acc * 100, 2)}")
print("========================================")

Model			| Train	| Test
------------------------|-------|-------
Linear SVM		| 92.65	| 87.74
Linear SVM (Tuned)	| 94.69	| 88.68
........................|.......|.......
Radial SVM		| 96.33	| 96.23
Radial SVM (Tuned)	| 95.28	| 95.28
........................|.......|.......
Gradient Boosting	| 100.0	| 95.28
