# SVM ( Support Vector Machine ) 

In [1]:
import pandas

In [2]:
automobile = pandas.read_csv('../1015_Numpy&Pandas/automobile.csv')

In [3]:
automobile.head()

Unnamed: 0,symboling,normalized_losses,maker,fuel,aspiration,doors,body,wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
1,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
2,1,158,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710
3,1,158,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140,5500,17,20,23875
4,2,192,bmw,gas,std,two,sedan,rwd,front,101.2,...,108,mpfi,3.5,2.8,8.8,101,5800,23,29,16430


# 데이터 준비 

In [4]:
variables = ['bore', 'city_mpg', 'compression_ratio', 'curb_weight', 'engine_size',
             'horsepower', 'peak_rpm', 'city_mpg', 'price']
X = automobile[variables]
y = automobile['doors']

# 데이터 분할 

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# 선형 SVM

In [7]:
from sklearn.svm import SVC # SVM중 Classification이라 SVC
from sklearn import metrics

In [8]:
svc = SVC(kernel='linear')

In [9]:
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
y_svc = svc.predict(X_test)

# 모형 평가 

In [13]:
metrics.confusion_matrix(y_test, y_svc)

array([[21, 18],
       [11, 14]])

In [16]:
metrics.accuracy_score(y_test, y_svc)

0.546875

In [18]:
metrics.precision_score(y_test, y_svc, pos_label='four')

0.65625

In [19]:
metrics.recall_score(y_test, y_svc, pos_label='four')

0.53846153846153844

In [20]:
metrics.f1_score(y_test, y_svc, pos_label='four')

0.59154929577464788

# 페널티 조정

In [21]:
svc2 = SVC(kernel='linear', C=0.1) # 기본이 1, 10배씩 조절해보면된다.

In [22]:
svc2.fit(X_train, y_train)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
y_svc2 = svc2.predict(X_test)

# 모형 평가

In [24]:
metrics.confusion_matrix(y_test, y_svc2)

array([[24, 15],
       [ 9, 16]])

In [25]:
metrics.accuracy_score(y_test, y_svc2)

0.625

In [26]:
metrics.precision_score(y_test, y_svc2, pos_label='four')

0.72727272727272729

In [27]:
metrics.recall_score(y_test, y_svc2, pos_label='four')

0.61538461538461542

In [29]:
metrics.f1_score(y_test, y_svc2, pos_label='four')

0.66666666666666674

# 같은 패턴이 반복되니 함수로 만듭시다

In [35]:
def run_model(kernel, penalty):
    model = SVC(kernel=kernel, C=penalty)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print('confusion matrix : \n',metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : ',metrics.accuracy_score(y_test, y_pred))
    print('precision : ',metrics.precision_score(y_test, y_pred, pos_label='four'))
    print('recall : ',metrics.recall_score(y_test, y_pred, pos_label='four'))
    print('F1 : ',metrics.f1_score(y_test, y_pred, pos_label='four'))
    return model

In [36]:
svc3 = run_model('linear', 10)

confusion matrix : 
 [[22 17]
 [11 14]]
accuracy :  0.5625
precision :  0.666666666667
recall :  0.564102564103
F1 :  0.611111111111


In [44]:
penal = {0.01,0.1,1,10,100}
print(penal)

{0.1, 1, 10, 100, 0.01}


In [45]:
for i in penal:
    run_model('linear', i)

confusion matrix : 
 [[24 15]
 [ 9 16]]
accuracy :  0.625
precision :  0.727272727273
recall :  0.615384615385
F1 :  0.666666666667
confusion matrix : 
 [[21 18]
 [11 14]]
accuracy :  0.546875
precision :  0.65625
recall :  0.538461538462
F1 :  0.591549295775
confusion matrix : 
 [[22 17]
 [11 14]]
accuracy :  0.5625
precision :  0.666666666667
recall :  0.564102564103
F1 :  0.611111111111
confusion matrix : 
 [[24 15]
 [11 14]]
accuracy :  0.59375
precision :  0.685714285714
recall :  0.615384615385
F1 :  0.648648648649
confusion matrix : 
 [[24 15]
 [10 15]]
accuracy :  0.609375
precision :  0.705882352941
recall :  0.615384615385
F1 :  0.657534246575


# SVM + RBF 커널

In [51]:
svc_rbf = run_model('rbf', 1)

confusion matrix : 
 [[39  0]
 [24  1]]
accuracy :  0.625
precision :  0.619047619048
recall :  1.0
F1 :  0.764705882353


In [52]:
def run_model(kernel, penalty,gamma='auto'):
    model = SVC(kernel=kernel, C=penalty)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print('confusion matrix : \n',metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : ',metrics.accuracy_score(y_test, y_pred))
    print('precision : ',metrics.precision_score(y_test, y_pred, pos_label='four'))
    print('recall : ',metrics.recall_score(y_test, y_pred, pos_label='four'))
    print('F1 : ',metrics.f1_score(y_test, y_pred, pos_label='four'))
    return model

In [55]:
svc_rbf2 = run_model('rbf', 100, gamma=0.000001)

confusion matrix : 
 [[39  0]
 [24  1]]
accuracy :  0.625
precision :  0.619047619048
recall :  1.0
F1 :  0.764705882353


# 수제(?) 커널
독립 변수 중에 이산 변수만 골라낸다
 - 이제까지 연속변수만 사용하였다.

In [56]:
for v in automobile.columns:
    print(v)

symboling
normalized_losses
maker
fuel
aspiration
doors
body
wheels
engine_location
wheel_base
length
width
height
curb_weight
engine_type
cylinders
engine_size
fuel_system
bore
stroke
compression_ratio
horsepower
peak_rpm
city_mpg
highway_mpg
price


In [62]:
discrete = []
for v in automobile.columns:
    print(automobile[v].dtype)
    if automobile[v].dtype == object:
        discrete.append(v)

int64
int64
object
object
object
object
object
object
object
float64
float64
float64
float64
int64
object
object
int64
object
float64
float64
float64
int64
int64
int64
int64
int64


In [63]:
discrete

['maker',
 'fuel',
 'aspiration',
 'doors',
 'body',
 'wheels',
 'engine_location',
 'engine_type',
 'cylinders',
 'fuel_system']

위의 표현은 아래처럼 간략히 쓸 수도 있음

In [58]:
discrete = [v for v in automobile.columns if automobile[v].dtype == object]

In [60]:
data = automobile[discrete]
data.head()

Unnamed: 0,maker,fuel,aspiration,doors,body,wheels,engine_location,engine_type,cylinders,fuel_system
0,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
1,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
2,audi,gas,std,four,sedan,fwd,front,ohc,five,mpfi
3,audi,gas,turbo,four,sedan,fwd,front,ohc,five,mpfi
4,bmw,gas,std,two,sedan,rwd,front,ohc,four,mpfi


In [61]:
X = data[[x for x in discrete if x != 'doors']]
y = automobile['doors']

# 커널 만들기
9가지 변수 중에 같은 변수의 갯수를 센다

In [64]:
data.columns # 아래에서는 메이커와 함수 

Index(['maker', 'fuel', 'aspiration', 'doors', 'body', 'wheels',
       'engine_location', 'engine_type', 'cylinders', 'fuel_system'],
      dtype='object')

In [66]:
same = 0
# 두개의 열을 비교 zip으로 list형성 이후 a,b에 넣는다 
for a, b in zip(data.ix[0,:], data.ix[1,:]):
    if a == b:
        same = same + 1

same

8

In [67]:
def hand_made_kernel(d1, d2):
    same = 0
    for a, b in zip(d1, d2):
        if a == b:
            same = same + 1
    return same

# 데이터 분할

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [69]:
row_train, _ = X_train.shape

# 커널 적용

In [70]:
import numpy

In [72]:
P_train = numpy.zeros((row_train, row_train))

In [73]:
for i in range(row_train):
    for j in range(row_train):
        P_train[i, j] = hand_made_kernel(X_train.iloc[i,:], X_train.iloc[j,:])

In [74]:
P_train

array([[ 9.,  5.,  8., ...,  6.,  6.,  3.],
       [ 5.,  9.,  6., ...,  4.,  7.,  6.],
       [ 8.,  6.,  9., ...,  6.,  6.,  3.],
       ..., 
       [ 6.,  4.,  6., ...,  9.,  5.,  2.],
       [ 6.,  7.,  6., ...,  5.,  9.,  5.],
       [ 3.,  6.,  3., ...,  2.,  5.,  9.]])

# 훈련