# Support Vector Machine

## 1. Lib

In [1]:
import copy
import pandas as pd
import numpy as np
import math
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

## 2. Dataset

In [2]:
data = load_breast_cancer()
X = data.data
y = data.target

In [3]:
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
print(X.shape)
print(y.shape)

(569, 30)
(569,)


In [5]:
def correct_targets(targets):
    new_targets = copy.copy(targets)
    new_targets[np.where(targets > 0)] = 1
    new_targets[np.where(targets <= 0)] = -1
    return new_targets.reshape((new_targets.shape[0],1))    

In [6]:
y = correct_targets(y)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

## 3. Problem definition
### 3.1. Ogólnie
Zadanie polega na znalezieniu funkcji $$ f(x)={w^{T}x+b}$$, która tworzy hiperpłaszczyznę zapewniającą klasyfikację (dopuszczającą pomyłki) z użyciem maszyny wektorów nośnych SVM. Otrzymana funkcja powinna zapewniać jak najmniejszą liczbę pomyłek przy klasyfikowaniu elementów zbioru BREAST CANCER do odpowiedniej klasy.

Klasyfikacja odbywa się poprzez zwrócenie dla danego zestawu cech $x$ grupy $y(x) = -1 \lor y(x) = 1$, do której należy za pomocą funkcji:

$$
y(x) =
{
\left\{
\begin{array}{ll}
-1 & \textrm{, $f(x) \leq 0$}\\
1 & \textrm{, $f(x) > 0$}
\end{array}
\right
}
$$


Aby otrzymać funkcję $f(x)$ należy znaleźć parametry $w$ i $b$, które minimalizują funkcję straty:
$$ J(w,b)=\Sigma_i(max(1-f(x_i)y_i, 0)) + \lambda*||w||^2 $$

Aby zoptymalizować owe parametry, zastosowana zostanie metoda gradientu prostego, w tym celu potrzebny będzie gradient funkcji $J(w,b)$:
$$
\nabla J =
\begin{bmatrix}
    \partial J \over \partial w_1 \\
    \vdots \\
    \partial J \over \partial w_n \\
    \partial J \over \partial b \\
\end{bmatrix}
$$



Natomiast pochodne cząstkowe te prezentują się następująco:
$$
{\partial J \over \partial w_i}=
{\lambda*2*w_i} + \Sigma_k(1 \cdot
{\left\{ \begin{array}{ll}
0 & \textrm{, $ 1-f(x_k)y_k \leq 0$ }\\
-y_k \cdot x_{k[i]} & \textrm{, $ 1-f(x_k)y_k > 0$}
\end{array}\right })
$$


$$
{\partial J \over \partial b}=
\Sigma_k (1 * {
\left\{ \begin{array}{ll}
0 & \textrm{, $ 1-f(x_k) \cdot y_k \leq 0$ }\\
y_k & \textrm{, $ 1-f(x_k) \cdot y_k > 0$}
\end{array}\right
}
)
$$



### 3.2. Functions to train:
+ funkcja **f(x)**:

In [8]:
def f(x, params):
    b = params[-1,:]
    W = params[:-1,:]
    return np.dot(x,W) + b

+ gradient **J(w, b)**:

In [9]:
def grad_j(params, set_xs, set_ys, lambd, function_f):
    # numpy array of partial derivatives
    b = params[-1,:]
    W = params[:-1,:]
    partials = np.zeros_like(params, dtype=np.float64)

    # counting gradients for w1, w2, ..., wn
    distances = 1 - np.multiply(set_ys, function_f(set_xs, params))
    distances = distances.reshape((distances.shape[0],))
    # sum = 2 * λ * wi + (0 or iyx) over all samples
    x_w_part = np.zeros_like(set_xs)
    x_w_part[np.where(distances>0)] -= (set_ys*set_xs)[np.where(distances>0)]
    partials[:-1,:] = 2*lambd*W + np.sum(x_w_part,axis=0).reshape(partials[:-1,:].shape)

    # counting gradient for b
    x_b_part = np.zeros_like(set_ys, dtype=np.float64)
    x_b_part[np.where(distances>0)] += set_ys[np.where(distances>0)]
    partials[-1,:] = np.sum(x_b_part, axis=0)

    return partials

+ algorytm realizujący metodę gradientu prostego:

In [10]:
def gradient_descent(function_f, gradient_f, params, beta, set_xs, set_ys, lambd, max_steps=10000, min_epsilon = 1e-20):
    new_param = params
    act_step = 0
    while 1:
        act_gradient = gradient_f(new_param, set_xs, set_ys, lambd, function_f)
        if np.linalg.norm(act_gradient) < min_epsilon or act_step > max_steps:
            return new_param
        new_param = new_param - beta * act_gradient
        act_step += 1

### 3.2. Functions to evaluate:
+ funkcja **y(x)**:

In [11]:
def classify_y(x, function_f, params):
    return 2*(function_f(x, params) > 0) - 1

## 4. Train & test

+ dodatkowo zostaną zdefiniowane funkcje: trenujące model (**train_model()**) oraz wykonujące walidacje dla hiperparametru lambda(**validate_model()**)
+ a także zostanie zdefiniowany zbiór możliwych lambd **lambdas = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5]**:

In [12]:
lambdas = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5]

def train_model(model0, training_set_x, training_set_y, param_lambda):
    return gradient_descent(f, grad_j, model0, 0.01, training_set_x, training_set_y, param_lambda)

def validate_model(training_set_x, training_set_y, validating_set_x, validating_set_y):
    best_model = None
    best_lambda = None
    best_score = - math.inf

    for param_lambda in lambdas:
        model0 = np.zeros(shape=(training_set_x.shape[1] + 1,1), dtype=np.float64)
        current_model = train_model(model0, training_set_x, training_set_y, param_lambda)
        # results_validating = np.zeros(len(validating_set_y), dtype='int')
        results_validating = classify_y(validating_set_x, f, current_model)
        n_of_successes = 0
        for x, y in zip(results_validating, validating_set_y):
            if x == y:
                n_of_successes += 1
        print(f"Validating model with lambda: {param_lambda} gave score: {n_of_successes / len(results_validating)}")
        # as long as new score is not worse than actual best, lambda should be maximized
        if (n_of_successes / len(results_validating) >= best_score):      
            best_score = n_of_successes / len(results_validating)
            best_lambda = param_lambda
            best_model = current_model
    print(f"Best lambda for this validation equals: {best_lambda} with score: {best_score}")
    return best_model

In [13]:
model = validate_model(X_train, y_train, X_test, y_test)

Validating model with lambda: 0.0001 gave score: 0.9418604651162791


KeyboardInterrupt: 

In [14]:
pd.DataFrame(y_test).value_counts()

 1    50
-1    36
dtype: int64

In [15]:
model

array([[ 2.39605746e+03],
       [-6.71778034e+02],
       [ 1.15054477e+04],
       [ 2.56168915e+03],
       [ 4.27113273e+00],
       [-8.49543611e+01],
       [-1.50519592e+02],
       [-5.74942435e+01],
       [ 3.79854269e+00],
       [ 8.77068016e+00],
       [ 1.17598571e+01],
       [-2.10088355e+01],
       [-3.89976153e+02],
       [-6.20431836e+03],
       [-1.46888398e+00],
       [-2.47697882e+01],
       [-3.50312999e+01],
       [-7.06221046e+00],
       [-3.84809543e+00],
       [-1.51190488e+00],
       [ 2.57466856e+03],
       [-2.42793558e+03],
       [ 9.03459914e+03],
       [-4.20244745e+03],
       [-8.05246570e+00],
       [-3.22491205e+02],
       [-4.42874013e+02],
       [-1.07890628e+02],
       [-4.48809547e+01],
       [-1.43400794e+01],
       [-5.72050000e+02]])

In [16]:
# def get_success_percent(model_results, official_results):
#     sum = 0
#     for x, y in zip(model_results, official_results):
#         if x==y:
#             sum+=1
#     fraction = sum / len(model_results)
#     print(f"Success percent: {100*fraction}%")
#
# get_success_percent(results_testing_41, testing_setosa_versicolor_y)

In [17]:
model0 = np.zeros(shape=(31,1), dtype=np.float64)
current_model = train_model(model0, X_train, y_train, 0.0005)

In [18]:
f(X_test, current_model)

array([[  650132.73644149],
       [ 1228359.14425789],
       [ 1465126.85839114],
       [ 1117732.36169439],
       [ 1522802.58091279],
       [ 1075378.79974308],
       [ 1431809.98140495],
       [ 1229910.60180833],
       [  581453.82015347],
       [ 1138862.90043798],
       [ 1255161.30610527],
       [ 1189930.18371181],
       [ 1067112.55510621],
       [ 1382129.75905942],
       [ 1230254.91771583],
       [  610998.5481445 ],
       [  917917.11408273],
       [  248890.64002792],
       [ 1677704.36092897],
       [ -455434.09855369],
       [  -89938.18479008],
       [ 1314594.42608711],
       [ 1358694.95737635],
       [ 1039736.58862855],
       [ 2134567.8611222 ],
       [  980927.53699214],
       [  982668.353319  ],
       [ 1417924.12699632],
       [ 1308881.68733822],
       [ -878463.05643089],
       [ 1359951.00553518],
       [  -55241.04085472],
       [ 1097822.70425327],
       [  612472.80943222],
       [ 1042583.63168814],
       [  124427.814

In [19]:
np.sum(classify_y(X_test, f, current_model) == y_test) / y_test.shape[0]

0.6627906976744186