# Data pre-processing with SVM basic examples

### Pre-processing
Several steps can be taken to process data before it is modelled; these can include:
* scaling and "standardising" the data, to ensure all input parameters have equal weight, 
* reducing the number of input variables by removing those that provide redundant (highly correlated) information (e.g. Principal Component Analysis - PCA), 
* transforming the data to a space where the modelling step is easier.

This is known as data processing or pre-processing.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import loguniform
from sklearn.decomposition import PCA
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
import pandas as pd

### Linearisation
One of the simplest examples of transforming data is linearisation. For equations where the x value is in the exponent, we can achieve a linear equation via logarithms (which are on some GCSE syllabuses!).
Logarithms are the inverse of an exponentiation operation, i.e.
$$ y = A^x $$
$$ \log_A{A^x} = x $$

Therefore if we have an equation:
$$ y = A^{2x+3} $$ 
We can take a log to give:
$$
\begin{aligned}
\log{y} &= (2x+3)\log{A}\\
\log{y} &= (2\log{A})x+(3\log{A})
\end{aligned}
$$
This is a linear form ($\log{y} = B_0 + B_1x$), with $B_0 = 3\log{A}$ and $B_1 = 2\log{A}$.

There is a special case where A = $e$ (Euler's number, approximately 2.71828): taking the natural logarithm then gives $\ln{y} = 2x+3$ directly.

If x is in the denominator of an equation, e.g.
$$y = \frac{A}{Bx}$$
we can similarly linearise via logs as:
$$ \log{y} = \log{\frac{A}{Bx}}$$

$$ \log{y} = \log{\frac{A}{B}} - \log{x}$$

Finally if we have non-integer powers, we can use logs to linearise:

$$ y = Ax^B $$ 
$$\log{y}=\log{A} + \log{x^B}$$
$$\log{y}=\log{A} + B\log{x}$$




In [None]:
import math


def _plot_raw_vs_linearised(x_raw, y_raw, x_lin, y_lin):
    """Scatter the raw data (left panel) next to its log-transformed form (right panel)."""
    plt.subplot(1, 2, 1)
    plt.scatter(x_raw, y_raw)
    plt.subplot(1, 2, 2)
    plt.scatter(x_lin, y_lin)
    plt.show()


# What is a log? log_b(b**x) recovers x.
# Floating-point round-off means the computed value is not guaranteed to be
# bit-identical to x, so compare with a tolerance rather than with ==.
print(np.isclose(np.log2(2**16), 16))
print(np.isclose(np.log10(10**4.5), 4.5))
# note we use math here, as np doesn't have an arbitrary-base log.
print(math.isclose(math.log(3**7, 3), 7))

# Log identities: log(a/b) = log(a) - log(b) and log(a*b) = log(a) + log(b).
# Each pair of prints below should show the same value.
print(np.log(12.2/6.5))
print(np.log(12.2) - np.log(6.5))
print(np.log(5.6*8.2))
print(np.log(5.6) + np.log(8.2))

M = 50  # number of sample points
error_sigma = .2
xs = np.linspace(0.1, 10, M)

# NOTE(review): this noise is drawn but never added to any of the ys below;
# it is kept so the variables this cell defines are unchanged.
noise = np.random.normal(0, error_sigma, M)

A = 2
B = 2

print("Exponential function: plot x vs log(y) becomes linear - fit with OLS")
ys = np.exp((A*xs+B))
_plot_raw_vs_linearised(xs, ys, xs, np.log(ys))

print("Power equations - general case of above.")
ys = A**(B*xs)
_plot_raw_vs_linearised(xs, ys, xs, np.log(ys))

print("Inverse equations - plot log(x) vs log(y) becomes linear - fit with OLS")
ys = A/(B*xs)
_plot_raw_vs_linearised(xs, ys, np.log(xs), np.log(ys))

print("Non-integer powers plot log(x) vs log(y) becomes linear - fit with OLS")
ys = A*(xs**3.32)
_plot_raw_vs_linearised(xs, ys, np.log(xs), np.log(ys))

## Support Vector Machines

Support Vector Machines (SVM) classify data by finding the points at the edge of classes and positioning a line equidistant between these points. 

They can account for overlapping classes, and work to minimise the errors by ignoring points.

The examples below are simple 1D and then 2D cases; play with the errors on the points to observe how the classification boundaries change.

In [None]:
from matplotlib import ticker

# --- 1D toy problem: two Gaussian clusters on a line -----------------------
N = 20  # number of points per class
error_sigma = 1
good_centre = 5
bad_centre = -5

# Draw the "good" cluster first, then the "bad" one, and stack them as a column.
good_samples = np.random.normal(good_centre, error_sigma, N)
bad_samples = np.random.normal(bad_centre, error_sigma, N)
x = np.concatenate((good_samples, bad_samples)).reshape(-1, 1)
labels = np.concatenate((np.full(N, 1), np.full(N, 0)))
colors = np.concatenate((np.full(N, "green"), np.full(N, "red")))

# All points are drawn at height zero so the data appears on a single line.
baseline = np.zeros(2 * N)
plt.scatter(x, baseline, color=colors)
plt.show()

# --- Fit a linear SVM and locate where its prediction flips ----------------
model = SVC(kernel='linear', C=100)
clf = model.fit(x, labels)

xs = np.linspace(-10, 10, 500).reshape(-1, 1)
ys = clf.predict(xs)

# Walk neighbouring grid points; the last place the predicted label changes
# is taken as the boundary (it stays 0 if the label never changes).
boundary = 0
for previous_label, current_label, grid_point in zip(ys[:-1], ys[1:], xs[1:]):
    if current_label != previous_label:
        boundary = grid_point

# --- Show the data with the support vectors and the boundary ---------------
plt.scatter(x, baseline, color=colors)
plt.vlines(clf.support_vectors_, -0.1, 0.1, color="darkgrey", linestyle="--", label="Support vectors")
plt.vlines(boundary, -0.1, 0.1, color="black", linestyle="--", label="boundary")
plt.legend()
plt.show()


In [None]:
def decision_plot(X, y, clf, x_axis_label="PC1", y_axis_label="PC2",
                  title="Decision surface of linear SVC "):
    """
    Plot a fitted classifier's predicted regions over two features.

    Helper used only to draw the plots in this notebook.
    Based on answer provided by S. Loukas on StackOverflow.
        https://stackoverflow.com/questions/51495819/how-to-plot-svm-decision-boundary-in-sklearn-python

    Parameters
    ----------
    X : array-like with at least two columns
        Samples to scatter; column 0 goes on the x axis and column 1 on the
        y axis. Converted with ``np.asarray`` so positional slicing works
        for any array-like input.
    y : array-like
        Value used to colour each sample in the scatter plot.
    clf : object with a ``predict`` method
        Called on a two-column grid of points covering the plotted area.
    x_axis_label, y_axis_label : str
        Axis labels.
    title : str
        Plot title; defaults to the title this helper has always used.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional with at least two columns.
    """
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] < 2:
        raise ValueError(
            "decision_plot needs a 2D array with at least two columns, "
            "got shape {}".format(X.shape)
        )

    def make_meshgrid(x, y, h=.02):
        # Grid spanning the data with a margin of 1 on every side, step h.
        x_min, x_max = x.min() - 1, x.max() + 1
        y_min, y_max = y.min() - 1, y.max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        return xx, yy

    def plot_contours(ax, clf, xx, yy, **params):
        # Predict a label for every grid point, then draw filled contours.
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        out = ax.contourf(xx, yy, Z, **params)
        return out

    fig, ax = plt.subplots()

    # Set-up grid for plotting.
    X0, X1 = X[:, 0], X[:, 1]
    xx, yy = make_meshgrid(X0, X1)

    # Predicted regions first, then the samples on top of them.
    plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_ylabel(y_axis_label)
    ax.set_xlabel(x_axis_label)
    ax.set_title(title)

    plt.show()

    
'''
    Start of example
'''
# --- 2D toy problem: two Gaussian clusters in the (X1, X2) plane -----------
N = 50  # number of points per class
error_sigma = 3
good_centre = [5, 3]
bad_centre = [-5, -5]

# Draw order matters for the random stream: X1 (good, bad), then X2 (good, bad).
good_x1 = np.random.normal(good_centre[0], error_sigma, N)
bad_x1 = np.random.normal(bad_centre[0], error_sigma, N)
x1 = np.concatenate((good_x1, bad_x1))

good_x2 = np.random.normal(good_centre[1], error_sigma, N)
bad_x2 = np.random.normal(bad_centre[1], error_sigma, N)
x2 = np.concatenate((good_x2, bad_x2))

# One row per sample, one column per coordinate.
X = np.vstack((x1, x2)).T
labels = np.concatenate((np.full(N, 1), np.full(N, 0)))
colors = np.concatenate((np.full(N, "green"), np.full(N, "red")))

plt.scatter(x1, x2, color=colors)
plt.xlabel("X1")
plt.ylabel("X2")
plt.show()

# Fit a linear SVM on both coordinates and draw its decision regions.
model = SVC(kernel='linear', C=100)
clf = model.fit(X, labels)
decision_plot(X, labels, clf, "X1", "X2")

### Body Temperature Example
This is a simple body temperature example, where we have data that is both higher and lower than the normal range.

The data here is therefore not linearly separable. 

We standardise and then square to create a pseudo-dimension which transforms the problem into a linearly separable one.

In [None]:
N = 20  # number of points in each zone, we have 4x this overall.

# generate the data:   34-35.8C (n = 20), 36-38 (n=40), 38.2-40 (n=20)
# Temperatures are drawn as integer tenths of a degree and divided by 10.
# np.random.randint excludes its upper bound, so each bound is one past the
# highest value we want to be able to draw.
t = np.r_[
    np.random.randint(340, 359, N),
    np.random.randint(360, 381, 2*N),
    np.random.randint(382, 401, N),
]/10

# label the data, 0 for ok, and 1 for outside healthy range (below 36 or above 38).
y = np.where((t < 36) | (t > 38), 1.0, 0.0)

# plot on a line as before - the unhealthy points sit on both sides of the
# healthy ones, so a single threshold (linear SVM in 1D) can't separate them!
colors = np.where(y == 1, "red", "green")
plt.scatter(t, np.zeros(4*N), color=colors)
plt.show()

# manual standardise (sample standard deviation, ddof=1)
z = (t-np.mean(t))/np.std(t, ddof=1)

# add pseudo-dimension as a "Kernel trick": both tails get a large z**2
X = np.c_[z, z**2]

# fit an SVM and plot, we set the regularisation parameter C to 100
model = SVC(kernel='linear', C=100)
clf = model.fit(X, y)
# plot to show the classification.
decision_plot(X, y, clf, r"$Temp_{std}$", r"$Temp^2_{std}$")

### Wine Dataset Example
Here we use the wine dataset which we worked through in the slides and complete the example by using an SVM to split the data into 3 classes. The SVM is naturally a binary classifier, so to make it multiclass we need to use either one-vs-one or one-vs-rest over the classes; the scikit-learn implementation (`SVC`) is one-vs-one.

This is based on:
https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html

In [None]:
# Load the wine data and split it into train/test samples.

X, y = load_wine(return_X_y=True, as_frame=True)
target_classes = range(0, 3)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Display names for the classification reports.
class_names = np.char.mod("Class %d", target_classes)
# We keep 2 principal components so that the result can be plotted readily.
n_components = 2

# Run the whole pipeline twice, without and then with standardisation, to compare.
for standardise in [False, True]:
    if not standardise:
        msg = "Without Standardisation"
        scaled_X_train, scaled_X_test = X_train, X_test
    else:
        msg = "With Standardisation"
        # Fit the scaler on the training data only, then apply it to both sets.
        scaler = StandardScaler()
        scaled_X_train = scaler.fit_transform(X_train)
        scaled_X_test = scaler.transform(X_test)
    print("Example ran {}".format(msg))

    # PCA is fitted on the training data and applied to both sets.
    print("PCA")
    pca = PCA(n_components=n_components, svd_solver="randomized", whiten=True)
    pca.fit(scaled_X_train)
    X_train_pca = pca.transform(scaled_X_train)
    X_test_pca = pca.transform(scaled_X_test)

    # Bar chart of how much each original feature contributes to PC1.
    first_pca_component = pd.DataFrame(
        pca.components_[0], index=X.columns, columns=[msg]
    )
    first_pca_component.plot.bar(
        title="Weights of the first principal component", figsize=(6, 8)
    )

    # Fit a linear SVM with a randomised search over C between 1 and 100.
    # To try rbf/poly kernels, also search gamma over a sensible range,
    # e.g. "gamma": loguniform(1e-4, 1e0).
    param_grid = {
        "C": loguniform(1e0, 1e2),
    }
    search = RandomizedSearchCV(
        SVC(kernel="linear", class_weight="balanced"), param_grid, n_iter=10
    )
    clf = search.fit(X_train_pca, y_train)

    # Report the best estimator found by the search.
    print("Best estimator found by grid search:")
    print(clf.best_estimator_)

    # Plot the boundaries between classes in PC1 and PC2 space.
    decision_plot(X_train_pca, y_train, clf)

    # Classification reports on the training data, then on the test data.
    for heading, features, truth in (
        ("Training data", X_train_pca, y_train),
        ("Test data", X_test_pca, y_test),
    ):
        y_pred = clf.predict(features)
        print(heading)
        print(classification_report(truth, y_pred, target_names=class_names))
    print("-------------------")