# **SUPPORT VECTOR MACHINE IMPLEMENTATION**
**Dataset:** Statlog Landsat Satellite
--

***Spyrakis Angelos***, *ECE AUTh (9352)*

---
Table of Contents:

**1. Nearest-Neighbors & Nearest Centroid**

**2. Support Vector Machine**
>
>2.1. Linear SVM
>
>2.2. Polynomial SVM
>
>2.3. RBF SVM
> 
>2.4. Implementation with OVO


---
Execute the following cell to download the dataset.

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst

# 1. Nearest-Neighbors & Nearest Centroid



In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid


def load_landsat():
    """Read the Statlog Landsat Satellite train/test splits from the current directory.

    Expects ``./sat.trn`` and ``./sat.tst`` as space-separated files without
    a header row. Columns 0-35 are used as features and column 36 as the
    class label.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature frames are
        cast to float32 and divided by 255; the label columns are returned
        exactly as read.
    """
    raw_train, raw_test = (
        pd.read_csv(path, sep=' ', header=None)
        for path in ('./sat.trn', './sat.tst')
    )

    def _features_and_labels(frame):
        # .loc slices by column label, so 0:35 includes both end points.
        # Dividing by 255 rescales the features (assumes 8-bit pixel values).
        scaled = frame.loc[:, 0:35].astype('float32') / 255.0
        return scaled, frame.loc[:, 36]

    X_train, y_train = _features_and_labels(raw_train)
    X_test, y_test = _features_and_labels(raw_test)
    return X_train, X_test, y_train, y_test


def knn(k, X_train, X_test, y_train, y_test):
    """Train a k-nearest-neighbours classifier and print its test accuracy.

    Args:
        k: Number of neighbours that vote on each prediction.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels; must expose ``.shape`` (e.g. Series or ndarray).

    The reported elapsed time covers building the classifier, fitting,
    predicting and scoring. Nothing is returned; results are printed.
    """
    print('Executing KNN with k = %d...\n' % (k))
    started = time.time()

    model = KNeighborsClassifier(n_neighbors=k, weights='uniform', algorithm='auto')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Fraction of test samples whose predicted label equals the true label.
    n_correct = np.sum(predictions == y_test)
    accuracy = float(n_correct / y_test.shape[0])

    duration = time.time() - started
    print('KNN Accuracy = %f%%' % (accuracy * 100))
    print('Elapsed time: %.2f seconds.\n' % duration)


def nearest_centroid(X_train, X_test, y_train, y_test):
    """Train a nearest-centroid classifier and print its test accuracy.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels; must expose ``.shape`` (e.g. Series or ndarray).

    Uses the Euclidean metric. The reported elapsed time covers building
    the classifier, fitting, predicting and scoring. Nothing is returned;
    results are printed.
    """
    print('Executing nearest centroid...\n')
    started = time.time()

    model = NearestCentroid(metric='euclidean')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Fraction of test samples whose predicted label equals the true label.
    n_correct = np.sum(predictions == y_test)
    accuracy = float(n_correct / y_test.shape[0])

    duration = time.time() - started
    print('Nearest-centroid Accuracy = %f%%' % (accuracy * 100))
    print('Elapsed time: %.2f seconds.\n' % duration)


def main():
    """Run the KNN (k=3) and nearest-centroid baselines on the Landsat data."""
    # load_landsat returns (X_train, X_test, y_train, y_test), which is the
    # exact positional order both baseline functions expect.
    splits = load_landsat()

    # K-nearest neighbours with k = 3
    knn(3, *splits)

    # Nearest centroid
    nearest_centroid(*splits)


# Run the baselines only when this cell/file is executed directly, not on import.
if __name__ == "__main__":
    main()


# 2. Support Vector Machine

## 2.1. Linear SVM

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


def load_landsat():
    """Read the Statlog Landsat Satellite train/test splits from the current directory.

    Expects ``./sat.trn`` and ``./sat.tst`` as space-separated files without
    a header row. Columns 0-35 are used as features and column 36 as the
    class label.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature frames are
        cast to float32 and divided by 255; the label columns are returned
        exactly as read.
    """
    raw_train, raw_test = (
        pd.read_csv(path, sep=' ', header=None)
        for path in ('./sat.trn', './sat.tst')
    )

    def _features_and_labels(frame):
        # .loc slices by column label, so 0:35 includes both end points.
        # Dividing by 255 rescales the features (assumes 8-bit pixel values).
        scaled = frame.loc[:, 0:35].astype('float32') / 255.0
        return scaled, frame.loc[:, 36]

    X_train, y_train = _features_and_labels(raw_train)
    X_test, y_test = _features_and_labels(raw_test)
    return X_train, X_test, y_train, y_test


def inter_scores(X_train, y_train, X_test, y_test, C_values, class_w_values):
    """Print CV and test scores of a linear SVM for every (C, class_weight) pair.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        C_values: Iterable of regularisation strengths to try.
        class_w_values: Iterable of ``class_weight`` settings to try.

    For each combination the mean 5-fold cross-validation score is printed,
    then the model is fitted on the full training set (this fit is what the
    reported training time measures) and its test accuracy is printed.
    """
    from sklearn.model_selection import cross_val_score

    # Same iteration order as nesting the loops: C outermost, class_weight inner.
    combinations = ((C, cw) for C in C_values for cw in class_w_values)

    for C, cw in combinations:
        print('\nResults for (C, class_weights) = (', C, ', ', cw, ')')
        model = SVC(kernel='linear', C=C, class_weight=cw, cache_size=500)

        fold_scores = cross_val_score(model, X_train, y_train, cv=5)
        print('Mean CV score: %.2f%%' % (fold_scores.mean()*100))

        fit_started = time.time()
        model.fit(X_train, y_train)
        fit_seconds = time.time() - fit_started

        y_pred = model.predict(X_test)
        print('Training time: %.2f seconds.' % fit_seconds)
        print('Linear SVM Accuracy = %f%%' % (metrics.accuracy_score(y_test, y_pred)*100))


def main():
    """Grid-search a linear-kernel SVM on the Statlog Landsat data.

    Searches over ``C`` and ``class_weight`` with GridSearchCV (using its
    default cross-validation), prints the best parameters, the search time
    and the best mean CV score, then reports accuracy and prediction time
    on the test split using the refitted best estimator.
    """
    X_train, X_test, y_train, y_test = load_landsat()
    search_started = time.time()

    # Hyper-parameter grid explored by the search.
    search_space = {
        'C': [0.1, 1, 10, 20, 25, 30, 35, 40, 100, 1000],
        'class_weight': ['balanced', None],
    }
    estimator = SVC(kernel='linear', cache_size=500)
    grid = GridSearchCV(estimator, search_space, refit=True, verbose=1)

    # Run the search; the timer also covers building the grid objects above.
    grid.fit(X_train, y_train)
    search_seconds = time.time() - search_started

    # Report the winning configuration.
    print('Best parameters after grid-search:')
    print(grid.best_params_)
    print('\nGrid-Search time: %.2f seconds.' % search_seconds)
    print('Best mean CV score: ', grid.best_score_)

    # Evaluate on the held-out test split and time the prediction only.
    predict_started = time.time()
    y_pred = grid.predict(X_test)
    predict_seconds = time.time() - predict_started

    print('\nLinear SVM Accuracy = %f%%' % (metrics.accuracy_score(y_test, y_pred)*100))
    print('Testing time: %.2f seconds.' % predict_seconds)

    # Optional: per-combination CV/test scores. Uncomment to run.
    # C_values = [0.1, 10, 50, 100, 1000]
    # class_w_values = [None, 'balanced']
    # inter_scores(X_train, y_train, X_test, y_test, C_values, class_w_values)


# Run the linear-SVM experiment only when this cell/file is executed directly.
if __name__ == "__main__":
    main()


## 2.2. Polynomial SVM

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


def load_landsat():
    """Read the Statlog Landsat Satellite train/test splits from the current directory.

    Expects ``./sat.trn`` and ``./sat.tst`` as space-separated files without
    a header row. Columns 0-35 are used as features and column 36 as the
    class label.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature frames are
        cast to float32 and divided by 255; the label columns are returned
        exactly as read.
    """
    raw_train, raw_test = (
        pd.read_csv(path, sep=' ', header=None)
        for path in ('./sat.trn', './sat.tst')
    )

    def _features_and_labels(frame):
        # .loc slices by column label, so 0:35 includes both end points.
        # Dividing by 255 rescales the features (assumes 8-bit pixel values).
        scaled = frame.loc[:, 0:35].astype('float32') / 255.0
        return scaled, frame.loc[:, 36]

    X_train, y_train = _features_and_labels(raw_train)
    X_test, y_test = _features_and_labels(raw_test)
    return X_train, X_test, y_train, y_test


def inter_scores(X_train, y_train, X_test, y_test, C_values, degree_values, gamma_values, coef_values):
    """Print CV and test scores of a polynomial SVM for every parameter combination.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        C_values: Iterable of regularisation strengths to try.
        degree_values: Iterable of polynomial degrees to try.
        gamma_values: Iterable of ``gamma`` settings to try.
        coef_values: Iterable of ``coef0`` values to try.

    For each combination the mean 5-fold cross-validation score is printed,
    then the model is fitted on the full training set (this fit is what the
    reported training time measures) and its test accuracy is printed.
    """
    from sklearn.model_selection import cross_val_score

    # Same iteration order as four nested loops: C outermost, coef0 innermost.
    combinations = (
        (C, d, g, coef)
        for C in C_values
        for d in degree_values
        for g in gamma_values
        for coef in coef_values
    )

    for C, d, g, coef in combinations:
        print('\nResults for (C, degree, gamma, coef0) = (', C, ', ', d, ', ', g, ', ', coef, ')')
        model = SVC(kernel='poly', C=C, degree=d, cache_size=1000, gamma=g, coef0=coef)

        fold_scores = cross_val_score(model, X_train, y_train, cv=5)
        print('Mean CV score: %.2f%%' % (fold_scores.mean()*100))

        fit_started = time.time()
        model.fit(X_train, y_train)
        fit_seconds = time.time() - fit_started

        y_pred = model.predict(X_test)
        print('Training time: %.2f seconds.' % fit_seconds)
        print('Polynomial SVM Accuracy = %.2f%%' % (metrics.accuracy_score(y_test, y_pred)*100))


def main():
    """Grid-search a polynomial-kernel SVM on the Statlog Landsat data.

    Searches over ``C``, ``degree``, ``gamma`` and ``coef0`` with
    GridSearchCV (using its default cross-validation), prints the best
    parameters, the search time and the best mean CV score, then reports
    accuracy and prediction time on the test split using the refitted best
    estimator.
    """
    X_train, X_test, y_train, y_test = load_landsat()
    search_started = time.time()

    # Hyper-parameter grid explored by the search.
    search_space = {
        'C': [0.1, 1, 10, 100],
        'degree': [1, 2, 3, 4],
        'gamma': ['auto', 'scale', 0.1, 1, 10],
        'coef0': [0, 1],
    }
    estimator = SVC(kernel='poly', cache_size=1000)
    grid = GridSearchCV(estimator, search_space, refit=True, verbose=1)

    # Run the search; the timer also covers building the grid objects above.
    grid.fit(X_train, y_train)
    search_seconds = time.time() - search_started

    # Report the winning configuration.
    print('Best parameters after grid-search:')
    print(grid.best_params_)
    print('\nGrid-Search time: %.2f seconds.' % search_seconds)
    print('Best mean CV score: ', grid.best_score_)

    # Evaluate on the held-out test split and time the prediction only.
    predict_started = time.time()
    y_pred = grid.predict(X_test)
    predict_seconds = time.time() - predict_started

    print('\nPolynomial SVM Accuracy = %f%%' % (metrics.accuracy_score(y_test, y_pred)*100))
    print('Testing time: %.2f seconds.' % predict_seconds)

    # Optional: per-combination CV/test scores. Uncomment to run.
    # C_values = [0.1, 1]
    # degree_values = [3, 4, 5, 8]
    # gamma_values = [1]
    # coef_values = [1]
    # class_w_values = [None, 'balanced']
    # inter_scores(X_train, y_train, X_test, y_test, C_values, degree_values, gamma_values, coef_values)


# Run the polynomial-SVM experiment only when this cell/file is executed directly.
if __name__ == "__main__":
    main()


## 2.3. RBF SVM

In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


def load_landsat():
    """Read the Statlog Landsat Satellite train/test splits from the current directory.

    Expects ``./sat.trn`` and ``./sat.tst`` as space-separated files without
    a header row. Columns 0-35 are used as features and column 36 as the
    class label.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature frames are
        cast to float32 and divided by 255; the label columns are returned
        exactly as read.
    """
    raw_train, raw_test = (
        pd.read_csv(path, sep=' ', header=None)
        for path in ('./sat.trn', './sat.tst')
    )

    def _features_and_labels(frame):
        # .loc slices by column label, so 0:35 includes both end points.
        # Dividing by 255 rescales the features (assumes 8-bit pixel values).
        scaled = frame.loc[:, 0:35].astype('float32') / 255.0
        return scaled, frame.loc[:, 36]

    X_train, y_train = _features_and_labels(raw_train)
    X_test, y_test = _features_and_labels(raw_test)
    return X_train, X_test, y_train, y_test


class MidpointNormalize(Normalize):
    """Piecewise-linear colour normalisation anchored at a chosen midpoint.

    Values are mapped linearly so that ``vmin -> 0``, ``midpoint -> 0.5`` and
    ``vmax -> 1``; values outside ``[vmin, vmax]`` are clamped to 0 or 1 by
    ``np.interp``. This lets a colormap spend most of its range on the
    interesting band of scores around ``midpoint``.

    Args:
        vmin: Lower anchor (may be None and filled in later by autoscaling).
        vmax: Upper anchor (may be None and filled in later by autoscaling).
        midpoint: Value mapped to the centre of the colormap. When None, the
            arithmetic mean of ``vmin`` and ``vmax`` is used at call time.
        clip: Forwarded to ``Normalize``; not consulted by ``__call__``.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        # Set before the base initialiser runs so the attribute always exists.
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Map ``value`` (scalar or array) into [0, 1]; return a masked array.

        The ``clip`` argument is accepted for interface compatibility and
        ignored. ``vmin`` and ``vmax`` must be set (explicitly or by
        autoscaling) before calling.
        """
        midpoint = self.midpoint
        if midpoint is None:
            # Fall back to a plain linear map instead of failing in np.interp.
            midpoint = (self.vmin + self.vmax) / 2.0

        anchors = [self.vmin, midpoint, self.vmax]
        mapped = np.interp(value, anchors, [0, 0.5, 1])
        # Keep the input's mask (if any) so masked cells stay masked.
        return np.ma.masked_array(mapped, mask=np.ma.getmask(value))


def inter_scores(X_train, y_train, X_test, y_test, C_values, gamma_values):
    """Print CV and test scores of an RBF SVM for every (C, gamma) pair.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        C_values: Iterable of regularisation strengths to try.
        gamma_values: Iterable of ``gamma`` settings to try.

    For each combination the mean 5-fold cross-validation score is printed,
    then the model is fitted on the full training set (this fit is what the
    reported training time measures) and its test accuracy is printed.
    """
    from sklearn.model_selection import cross_val_score

    # Same iteration order as nesting the loops: C outermost, gamma inner.
    combinations = ((C, g) for C in C_values for g in gamma_values)

    for C, g in combinations:
        print('\nResults for (C, gamma) = (', C, ', ', g, ')')
        model = SVC(kernel='rbf', C=C, cache_size=1000, gamma=g)

        fold_scores = cross_val_score(model, X_train, y_train, cv=5)
        print('Mean CV score: %.2f%%' % (fold_scores.mean() * 100))

        fit_started = time.time()
        model.fit(X_train, y_train)
        fit_seconds = time.time() - fit_started

        y_pred = model.predict(X_test)
        print('Training time: %.2f seconds.' % fit_seconds)
        print('RBF SVM Accuracy = %.2f%%' % (metrics.accuracy_score(y_test, y_pred) * 100))


def plot_heatmap(scores, gamma_values, C_values):
    """Show a heatmap of cross-validation accuracy over the (C, gamma) grid.

    Args:
        scores: 2-D array of scores; rows are labelled with ``C_values`` and
            columns with ``gamma_values``.
        gamma_values: Tick labels for the x axis.
        C_values: Tick labels for the y axis.

    The colour scale is anchored with ``MidpointNormalize(vmin=0.2,
    midpoint=0.83)``. The figure is displayed with ``plt.show()``; nothing
    is returned.
    """
    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=0.2, right=0.95, bottom=0.15, top=0.95)

    colour_scale = MidpointNormalize(vmin=0.2, midpoint=0.83)
    plt.imshow(scores, interpolation="nearest", cmap=plt.cm.hot, norm=colour_scale)

    plt.xlabel("gamma")
    plt.ylabel("C")
    plt.colorbar()

    # One tick per grid value, labelled with the value itself.
    gamma_ticks = np.arange(len(gamma_values))
    C_ticks = np.arange(len(C_values))
    plt.xticks(gamma_ticks, gamma_values, rotation=45)
    plt.yticks(C_ticks, C_values)

    plt.title("Cross-Validation Accuracy")
    plt.show()
    

def main():
    """Grid-search an RBF-kernel SVM on the Statlog Landsat data.

    Searches a 13 x 13 logarithmic grid of ``C`` and ``gamma`` with
    GridSearchCV (using its default cross-validation), prints the best
    parameters, the search time and the best mean CV score, reports
    accuracy and prediction time on the test split using the refitted best
    estimator, and finally plots the mean CV scores as a heatmap.
    """
    X_train, X_test, y_train, y_test = load_landsat()
    search_started = time.time()

    # Logarithmic grids: C in 1e-2..1e10, gamma in 1e-9..1e3 (13 points each).
    C_values = np.logspace(-2, 10, 13)
    gamma_values = np.logspace(-9, 3, 13)
    search_space = {'gamma': gamma_values, 'C': C_values}

    estimator = SVC(kernel='rbf', cache_size=1000)
    grid = GridSearchCV(estimator, search_space, refit=True, verbose=1)

    # Run the search; the timer also covers building the grid objects above.
    grid.fit(X_train, y_train)
    search_seconds = time.time() - search_started

    # Report the winning configuration.
    print('Best parameters after grid-search:')
    print(grid.best_params_)
    print('\nGrid-Search time: %.2f seconds.' % search_seconds)
    print('Best mean CV score: ', grid.best_score_)

    # Evaluate on the held-out test split and time the prediction only.
    predict_started = time.time()
    y_pred = grid.predict(X_test)
    predict_seconds = time.time() - predict_started

    print('\nRBF SVM Accuracy = %f%%' % (metrics.accuracy_score(y_test, y_pred) * 100))
    print('Testing time: %.2f seconds.' % predict_seconds)

    # Arrange the flat list of mean CV scores as a (C, gamma) matrix for
    # plotting. This relies on GridSearchCV enumerating candidates with C as
    # the slow axis and gamma as the fast axis.
    mean_scores = grid.cv_results_["mean_test_score"]
    score_matrix = mean_scores.reshape(len(C_values), len(gamma_values))
    plot_heatmap(score_matrix, gamma_values, C_values)

    # Optional: per-combination CV/test scores. Uncomment to run.
    # C_values = [10]
    # gamma_values = [10]
    # inter_scores(X_train, y_train, X_test, y_test, C_values, gamma_values)


# Run the RBF-SVM experiment only when this cell/file is executed directly.
if __name__ == "__main__":
    main()


## 2.4. Implementation with OVO

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


def load_landsat():
    """Read the Statlog Landsat Satellite train/test splits from the current directory.

    Expects ``./sat.trn`` and ``./sat.tst`` as space-separated files without
    a header row. Columns 0-35 are used as features and column 36 as the
    class label.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature frames are
        cast to float32 and divided by 255; the label columns are returned
        exactly as read.
    """
    raw_train, raw_test = (
        pd.read_csv(path, sep=' ', header=None)
        for path in ('./sat.trn', './sat.tst')
    )

    def _features_and_labels(frame):
        # .loc slices by column label, so 0:35 includes both end points.
        # Dividing by 255 rescales the features (assumes 8-bit pixel values).
        scaled = frame.loc[:, 0:35].astype('float32') / 255.0
        return scaled, frame.loc[:, 36]

    X_train, y_train = _features_and_labels(raw_train)
    X_test, y_test = _features_and_labels(raw_test)
    return X_train, X_test, y_train, y_test


def inter_scores(X_train, y_train, X_test, y_test, C_values, degree_values, gamma_values, coef_values):
    """Print CV and test scores of a polynomial SVM for every parameter combination.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        C_values: Iterable of regularisation strengths to try.
        degree_values: Iterable of polynomial degrees to try.
        gamma_values: Iterable of ``gamma`` settings to try.
        coef_values: Iterable of ``coef0`` values to try.

    For each combination the mean 5-fold cross-validation score is printed,
    then the model is fitted on the full training set (this fit is what the
    reported training time measures) and its test accuracy is printed.
    """
    from sklearn.model_selection import cross_val_score

    # Same iteration order as four nested loops: C outermost, coef0 innermost.
    combinations = (
        (C, d, g, coef)
        for C in C_values
        for d in degree_values
        for g in gamma_values
        for coef in coef_values
    )

    for C, d, g, coef in combinations:
        print('\nResults for (C, degree, gamma, coef0) = (', C, ', ', d, ', ', g, ', ', coef, ')')
        model = SVC(kernel='poly', C=C, degree=d, cache_size=1000, gamma=g, coef0=coef)

        fold_scores = cross_val_score(model, X_train, y_train, cv=5)
        print('Mean CV score: %.2f%%' % (fold_scores.mean()*100))

        fit_started = time.time()
        model.fit(X_train, y_train)
        fit_seconds = time.time() - fit_started

        y_pred = model.predict(X_test)
        print('Training time: %.2f seconds.' % fit_seconds)
        print('Polynomial SVM Accuracy = %.2f%%' % (metrics.accuracy_score(y_test, y_pred)*100))


def main():
    """Grid-search a polynomial SVM with ``decision_function_shape='ovo'``.

    Searches over ``C``, ``degree``, ``gamma`` and ``coef0`` with
    GridSearchCV (using its default cross-validation), prints the best
    parameters, the search time and the best mean CV score, then reports
    accuracy and prediction time on the test split using the refitted best
    estimator.
    """
    X_train, X_test, y_train, y_test = load_landsat()
    search_started = time.time()

    # Hyper-parameter grid explored by the search.
    search_space = {
        'C': [0.1, 1, 10],
        'degree': [3, 4, 5],
        'gamma': ['auto', 'scale', 0.1, 1, 10],
        'coef0': [0, 1],
    }

    # NOTE: per the scikit-learn docs, SVC always trains one-vs-one
    # classifiers internally; decision_function_shape only selects the shape
    # of decision_function's output, so predictions are expected to match
    # the default setting for the same hyper-parameters.
    estimator = SVC(kernel='poly', decision_function_shape='ovo', cache_size=1000)
    grid = GridSearchCV(estimator, search_space, refit=True, verbose=1)

    # Run the search; the timer also covers building the grid objects above.
    grid.fit(X_train, y_train)
    search_seconds = time.time() - search_started

    # Report the winning configuration.
    print('Best parameters after grid-search:')
    print(grid.best_params_)
    print('\nGrid-Search time: %.2f seconds.' % search_seconds)
    print('Best mean CV score: ', grid.best_score_)

    # Evaluate on the held-out test split and time the prediction only.
    predict_started = time.time()
    y_pred = grid.predict(X_test)
    predict_seconds = time.time() - predict_started

    print('\nPolynomial SVM Accuracy = %f%%' % (metrics.accuracy_score(y_test, y_pred)*100))
    print('Testing time: %.2f seconds.' % predict_seconds)

    # Optional: per-combination CV/test scores. Uncomment to run.
    # C_values = [0.1, 1]
    # degree_values = [3, 4, 5, 8]
    # gamma_values = [1]
    # coef_values = [1]
    # class_w_values = [None, 'balanced']
    # inter_scores(X_train, y_train, X_test, y_test, C_values, degree_values, gamma_values, coef_values)


# Run the OVO polynomial-SVM experiment only when this cell/file is executed directly.
if __name__ == "__main__":
    main()
