# Chapter 14 - Learning Vector Quantization

In [1]:
from math import sqrt
from random import randrange, seed

### Euclidean Distance

In [2]:
# Euclidean distance between two rows, leaving out the trailing column
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between ``row1`` and ``row2``.

    Only the first ``len(row1) - 1`` positions are compared, so the final
    position of ``row1`` (the class label in this notebook's datasets) never
    contributes to the distance.
    """
    n_features = len(row1) - 1
    squared_total = 0.0
    for idx in range(n_features):
        delta = row1[idx] - row2[idx]
        squared_total += delta**2
    return sqrt(squared_total)

In [3]:
# Exercise euclidean_distance: measure every row against the first row.
# Each row is [x1, x2, class]; the class column is ignored by the distance.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]
row0 = dataset[0]
for row in dataset:
    # The first printed value is 0.0 because row0 is compared with itself
    distance = euclidean_distance(row0, row)
    print(distance)

0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795


### Best Matching Unit

In [4]:
def get_best_matching_unit(codebooks, test_row):
    """Return the codebook vector closest to ``test_row``.

    Closeness is measured with ``euclidean_distance(codebook, test_row)``.
    The search is a single pass with ``min`` instead of sorting every
    distance; when several codebooks share the smallest distance, the one
    that appears first in ``codebooks`` is returned.

    Args:
        codebooks: iterable of codebook vectors.
        test_row: the row to match against.

    Returns:
        The nearest codebook itself (not a copy), so callers may update it
        in place.

    Raises:
        IndexError: if ``codebooks`` is empty.
    """
    candidates = list(codebooks)
    if not candidates:
        # Keep the exception type that indexing an empty result list produced
        raise IndexError('codebooks is empty: no best matching unit exists')
    return min(candidates, key=lambda codebook: euclidean_distance(codebook, test_row))

In [5]:
# Exercise get_best_matching_unit: the dataset rows double as codebooks,
# so the closest match to the first row is the first row itself.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]
test_row = dataset[0]
bmu = get_best_matching_unit(dataset, test_row)
print(bmu)

[2.7810836, 2.550537003, 0]


### Training Codebook Vectors

In [6]:
# Build one codebook vector from randomly chosen training values
def random_codebook(train):
    """Return a new codebook vector sampled column by column from ``train``.

    For every column index of the first training row, one row is picked with
    ``randrange`` and its value in that column is copied. Different positions
    of the result (including the last one) may therefore come from different
    training rows.
    """
    row_count = len(train)
    column_count = len(train[0])
    vector = []
    for col in range(column_count):
        # One random draw per column, in column order
        donor = train[randrange(row_count)]
        vector.append(donor[col])
    return vector

In [7]:
# Train a set of codebook vectors
def train_codebooks(train, n_codebooks, lrate, epochs, verbose=True):
    """Learn ``n_codebooks`` codebook vectors from ``train``.

    Codebooks are initialised with ``random_codebook``. In every epoch, each
    training row selects its best matching unit (BMU); the BMU's features are
    moved towards the row when the last elements of BMU and row are equal,
    and away from the row otherwise. The step size decays linearly with the
    epoch: ``lrate * (1 - epoch / epochs)``.

    Args:
        train: sequence of rows; the last element of each row is compared
            as the class value, all earlier elements are features.
        n_codebooks: number of codebook vectors to create.
        lrate: initial learning rate.
        epochs: number of passes over ``train``.
        verbose: when true (the default, matching the previous behaviour),
            print one progress line per epoch; pass ``False`` to keep
            repeated runs such as cross-validation from flooding the output.

    Returns:
        The list of trained codebook vectors.
    """
    codebooks = [random_codebook(train) for _ in range(n_codebooks)]
    for epoch in range(epochs):
        rate = lrate * (1.0 - (epoch / float(epochs)))
        sum_error = 0.0
        for row in train:
            bmu = get_best_matching_unit(codebooks, row)
            # The class comparison does not depend on the feature index,
            # so evaluate it once per row instead of once per feature.
            same_class = bmu[-1] == row[-1]
            for i in range(len(row) - 1):
                error = row[i] - bmu[i]
                sum_error += error**2
                if same_class:
                    bmu[i] += rate * error
                else:
                    bmu[i] -= rate * error
        if verbose:
            print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, rate, sum_error))
    return codebooks

In [8]:
# Exercise train_codebooks on the small two-class sample.
# Seed first so the randomly initialised codebooks are reproducible.
seed(1)

dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

# Training settings for this demonstration
learn_rate = 0.3
n_epochs = 10
n_codebooks = 2

codebooks = train_codebooks(dataset, n_codebooks, learn_rate, n_epochs)
print('Codebooks: %s' % codebooks)

>epoch=0, lrate=0.300, error=43.270
>epoch=1, lrate=0.270, error=30.403
>epoch=2, lrate=0.240, error=27.146
>epoch=3, lrate=0.210, error=26.301
>epoch=4, lrate=0.180, error=25.537
>epoch=5, lrate=0.150, error=24.789
>epoch=6, lrate=0.120, error=24.058
>epoch=7, lrate=0.090, error=23.346
>epoch=8, lrate=0.060, error=22.654
>epoch=9, lrate=0.030, error=21.982
Codebooks: [[2.432316086217663, 2.839821664184211, 0], [7.319592257892681, 1.97013382654341, 1]]


### Ionosphere Case Study

In [9]:
from Codes.ch01_load_and_convert_data import load_csv, str_column_to_float, str_column_to_int
from Codes.ch06_algorithm_test_harnesses import evaluate_algorithm_kfold

In [10]:
# Make a prediction with codebook vectors
def predict(codebooks, test_row):
    """Return the last element of the codebook nearest to ``test_row``.

    The nearest codebook is chosen by ``get_best_matching_unit``; its final
    element is what this notebook uses as the predicted class.
    """
    return get_best_matching_unit(codebooks, test_row)[-1]

In [11]:
# LVQ algorithm: train codebooks, then label every test row
def learning_vector_quantization(train, test, n_codebooks, lrate, epochs):
    """Train codebooks on ``train`` and return one prediction per ``test`` row.

    The codebooks come from ``train_codebooks(train, n_codebooks, lrate,
    epochs)``; each test row is then passed to ``predict`` in order, and the
    results are returned as a list.
    """
    trained = train_codebooks(train, n_codebooks, lrate, epochs)
    return [predict(trained, sample) for sample in test]

In [13]:
# Evaluate LVQ on the Ionosphere dataset with k-fold cross-validation
seed(1)

# Load the CSV, then convert every column except the last to float
filename = './data/ionosphere.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)

# The last column is the class; convert it to integers
str_column_to_int(dataset, len(dataset[0])-1)

# Evaluation settings
n_folds = 5
learn_rate = 0.3
n_epochs = 50
n_codebooks = 20

scores = evaluate_algorithm_kfold(
    dataset,
    learning_vector_quantization,
    n_folds,
    n_codebooks,
    learn_rate,
    n_epochs,
)
print('Scores: %s' % scores)
mean_accuracy = sum(scores)/float(len(scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy)

>epoch=0, lrate=0.300, error=2106.606
>epoch=1, lrate=0.294, error=2033.558
>epoch=2, lrate=0.288, error=1931.089
>epoch=3, lrate=0.282, error=1899.031
>epoch=4, lrate=0.276, error=1906.729
>epoch=5, lrate=0.270, error=1881.332
>epoch=6, lrate=0.264, error=1869.365
>epoch=7, lrate=0.258, error=1857.928
>epoch=8, lrate=0.252, error=1839.964
>epoch=9, lrate=0.246, error=1839.005
>epoch=10, lrate=0.240, error=1824.272
>epoch=11, lrate=0.234, error=1825.209
>epoch=12, lrate=0.228, error=1807.112
>epoch=13, lrate=0.222, error=1798.466
>epoch=14, lrate=0.216, error=1788.285
>epoch=15, lrate=0.210, error=1776.740
>epoch=16, lrate=0.204, error=1763.173
>epoch=17, lrate=0.198, error=1755.526
>epoch=18, lrate=0.192, error=1747.907
>epoch=19, lrate=0.186, error=1740.369
>epoch=20, lrate=0.180, error=1729.157
>epoch=21, lrate=0.174, error=1721.589
>epoch=22, lrate=0.168, error=1714.244
>epoch=23, lrate=0.162, error=1706.964
>epoch=24, lrate=0.156, error=1699.747
>epoch=25, lrate=0.150, error=1692.

## Future Work

* Tune Parameters. The parameters in the above example were not tuned; try different values to improve the classification accuracy.
* Different Distance Measures. Experiment with different distance measures such as Manhattan distance and Minkowski distance.
* Multiple-Pass LVQ. The codebook vectors may be updated by multiple training runs. Experiment by training with large learning rates followed by a large number of epochs with smaller learning rates to fine tune the codebooks.
* Update More BMUs. Experiment with selecting more than one BMU when training, pulling them toward or pushing them away from the training data.
* More Problems. Apply LVQ to more classification problems on the UCI Machine Learning Repository.