In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Names of the four feature columns and of the label column, reused by later cells.
column = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
_class = "Species"

# Load the raw data set and display it as the cell output.
iris = pd.read_csv('Iris.csv')
iris

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
5,6,5.4,3.9,1.7,0.4,Iris-setosa
6,7,4.6,3.4,1.4,0.3,Iris-setosa
7,8,5.0,3.4,1.5,0.2,Iris-setosa
8,9,4.4,2.9,1.4,0.2,Iris-setosa
9,10,4.9,3.1,1.5,0.1,Iris-setosa


In [2]:
# Remove the 'Id' column before summarising the numeric columns.
# errors='ignore' keeps this cell idempotent: `del iris['Id']` raised a KeyError
# when the cell was executed a second time in the same kernel.
iris = iris.drop(columns=['Id'], errors='ignore')
iris.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [73]:
X, y = iris[column], iris[_class]

# Split the data 50/50 into training and test data.
# A fixed random_state makes the split reproducible, so the results computed
# from `train` / `test` below do not change on every kernel restart.
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                    train_size=0.5,
                                                    test_size=0.5,
                                                    random_state=42)

train_y = train_y.to_frame(_class)
test_y = test_y.to_frame(_class)

# Plain arrays with the four feature columns first and the label in the last column.
train = pd.concat([train_X, train_y], axis = 1).values
test = pd.concat([test_X, test_y], axis = 1).values

In [74]:
import numpy as np

class LVQ:
    """Learning Vector Quantization classifier with one prototype per class.

    ``train`` and ``test`` are 2-D arrays whose last column holds the class
    label and whose remaining columns hold the features.  Labels are encoded
    to integer codes (their index in the sorted unique training labels) and
    ``self.weight`` holds one row per class: feature values followed by the
    class code.
    """
    # Class-level defaults; every instance overwrites them in __init__.
    train_row, train_col, test_col, test_row = 0, 0, 0, 0

    @staticmethod
    def closest_distance(in_data, weight):
        """Return the index of the row of ``weight`` nearest to ``in_data``.

        The distance is the sum of absolute differences over the first
        ``ncol - 1`` columns; the last column of ``weight`` (the class code)
        is ignored.  Ties resolve to the lowest row index.
        """
        n_feature = weight.shape[1] - 1
        sample = np.asarray(in_data[0:n_feature], dtype=float)
        prototypes = np.asarray(weight[:, 0:n_feature], dtype=float)
        dist_sum = np.sum(np.absolute(prototypes - sample), axis=1)
        return int(np.argmin(dist_sum))

    @staticmethod
    def _encode(data, mapping):
        """Replace the label in the last column of ``data`` by its integer code (in place).

        Raises KeyError when a label is not present in ``mapping``.
        """
        for row in data:
            row[-1] = mapping[row[-1]]
        return data

    @staticmethod
    def _initial_weight(data):
        """Build the start prototypes: the first encoded sample met for each class code."""
        seen = set()
        picked = []
        for row in data:
            if row[-1] not in seen:
                seen.add(row[-1])
                picked.append(row)
        if not picked:
            return np.empty((0, data.shape[1]))
        return np.vstack(picked).astype(float)

    def __init__(self, train, test):
        """Encode the labels of both sets and seed one prototype per class.

        The inputs are copied first, so the caller's arrays keep their
        original labels and the same arrays can be used to build several
        models (encoding them in place made a second construction fail).
        """
        train = np.array(train, dtype=object)
        test = np.array(test, dtype=object)

        self.train_row, self.train_col = train.shape
        self.test_row, self.test_col = test.shape
        self.classes = np.unique(train[:, self.train_col-1])
        self.n_class = np.size(self.classes)

        encoder = {label: code for code, label in enumerate(self.classes)}
        self.decoder = {code: label for label, code in encoder.items()}

        self.train = self._encode(train, encoder)
        self.test = self._encode(test, encoder)
        self.weight = self._initial_weight(self.train)
        self.test_result = np.zeros((self.test_row,1))

    def print_meta(self):
        """Print the data set shapes, the number of classes and the current weights."""
        print("Number of row and col (train): " + str(self.train_row) + " " + str(self.train_col) + "\n",
              "Number of row and col (test): " + str(self.test_row) + " " + str(self.test_col) + "\n",
              "Number of class: " + str(self.n_class) + "\n",
              "Weight:\n" + str(self.weight) + "\n")

    def get_train(self):
        """Return the training array with encoded labels."""
        return self.train

    def get_test(self):
        """Return the test array with encoded labels."""
        return self.test

    def _approach(self, winner, sample, rate):
        """Move prototype ``winner`` towards ``sample`` if their classes match, else away."""
        features = np.asarray(sample[:-1], dtype=float)
        step = rate * (features - self.weight[winner, :-1])
        if self.weight[winner, -1] == sample[-1]:
            self.weight[winner, :-1] += step
        else:
            self.weight[winner, :-1] -= step

    def learn(self, max_epoch, alpha, threshold):
        """Train the prototypes and return the weight matrix.

        Training stops as soon as the learning rate falls below ``threshold``
        or ``max_epoch`` epochs have been run.  (The conditions used to be
        combined with ``|``, so ``threshold`` never ended training, one extra
        epoch was run, and a threshold <= 0 never terminated.)

        Raises ValueError if ``max_epoch`` is not positive.
        """
        if max_epoch <= 0:
            raise ValueError("max_epoch must be a positive integer")

        self.alpha = alpha
        self.threshold = threshold
        self.epoch = 0

        while self.alpha >= self.threshold and self.epoch < max_epoch:
            self.epoch += 1
            for sample in self.train:
                self.winner = LVQ.closest_distance(sample, self.weight)
                self._approach(self.winner, sample, self.alpha)
            # NOTE(review): the decay factor itself contains alpha, so alpha is
            # squared every epoch and shrinks very quickly -- kept as written,
            # confirm this schedule is intended.
            beta = self.alpha*(1-(self.epoch/max_epoch))
            self.alpha *= beta
        return self.weight

    def testing(self):
        """Store and return the predicted class code of every test row (shape: test_row x 1)."""
        for i in range(0, self.test_row):
            winner = LVQ.closest_distance(self.test[i], self.weight)
            self.test_result[i,0] = self.weight[winner, -1]
        return self.test_result

    def get_accuracy(self, print_report = False):
        """Return the percentage of test rows whose stored prediction equals the label.

        Compares ``self.test_result`` (filled by ``testing()``) with the
        encoded test labels.  With ``print_report`` a per-instance listing and
        a summary are printed.  Returns 0.0 for an empty test set.
        """
        self.count = 0
        if print_report:
            print("Test Result per instance:")
        for i in range(0, self.test_row):
            expected = self.test[i, self.test_col-1]
            if print_report:
                print(str(i+1) + ". ", "Result: " + str(self.decoder[self.test_result[i, 0]]),
                      "| Expected: " + str(self.decoder[expected]))
            if self.test_result[i, 0] == expected:
                self.count += 1
        if print_report:
            print("--------------------------------------------")
            print(str(self.count) + " of " + str(self.test_row) + " are correctly classified.")
        if self.test_row == 0:
            return 0.0
        return (self.count/self.test_row)*100

    def classify(self, input_data, weight = None):
        """Print the decoded class of one sample (1-D input) or of each row (2-D input).

        ``weight`` defaults to the model's own weights.  ``None`` (or the
        legacy ``""``) selects the default; the previous ``weight == ""`` test
        could not handle an ndarray argument.
        """
        if weight is None or isinstance(weight, str):
            weight = self.weight
        input_data = np.asarray(input_data)
        rows = input_data[np.newaxis, :] if input_data.ndim == 1 else input_data
        for row in rows:
            winner = LVQ.closest_distance(row, weight)
            print(self.decoder[weight[winner, -1]])
# Example on Iris Dataset
# Build the model from the Iris split prepared above.
iris_LVQ = LVQ(train=train, test=test) # Use this once for building the model

# Weights before training.
iris_LVQ.print_meta()

iris_LVQ.learn(max_epoch=15, alpha=0.5, threshold=0.000001)

# Weights after training.
iris_LVQ.print_meta()

# Predict every test row, then report the accuracy in percent.
iris_LVQ.testing()

print(iris_LVQ.get_accuracy(print_report=True))

# Classify a single, previously unseen measurement.
unclassified = np.array([5.6,3.0,4.5,1.5])
iris_LVQ.classify(input_data=unclassified)

Number of row and col (train): 75 5
 Number of row and col (test): 75 5
 Number of class: 3
 Weight:
[[5.6 2.5 3.9 1.1 1. ]
 [5.  3.3 1.4 0.2 0. ]
 [6.8 3.  5.5 2.1 2. ]]

Number of row and col (train): 75 5
 Number of row and col (test): 75 5
 Number of class: 3
 Weight:
[[5.97342077 2.78328124 4.05100943 1.1587376  1.        ]
 [5.04564777 3.39075578 1.45024655 0.24257105 0.        ]
 [6.9240767  3.01601323 5.90460447 2.16792339 2.        ]]

Test Result per instance:
1.  Result: Iris-versicolor | Expected: Iris-versicolor
2.  Result: Iris-virginica | Expected: Iris-virginica
3.  Result: Iris-versicolor | Expected: Iris-versicolor
4.  Result: Iris-virginica | Expected: Iris-virginica
5.  Result: Iris-versicolor | Expected: Iris-versicolor
6.  Result: Iris-versicolor | Expected: Iris-virginica
7.  Result: Iris-versicolor | Expected: Iris-virginica
8.  Result: Iris-virginica | Expected: Iris-versicolor
9.  Result: Iris-setosa | Expected: Iris-setosa
10.  Result: Iris-setosa | Expected:

In [None]:
'''
Weight matrices of some previously trained models, each listed under the accuracy it reached:
84%
[[4.92548029 3.25045979 1.4315049  0.23327154 0.        ]
 [5.74678342 2.76411386 3.96108036 1.18920147 1.        ]
 [6.97258051 3.14030234 5.93348244 2.16061733 2.        ]]

85%
[[5.79244309 2.81818586 4.21724371 1.32285756 1.        ]
 [4.89933104 3.2607812  1.44885467 0.2003244  0.        ]
 [6.82508317 3.08301215 5.92286724 2.2185947  2.        ]]
 
86.67%
[[4.89791382 3.31320698 1.49981237 0.21449365 0.        ]
 [6.96856155 3.1656027  5.92725697 2.1199773  2.        ]
 [5.86144511 2.76260337 4.05354714 1.20034838 1.        ]]
 
90.67%
[[4.92717148 3.37536645 1.37829692 0.23305119 0.        ]
 [5.59180592 2.67749876 3.9121231  1.20255267 1.        ]
 [6.76742085 3.0037558  5.91358345 2.14113184 2.        ]]
 
92%
[[6.78476668 2.94461847 5.73713541 1.98148365 2.        ]
 [4.96296507 3.32594289 1.4699922  0.23066397 0.        ]
 [6.14242095 2.76347355 4.0883332  1.08250017 1.        ]]

93%
[[5.90646495 2.80275035 4.07622329 1.25738911 1.        ]
 [6.56162371 3.00660689 5.53928203 2.21852438 2.        ]
 [5.02889911 3.46136815 1.53998743 0.23294479 0.        ]]

96%
[[5.86179531 2.77314256 4.0818171  1.26798413 1.        ]
 [4.95461095 3.3618605  1.470549   0.23050125 0.        ]
 [6.47141807 2.94382168 5.58744473 2.06318658 2.        ]]
'''

In [133]:
import numpy as np

class SOM:
    """Winner-take-all clustering with ``n_cluster`` weight vectors.

    Each training sample pulls only its closest weight vector towards itself.
    Clusters are named 'A', 'B', 'C', ... in weight-row order.
    """
    # Class-level defaults; every instance overwrites them in __init__.
    train_row, train_col, test_col, test_row = 0, 0, 0, 0

    @staticmethod
    def closest_distance(in_data, weight):
        """Return the index of the row of ``weight`` nearest to ``in_data``.

        The distance is the sum of absolute differences over all columns of
        ``weight``.  Ties resolve to the lowest row index.
        """
        n_feature = weight.shape[1]
        sample = np.asarray(in_data[0:n_feature], dtype=float)
        dist_sum = np.sum(np.absolute(weight - sample), axis=1)
        return int(np.argmin(dist_sum))

    def __init__(self, train, test, n_cluster, seed = None):
        """Store the data and draw the initial weights uniformly from [0, 1).

        ``seed`` (optional) makes the initial weights reproducible; when it is
        ``None`` the global NumPy random state is used, as before.
        """
        self.train_row, self.train_col = train.shape
        self.test_row, self.test_col = test.shape

        self.train = train
        self.test = test
        self.n_cluster = n_cluster

        # NOTE(review): weights start in [0, 1) whatever the scale of the data,
        # so units that never win a sample are never updated -- confirm whether
        # initialising from training samples would be preferable.
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.weight = rng.random_sample((self.n_cluster, self.train_col))

        # 65 is ord('A'): clusters are labelled with consecutive capital letters.
        self.cluster = [chr(65 + k) for k in range(self.n_cluster)]
        self.clst_count = {name: 0 for name in self.cluster}

        self.test_result = []

    def print_meta(self):
        """Print the data set shapes, the number of clusters and the current weights."""
        print("Number of row and col (train): " + str(self.train_row) + " " + str(self.train_col) + "\n",
              "Number of row and col (test): " + str(self.test_row) + " " + str(self.test_col) + "\n",
              "Number of cluster: " + str(self.n_cluster) + "\n",
              "Weight:\n" + str(self.weight) + "\n")

    def get_train(self):
        """Return the training array."""
        return self.train

    def get_test(self):
        """Return the test array."""
        return self.test

    def learn(self, max_epoch, alpha, threshold):
        """Train the weights and return them.

        Training stops as soon as the learning rate falls below ``threshold``
        or ``max_epoch`` epochs have been run.  (The conditions used to be
        combined with ``|``, so ``threshold`` never ended training, one extra
        epoch was run, and a threshold <= 0 never terminated.)

        Raises ValueError if ``max_epoch`` is not positive.
        """
        if max_epoch <= 0:
            raise ValueError("max_epoch must be a positive integer")

        self.alpha = alpha
        self.threshold = threshold
        self.epoch = 0

        while self.alpha >= self.threshold and self.epoch < max_epoch:
            self.epoch += 1
            for sample in self.train:
                self.winner = SOM.closest_distance(sample, self.weight)
                # Pull only the winning unit towards the sample.
                features = np.asarray(sample, dtype=float)
                self.weight[self.winner] += self.alpha*(features - self.weight[self.winner])
            # NOTE(review): the decay factor itself contains alpha, so alpha is
            # squared every epoch and shrinks very quickly -- kept as written,
            # confirm this schedule is intended.
            beta = self.alpha*(1-(self.epoch/max_epoch))
            self.alpha *= beta
        return self.weight

    def testing(self):
        """Assign every test row to its closest unit; store and return the index list.

        The list is rebuilt on every call (it used to grow with each call).
        """
        self.test_result = [SOM.closest_distance(self.test[i], self.weight)
                            for i in range(0, self.test_row)]
        return self.test_result

    def get_report(self):
        """Print the cluster of every test row and the number of rows per cluster.

        Counts are reset first so repeated calls do not accumulate, and
        ``testing()`` is run when no up-to-date result is stored.
        """
        if len(self.test_result) != self.test_row:
            self.testing()
        for name in self.cluster:
            self.clst_count[name] = 0

        print("Test Result per instance:")
        for i in range(0, self.test_row):
            name = self.cluster[self.test_result[i]]
            print(str(i+1) + ". ", "Cluster: " + str(name))
            self.clst_count[name] += 1
        print("--------------------------------------------")
        for name in self.cluster:
            print(str(self.clst_count[name])
                  + " instance(s) is identified as " + str(name))

    def classify(self, input_data, weight = None):
        """Print the cluster name of one sample (1-D input) or of each row (2-D input).

        ``weight`` defaults to the model's own weights; ``None`` (or the legacy
        ``""``) selects the default.  The previous body read a non-existent
        ``self.input_data`` and ignored ``weight``.
        """
        if weight is None or isinstance(weight, str):
            weight = self.weight
        input_data = np.asarray(input_data)
        rows = input_data[np.newaxis, :] if input_data.ndim == 1 else input_data
        for row in rows:
            print(self.cluster[SOM.closest_distance(row, weight)])


# Cluster the Iris measurements: only the four feature columns are passed in.
SOM_Iris = SOM(train=train[:, 0:4], test=test[:, 0:4], n_cluster=6)
SOM_Iris.print_meta()   # weights before training
SOM_Iris.learn(max_epoch=15, alpha=0.5, threshold=0.001)
SOM_Iris.print_meta()   # weights after training
SOM_Iris.testing()
SOM_Iris.get_report()
#SOM_Iris.classify()

Number of row and col (train): 75 4
 Number of row and col (test): 75 4
 Number of cluster: 6
 Weight:
[[0.17852687 0.16956183 0.57166996 0.61530898]
 [0.42234895 0.25236972 0.66332697 0.89055621]
 [0.54824343 0.85240384 0.56687257 0.85262544]
 [0.80809947 0.04312564 0.08988438 0.79305189]
 [0.64318424 0.47908669 0.02297971 0.97706567]
 [0.72987524 0.55639743 0.77922902 0.16828958]]

Number of row and col (train): 75 4
 Number of row and col (test): 75 4
 Number of cluster: 6
 Weight:
[[0.17852687 0.16956183 0.57166996 0.61530898]
 [0.42234895 0.25236972 0.66332697 0.89055621]
 [6.45686715 2.89216585 5.04646756 1.72481468]
 [0.80809947 0.04312564 0.08988438 0.79305189]
 [0.64318424 0.47908669 0.02297971 0.97706567]
 [5.04297178 3.33091909 1.54729022 0.28644736]]

Test Result per instance:
1.  Cluster: C
2.  Cluster: C
3.  Cluster: C
4.  Cluster: C
5.  Cluster: F
6.  Cluster: C
7.  Cluster: C
8.  Cluster: C
9.  Cluster: F
10.  Cluster: F
11.  Cluster: C
12.  Cluster: F
13.  Cluster: C
1

In [94]:
# Scratch cell: prints the literal text "[]".
print("[]")

[]
