## Linear Model to Classify Iris Dataset

In [1]:
from sklearn.datasets import load_iris
import numpy as np
import keras
np.random.seed(10)

Using TensorFlow backend.


### Loading the Iris Dataset

In [2]:
iris = load_iris()
print(iris.keys())
X = iris['data']   # array([[5.1, 3.5, 1.4, 0.2], [4.9, 3. , 1.4, 0.2], ... ])
Y = iris['target'] # array([0, 1, 2, 0, ... ])
names = iris['target_names']  #['setosa', 'versicolor', 'virginica']
feature_names = iris['feature_names']  # ['sepal length (cm)', 
                                        # 'sepal width (cm)', 
                                        # 'petal length (cm)', 
                                        # 'petal width (cm)']
# To Track a few sample points
isamples = np.random.randint(len(Y), size = (5))
                   # array([ 9, 125, 15, 64, 113]) <-- random samples (example)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [3]:
print(X.shape, Y.shape)
print(X[isamples])
print(Y[isamples])

(150, 4) (150,)
[[4.9 3.1 1.5 0.1]
 [7.2 3.2 6.  1.8]
 [5.7 4.4 1.5 0.4]
 [5.6 2.9 3.6 1.3]
 [5.7 2.5 5.  2. ]]
[0 2 0 1 2]


### Categorial One-Hot Encoding

In [4]:
# Convert lables to categorial one-hot encoding
Ny = len(np.unique(Y))  # Ny = 3
Y = keras.utils.to_categorical(Y[:], num_classes = Ny)
# Y is np.ndarray now
print("X:", X[isamples, :])
print("Y:", Y[isamples])

X: [[4.9 3.1 1.5 0.1]
 [7.2 3.2 6.  1.8]
 [5.7 4.4 1.5 0.4]
 [5.6 2.9 3.6 1.3]
 [5.7 2.5 5.  2. ]]
Y: [[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


### Train Test Split (randomly into 80% - 20%)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                            test_size = 0.20, random_state = 1)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(120, 4)
(30, 4)
(120, 3)
(30, 3)


### Data Normalization: Zero-Mean & Unit-Variance

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)  # computes mean and std
X_train = scaler.transform(X_train)  # x = (x-mean)/std
X_test = scaler.transform(X_test)
print(X_train)

[[ 0.31553662 -0.04578885  0.44767531  0.23380268]
 [ 2.2449325  -0.04578885  1.29769171  1.39742892]
 [-0.2873996  -1.24028061  0.05100098 -0.15407273]
 [ 0.67729835 -0.52358555  1.01435291  1.13884531]
 [-0.04622511 -0.52358555  0.73101411  1.52672073]
 [-0.64916132  1.62649961 -1.30902526 -1.31769898]
 [-0.40798684 -1.71807731  0.10766874  0.10451088]
 [-0.76974857  0.90980456 -1.36569302 -1.31769898]
 [ 0.79788559 -0.52358555  0.44767531  0.36309449]
 [ 1.03906007 -1.24028061  1.12768843  0.7509699 ]
 [ 1.15964732 -0.04578885  0.95768515  1.13884531]
 [-0.89033581  1.14870291 -1.36569302 -1.18840717]
 [ 0.19494938 -1.95697567  0.67434635  0.36309449]
 [ 0.5567111  -0.2846872   1.01435291  0.7509699 ]
 [ 0.91847283 -0.2846872   0.44767531  0.10451088]
 [ 2.2449325  -1.00138226  1.75103379  1.39742892]
 [-0.16681235  1.86539796 -1.19568974 -1.18840717]
 [-1.01092305  1.38760126 -1.36569302 -1.31769898]
 [-1.25209754 -0.04578885 -1.36569302 -1.18840717]
 [-0.76974857 -0.76248391  0.05

### Least Square Solution

In [7]:
# Training with XW = Y
addlcol = lambda A: np.concatenate((A, np.ones((A.shape[0], 1))), axis = 1)
Ns, Nx = X_train.shape  # 120, 4
# XX = np.concatenate((X_train, np.ones((Ns, 1))), axis = 1)
XX = addlcol(X_train)
print(XX.shape)
YY = Y_train
print(YY.shape)
W = np.linalg.inv(XX.T.dot(XX)).dot(XX.T.dot(YY))
print(W)
def evaluate(X, W, Yd):
  ''' X is np.array (Nsamples, Nfeats);
     Yd is np.array (Nsamples, Nonehot) '''
  x = addlcol(X)
  yd = np.argmax(Yd, axis = 1)
  y = np.argmax(x.dot(W), axis = 1)
  print("CM:")
  print(confusion_matrix(yd, y))
evaluate(X_train, W, Y_train)
evaluate(X_test, W, Y_test)

(120, 5)
(120, 3)
[[ 0.06062508  0.05806364 -0.11868873]
 [ 0.10258458 -0.21112954  0.10854496]
 [-0.4070331   0.22897614  0.17805695]
 [-0.03856453 -0.30252108  0.34108561]
 [ 0.325       0.30833333  0.36666667]]
CM:
[[39  0  0]
 [ 0 22 15]
 [ 0  4 40]]
CM:
[[11  0  0]
 [ 0  6  7]
 [ 0  0  6]]


Since, in Lower-Dimension we are'nt able to separate the Data therefore, we now take the data to Higher Dimension.

In [8]:
addSqlcol = lambda A: np.concatenate((A, A**2, 
                    np.ones((A.shape[0], 1))), axis = 1)
Ns, Nx = X_train.shape  # 120, 4
XX = addSqlcol(X_train)
print(XX.shape)
YY = Y_train
print(YY.shape)
W = np.linalg.inv(XX.T.dot(XX)).dot(XX.T.dot(YY))
print(W)
def evaluate(X, W, Yd):
    '''  X is np.array (Nsamples, Nfeats);
        Yd is np.array (Nsamples, Nonehot) '''
    x = addSqlcol(X)
    yd = np.argmax(Yd, axis = 1)
    y = np.argmax(x.dot(W), axis = 1)
    print("CM:")
    print(confusion_matrix(yd, y))
evaluate(X_train, W, Y_train)
evaluate(X_test, W, Y_test)

(120, 9)
(120, 3)
[[ 0.02212851  0.1284788  -0.15060731]
 [ 0.02261966  0.00396191 -0.02658157]
 [-0.31265762 -0.00385162  0.31650925]
 [-0.05916917 -0.24649968  0.30566885]
 [-0.03340995  0.13233974 -0.09892979]
 [-0.00993575  0.01269958 -0.00276383]
 [ 0.21451737 -0.58556233  0.37104496]
 [ 0.04009892 -0.0947794   0.05468048]
 [ 0.1137294   0.84363575  0.04263485]]
CM:
[[39  0  0]
 [ 0 35  2]
 [ 0  2 42]]
CM:
[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]


Thus we obtained very high accuracy on the IRIS Dataset & also see that Data is mostly Linearly Separable in Higher Dimensional Space.

### Minimum Norm Solution

### For Under-determined System

In [9]:
addlcol = lambda A: np.concatenate((A, np.ones((A.shape[0], 1))), axis = 1)
ind = np.random.choice(range(X_train.shape[0]), size = 12, replace = False)
XX = X_train[ind, :]
XX = addlcol(XX)
YY = Y_train[ind, :]
W = XX.T.dot(np.linalg.inv(XX.dot(XX.T)).T.dot(YY)) # <--- X' (X X')^(-1) Y
print(W, XX.shape, YY.shape)
def evaluate(X, W, Yd):
  ''' X is np.array (Nsamples, Nfeats);
     Yd is np.array (Nsamples, Nonehot) '''
  x = addlcol(X)
  yd = np.argmax(Yd, axis = 1)
  y = np.argmax(x.dot(W), axis = 1)
  print("CM:")
  print(confusion_matrix(yd, y))
evaluate(X_train[ind, :], W, YY)
evaluate(X_test, W, Y_test)

[[ 0.20961103  0.58704205 -0.51231043]
 [ 0.3852091  -3.00142755  1.30941257]
 [ 1.25120866 -3.41458449  2.13456555]
 [ 0.14802282  1.96431021 -1.77695432]
 [-0.703125    2.4375      0.5625    ]] (12, 5) (12, 3)
CM:
[[0 2 1]
 [0 6 1]
 [1 1 0]]
CM:
[[ 0  4  7]
 [ 0 11  2]
 [ 1  3  2]]


Since, we did'nt use the entire data in the Minimum Norm Soln $(n=12)$ therefore, we don't get high accuracy.  

### For Under-determined System, with Square Input Features too

In [10]:
addSqlcol = lambda A: np.concatenate((A, A**2, np.ones((A.shape[0], 1))), axis = 1)
ind = np.random.choice(range(X_train.shape[0]), size = 25, replace = False)
XX = X_train[ind, :]
XX = addSqlcol(XX)
YY = Y_train[ind, :]
W = XX.T.dot(np.linalg.inv(XX.dot(XX.T)).T.dot(YY)) # <--- X' (X X')^(-1) Y
print(W, XX.shape, YY.shape)
def evaluate(X, W, Yd):
  ''' X is np.array (Nsamples, Nfeats);
     Yd is np.array (Nsamples, Nonehot) '''
  x = addSqlcol(X)
  yd = np.argmax(Yd, axis = 1)
  y = np.argmax(x.dot(W), axis = 1)
  print("CM:")
  print(confusion_matrix(yd, y))
evaluate(X_train[ind, :], W, YY)
evaluate(X_test, W, Y_test)

[[ 3.47226748e-01  3.63760253e+00 -3.68230723e+00]
 [ 1.97234620e-01 -1.75836953e+01  1.65363723e+01]
 [ 5.33253777e-01  1.29454482e+01 -1.25762651e+01]
 [-6.50822476e-01  2.04649360e+00 -9.84502537e-01]
 [ 7.00586907e-03 -6.63301314e+00  7.72945840e+00]
 [-4.14679850e-01 -5.32558420e+00  4.52391144e+00]
 [ 2.48117879e-01 -1.05201103e+01  1.04398216e+01]
 [ 3.44167834e-01 -1.04225873e+01  1.08279327e+01]
 [-3.12500000e-01  8.59375000e+00 -5.56250000e+00]] (25, 9) (25, 3)
CM:
[[0 0 8]
 [0 6 0]
 [0 4 7]]
CM:
[[ 0  0 11]
 [ 0 12  1]
 [ 0  1  5]]


Since, we did'nt use the entire data in the Minimum Norm Soln $(n=25)$ therefore, we don't get high accuracy. There isn't any reason to use Minimum Norm Solution (since for IRIS Dataset we have already have large data) Thus Least Square Solution/ Pseudo- Inverse Soln. gives a Higher Accuracy.
