In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("iris.data", header=None)
X = df[[0,1,2,3]].values
y = df[[4]].values

encoder = LabelEncoder()
label_encoder = encoder.fit(y)
y = label_encoder.transform(y)
label_dict = {0:'Iris-setosa',1:'Iris-versicolor',2:'Iris-virginica'}
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
### 1. Computing mean vector m of each class

np.set_printoptions(precision=4)

mean_vectors = []
for cl in range(3):
    mean_vectors.append(np.mean(X[y==cl],axis=0))
    print('Mean Vector class %s %s: \n%s' %(cl, label_dict[cl] ,mean_vectors[cl]))

Mean Vector class 0 Iris-setosa: 
[5.006 3.418 1.464 0.244]
Mean Vector class 1 Iris-versicolor: 
[5.936 2.77  4.26  1.326]
Mean Vector class 2 Iris-virginica: 
[6.588 2.974 5.552 2.026]


In [5]:
### 2. Computing the Scatter Matrices S_W

# 2.1 within-class scatters of each class S
scatter_list = [] # save scatter matrix of every class

for cl,mv in zip(range(3), mean_vectors):
    one_class_sc_mat = np.zeros((4,4))              # scatter matrix for every class
    for row in X[y==cl]:
        row, mv = row.reshape(4,1), mv.reshape(4,1) # make column vectors
        one_class_sc_mat += (row-mv).dot((row-mv).T)
    scatter_list.append(one_class_sc_mat)
    print('scatter of class %s %s : \n%s' % (cl, label_dict[cl], scatter_list[cl]))


scatter of class 0 Iris-setosa : 
[[6.0882 4.9146 0.7908 0.5168]
 [4.9146 7.1138 0.5724 0.5604]
 [0.7908 0.5724 1.4752 0.2792]
 [0.5168 0.5604 0.2792 0.5632]]
scatter of class 1 Iris-versicolor : 
[[13.0552  4.174   8.962   2.7332]
 [ 4.174   4.825   4.05    2.019 ]
 [ 8.962   4.05   10.82    3.582 ]
 [ 2.7332  2.019   3.582   1.9162]]
scatter of class 2 Iris-virginica : 
[[19.8128  4.5944 14.8612  2.4056]
 [ 4.5944  5.0962  3.4976  2.3338]
 [14.8612  3.4976 14.9248  2.3924]
 [ 2.4056  2.3338  2.3924  3.6962]]


In [6]:
# 2.2 computing the Scatter Matrices for each pair
# Iris-setosa +  Iris-versicolor - 
S_W_01 = scatter_list[0] + scatter_list[1]

# Iris-setosa  + Iris-virginica -
S_W_02 = scatter_list[0] + scatter_list[2]

# Iris-versicolor + Iris-virginica -
S_W_12 = scatter_list[1] + scatter_list[2]

In [7]:
### 3. calculate ww = (S_W)^-1 (m_1-m_2)

# Iris-setosa +   Iris-versicolor - 
ww_01 = np.linalg.inv(S_W_01).dot(mean_vectors[0]-mean_vectors[1])

# Iris-setosa +   Iris-virginica -
ww_02 = np.linalg.inv(S_W_01).dot(mean_vectors[0]-mean_vectors[2])

# Iris-versicolor +   Iris-virginica -
ww_12 = np.linalg.inv(S_W_01).dot(mean_vectors[1]-mean_vectors[2])

print ww_01
print ww_02
print ww_12

[ 0.029   0.1879 -0.2167 -0.3325]
[ 0.0417  0.2398 -0.2632 -0.6129]
[ 0.0126  0.0519 -0.0466 -0.2804]


In [84]:
# test boundary of class 0 Iris-setosa and 1 Iris-versicolor
acc = 0
total = 0 
for data in df.loc[df[4] != 'Iris-virginica'].values:
    if data[4] == 'Iris-setosa' and data[:4].dot(ww_01) > 0:
        acc += 1
    if data[4] == 'Iris-versicolor' and data[:4].dot(ww_01) < 0:
        acc += 1
    total += 1
print 'accuracy of classifying Iris-setosa and Iris-versicolor', acc * 1.0 / total

accuracy of classifying Iris-setosa and Iris-versicolor 1.0


In [85]:
# test boundary of class 0 Iris-setosa and 2 Iris-virginica
acc = 0
total = 0 
for data in df.loc[df[4] != 'Iris-versicolor'].values:
    if data[4] == 'Iris-setosa' and data[:4].dot(ww_02) > 0:
        acc += 1
    if data[4] == 'Iris-virginica' and data[:4].dot(ww_02) < 0:
        acc += 1
    total += 1
print 'accuracy of classifying Iris-setosa and Iris-virginica', acc * 1.0 / total

accuracy of classifying Iris-setosa and Iris-virginica 1.0


In [86]:
# test boundary of class 1 Iris-versicolor and 2 Iris-virginica
acc = 0
total = 0 
for data in df.loc[df[4] != 'Iris-setosa'].values:
    if data[4] == 'Iris-versicolor' and data[:4].dot(ww_12) > 0:
        acc += 1
    if data[4] == 'Iris-virginica' and data[:4].dot(ww_12) < 0:
        acc += 1
    total += 1
print 'accuracy of classifying Iris-versicolor and Iris-virginica', acc * 1.0 / total

accuracy of classifying Iris-versicolor and Iris-virginica 0.5
