<a href="https://colab.research.google.com/github/alzimna/Anmul/blob/main/chapters/Bab_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import numpy as np
import pandas as pd
import scipy
import math

def accuracy_rate(df) :
  """Return the fraction of a table's total that lies on its main diagonal.

  Args:
    df: pandas DataFrame of counts (the cells below pass classification
      tables whose diagonal holds the correctly classified counts).

  Returns:
    Sum of the diagonal entries divided by the sum of all entries.
  """
  counts = df.to_numpy()
  return np.trace(counts) / counts.sum()

# **BAB 9**

##  9.2 CLASSIFICATION INTO TWO GROUPS

### **Example 9.2**

For the psychological data of Table 5.1, $\bar{\mathbf{y}}_1$, $\bar{\mathbf{y}}_2$, and $\mathbf{S}_{pl}$ were obtained in Example 5.4.2. The discriminant function coefficients were obtained in Example 5.5 as $\mathbf{a}' = (.5104, -.2032, .4660, -.3097)$. For $G_1$ (the male group), we find
$$
\begin{aligned}
\bar{z}_1 = \mathbf{a}'\bar{\mathbf{y}}_1 &= .5104(15.97) - .2032(15.91) + .4660(27.19) - .3097(22.75)\\
&= 10.5427.
\end{aligned}
$$

Similarly, for $G_2$ (the female group), $\bar{z}_2 = \mathbf{a}'\bar{\mathbf{y}}_2 = 4.4426$. Thus we assign an observation vector $\mathbf{y}$ to $G_1$ if
$$
z = \mathbf{a}'\mathbf{y} > \frac{1}{2}(\bar{z}_1+\bar{z}_2) = 7.4927
$$
and assign $\mathbf{y}$ to $G_2$ if $z < 7.4927$.

There are no new observations available, so we will illustrate the procedure
by classifying two of the observations in $G_1$. For $\mathbf{y}'_{11} = (15, 17, 24, 14)$, the first observation in $G_1$, we have $z_{11} = \mathbf{a}'\mathbf{y}_{11} = .5104(15) - .2032(17) + .4660(24) - .3097(14) = 11.0498$, which is greater than $7.4927$, so $\mathbf{y}_{11}$ is correctly classified as belonging to $G_1$. For $\mathbf{y}'_{14} = (13, 12, 10, 16)$, the fourth observation in $G_1$, we find $z_{14} = \mathbf{a}'\mathbf{y}_{14} = 3.9016$, which is less than $7.4927$, so $\mathbf{y}_{14}$ is misclassified into $G_2$.

In [31]:
# Load the two-group data set (file t5.1.xlsx) and preview the first rows.
url = 'https://github.com/alzimna/Anmul/raw/main/data/t5.1.xlsx'
data = pd.read_excel(url)
print(data.head())

# The first four columns are used as group 1 and the remaining columns as
# group 2. Mean vectors are rounded to 2 decimals and covariance matrices to
# 3 decimals before any further arithmetic (presumably to mirror the
# precision of the worked example in the text above).
y1_bar = data.iloc[:, :4].mean().round(2).to_numpy()
y2_bar = data.iloc[:, 4:].mean().round(2).to_numpy()
S1 = data.iloc[:, :4].cov().round(3).to_numpy()
S2 = data.iloc[:, 4:].cov().round(3).to_numpy()

# Pooled covariance matrix for two samples of 32 observations each.
S_pl = 1/(32+32-2)*((32-1)*S1+(32-1)*S2)

# Discriminant coefficients a = S_pl^{-1} (y1_bar - y2_bar), the projected
# group means, and the midpoint between them used as the cutoff.
a = np.linalg.inv(S_pl) @ (y1_bar - y2_bar)
z1_bar = a @ y1_bar
z2_bar = a @ y2_bar
z_batas = 1/2*(z1_bar+z2_bar)

for label, value in (("z1_bar = ", z1_bar), ("z2_bar = ", z2_bar), ("z_batas = ", z_batas)) :
  print(label, value)

   X1  X2  X3  X4  Y1  Y2  Y3  Y4
0  15  17  24  14  13  14  12  21
1  17  15  32  26  14  12  14  26
2  15  14  29  23  12  19  21  21
3  13  12  10  16  12  13  10  16
4  20  17  26  28  11  20  16  16
z1_bar =  10.55008844072141
z2_bar =  4.444515352516764
z_batas =  7.497301896619087


In [30]:
# Recompute the two group covariance matrices without rounding.
# Note: this rebinds S1 and S2 to pandas DataFrames, whereas the Example 9.2
# cell stores 3-decimal NumPy arrays under the same names.
S1 = data.iloc[:,:4].cov()
S2 = data.iloc[:,4:].cov()
# Last expression of the cell: show the unrounded S1 as a NumPy array.
S1.values

array([[ 5.19254032,  4.5453629 ,  6.52217742,  5.25      ],
       [ 4.5453629 , 13.18447581,  6.76008065,  6.26612903],
       [ 6.52217742,  6.76008065, 28.6733871 , 14.46774194],
       [ 5.25      ,  6.26612903, 14.46774194, 16.64516129]])

In [32]:
# Display whatever S1 currently refers to.
# NOTE(review): the recorded output is the 3-decimal array that the
# Example 9.2 cell assigns, not the unrounded DataFrame from the cell directly
# above; together with the execution counts (30, 31, 32) this suggests the
# cells were run out of order — confirm with Restart & Run All.
S1

array([[ 5.193,  4.545,  6.522,  5.25 ],
       [ 4.545, 13.184,  6.76 ,  6.266],
       [ 6.522,  6.76 , 28.673, 14.468],
       [ 5.25 ,  6.266, 14.468, 16.645]])

## 9.3 CLASSIFICATION INTO SEVERAL GROUPS

### **Example 9.3**

In [None]:
# Import only the sympy name that is needed. `from sympy import *` rebinds
# many short names, including E, which this cell uses for the error matrix.
from sympy import symbols

# Load the three-group data set (file t8.3.xlsx): the first column is the
# group label, the remaining columns are the measurements.
url = 'https://github.com/alzimna/Anmul/raw/main/data/t8.3.xlsx'
data = pd.read_excel(url)
data["Group"] = data["Group"].astype('category')
Y = np.asmatrix(data.iloc[:,1:].values)

# Group mean vectors, rounded to one decimal.
y1_bar = round(data[data["Group"]==1].iloc[:,1:].mean(),1).values
y2_bar = round(data[data["Group"]==2].iloc[:,1:].mean(),1).values
y3_bar = round(data[data["Group"]==3].iloc[:,1:].mean(),1).values
means = (y1_bar, y2_bar, y3_bar)
for name, vec in (("y1_bar", y1_bar), ("y2_bar", y2_bar), ("y3_bar", y3_bar)) :
  print(name + " =\n", vec, sep = "")

# Compute E, the within-group sums-of-squares-and-products matrix.
# Rows are taken to be sorted by group in three equal blocks of n rows (the
# same assumption the fixed 30/60 thresholds made), so row j is in block j // n.
p = int(data.shape[1]-1)
n = int(data.shape[0]/3)
E = np.asmatrix(np.zeros((p,p)))
for j in range(n*3) :
  dev = Y[j] - means[j // n]
  E += np.transpose(dev) @ dev

# Compute S_pl = E / (N - k) with N = 3n observations and k = 3 groups,
# rounded to 3 decimals.
S_pl = np.matrix.round(E/(3*n-3),3)

# Build the linear classification functions
#   L_i(y) = y_i_bar' S_pl^{-1} y - (1/2) y_i_bar' S_pl^{-1} y_i_bar
# as sympy expressions with coefficients and constant rounded to one decimal.
y1,y2,y3,y4,y5,y6 = symbols('y1 y2 y3 y4 y5 y6')
y = [y1,y2,y3,y4,y5,y6]
S_inv = np.linalg.inv(S_pl)  # invert once instead of once per term

def _linear_function(mean_vec) :
  """Return L_i(y) for one group mean as a sympy expression in y1..y6."""
  coef = np.asarray(mean_vec @ S_inv).ravel()
  const = np.round((coef @ mean_vec)/2, 1)
  # Convert to plain Python floats so sympy receives scalars rather than
  # NumPy matrices.
  terms = [float(c)*sym for c, sym in zip(np.round(coef, 1), y)]
  return sum(terms) - float(const)

L_1, L_2, L_3 = (_linear_function(m) for m in means)
print("L_1(y) = ",L_1)
print("L_2(y) = ",L_2)
print("L_3(y) = ",L_3)

# Evaluate every L_i at two observations, labelled y11 and y13.
y11 = (13.5, 57.2, 19.5, 12.5, 14.0, 11.0)
y13 = (14.5, 55.9, 19, 10, 13, 12)
for tag, obs in (("y11", y11), ("y13", y13)) :
  point = list(zip(y, obs))
  for k, rule in enumerate((L_1, L_2, L_3), start=1) :
    print(f"L_{k}({tag}) = ", rule.subs(point))

y1_bar =
[15.2 59.  20.1 13.1 14.7 12.3]
y2_bar =
[15.4 57.4 19.8 10.1 13.5 11.9]
y3_bar =
[15.6 57.8 19.8 10.9 13.7 11.8]
L_1(y) =  7.7*y1 + 13.3*y2 + 3.9*y3 - 1.2*y4 + 14.7*y5 + 8.3*y6 - 640.6
L_2(y) =  10.2*y1 + 13.3*y2 + 4.0*y3 - 3.4*y4 + 13.4*y5 + 6.0*y6 - 607.1
L_3(y) =  11.0*y1 + 13.3*y2 + 3.9*y3 - 2.7*y4 + 13.2*y5 + 5.2*y6 - 614.2
L_1(y11) =  582.260000000000
L_2(y11) =  580.460000000000
L_3(y11) =  579.360000000000
L_1(y13) =  567.320000000000
L_2(y13) =  572.470000000000
L_3(y13) =  569.870000000000


## 9.4  ESTIMATING MISCLASSIFICATION RATES

### **Example 9.4(a)**

In [17]:
# Rebuild the two-group discriminant rule (same steps as Example 9.2) so that
# this cell can be run on its own.
url = 'https://github.com/alzimna/Anmul/raw/main/data/t5.1.xlsx'
data = pd.read_excel(url)
print(data.head())
y1_bar = data.iloc[:, :4].mean().round(2).to_numpy()
y2_bar = data.iloc[:, 4:].mean().round(2).to_numpy()
S1 = data.iloc[:, :4].cov().round(3).to_numpy()
S2 = data.iloc[:, 4:].cov().round(3).to_numpy()
S_pl = 1/(32+32-2)*((32-1)*S1+(32-1)*S2)
a = np.linalg.inv(S_pl) @ (y1_bar - y2_bar)
z1_bar = a @ y1_bar
z2_bar = a @ y2_bar
z_batas = 1/2*(z1_bar+z2_bar)

# hits[(g, k)] counts observations taken from group g that the rule assigns
# to group k; a score above the cutoff z_batas means "assign to group 1".
hits = {(g, k): 0 for g in (1, 2) for k in (1, 2)}
for i in range(32) :
  # Row i holds one group-1 observation (first four columns) ...
  y1 = data.iloc[i,:4]
  z_y1 = np.matmul(a,y1)
  hits[1, 1 if z_y1 > z_batas else 2] += 1

  # ... and one group-2 observation (remaining columns).
  y2 = data.iloc[i,4:]
  z_y2 = np.matmul(a,y2)
  hits[2, 1 if z_y2 > z_batas else 2] += 1

n_11, n_12 = hits[1, 1], hits[1, 2]
n_21, n_22 = hits[2, 1], hits[2, 2]

# Build the classification table (rows: actual group, columns: assigned group).
print("Tabel klasifikasi data psikologi")
d = {'1': [n_11, n_21], '2': [n_12, n_22]}
df = pd.DataFrame(data=d,index = [1,2])
print(df)
acc = accuracy_rate(df)
print("Accuracy rate = ",acc)

   X1  X2  X3  X4  Y1  Y2  Y3  Y4
0  15  17  24  14  13  14  12  21
1  17  15  32  26  14  12  14  26
2  15  14  29  23  12  19  21  21
3  13  12  10  16  12  13  10  16
4  20  17  26  28  11  20  16  16
Tabel klasifikasi data psikologi
    1   2
1  28   4
2   4  28
Accuracy rate =  0.875


### **Example 9.4(b)**


In [18]:
# Load the three-group data set (file t8.3.xlsx): the first column is the
# group label, the remaining columns are the measurements.
url = 'https://github.com/alzimna/Anmul/raw/main/data/t8.3.xlsx'
data = pd.read_excel(url)
data["Group"] = data["Group"].astype('category')
Y = np.asmatrix(data.iloc[:,1:].values)

# Group mean vectors, rounded to one decimal.
y1_bar = round(data[data["Group"]==1].iloc[:,1:].mean(),1).values
y2_bar = round(data[data["Group"]==2].iloc[:,1:].mean(),1).values
y3_bar = round(data[data["Group"]==3].iloc[:,1:].mean(),1).values
means = (y1_bar, y2_bar, y3_bar)

# Compute E, the within-group sums-of-squares-and-products matrix.
# Rows are taken to be sorted by group in three equal blocks of n rows (the
# same assumption the fixed 30/60 thresholds made), so row j is in block j // n.
p = int(data.shape[1]-1)
n = int(data.shape[0]/3)
E = np.asmatrix(np.zeros((p,p)))
for j in range(n*3) :
  dev = Y[j] - means[j // n]
  E += np.transpose(dev) @ dev

# Compute S_pl = E / (N - k) with N = 3n observations and k = 3 groups,
# rounded to 3 decimals.
S_pl = np.matrix.round(E/(3*n-3),3)

# Loop-invariant parts of L_i(y) = c_i y - c_i y_i_bar / 2, where
# c_i = y_i_bar' S_pl^{-1}. They are computed once here instead of inverting
# S_pl six times for every observation.
S_inv = np.linalg.inv(S_pl)
coefs = [np.asmatrix(m @ S_inv) for m in means]
# .item() extracts the single element explicitly; float() on an array with
# ndim > 0 is deprecated since NumPy 1.25.
consts = [np.asarray(c @ m).item()/2 for c, m in zip(coefs, means)]

# n_g[k] counts observations of group g+... assigned to group k+1:
# one list per actual group, one slot per assigned group.
n_1 = [0]*3
n_2 = [0]*3
n_3 = [0]*3
dfn = [n_1,n_2,n_3]
for i in range(n*3) :
  temp = [(c @ np.transpose(Y[i])).item() - k for c, k in zip(coefs, consts)]
  # Assign to the group with the largest L_i (first one wins on exact ties).
  dfn[i // n][temp.index(max(temp))] += 1

# Build the classification table (rows: actual group, columns: assigned group).
print("Tabel klasifikasi data football (Linear Classification)")
df = pd.DataFrame(dfn,columns = [1,2,3],index = [1,2,3])
print(df)
acc = accuracy_rate(df)
print("Accuracy rate = ",acc)

Tabel klasifikasi data football (Linear Classification)
    1   2   3
1  26   1   3
2   1  21   8
3   2   8  20
Accuracy rate =  0.7444444444444445


In [19]:
# Quadratic classification. This cell reuses data, Y, n and the rounded group
# means y1_bar, y2_bar, y3_bar left behind by the linear-classification cell
# directly above, so that cell has to be run first.

# Sample covariance matrix of each group and its determinant.
S1 = data[data["Group"]==1].iloc[:,1:].cov()
S2 = data[data["Group"]==2].iloc[:,1:].cov()
S3 = data[data["Group"]==3].iloc[:,1:].cov()
dS1 = np.linalg.det(S1)
dS2 = np.linalg.det(S2)
dS3 = np.linalg.det(S3)

# Loop-invariant pieces of Q_i(y) = -1/2 ln|S_i| - 1/2 (y - y_i_bar)' S_i^{-1} (y - y_i_bar),
# computed once per group instead of once per observation:
# (mean vector, inverse covariance, -1/2 ln|S_i|).
group_terms = [
  (centre, np.linalg.inv(cov.to_numpy()), -1/2*np.log(abs(det)))
  for centre, cov, det in ((y1_bar, S1, dS1), (y2_bar, S2, dS2), (y3_bar, S3, dS3))
]

# One list per actual group, one slot per assigned group.
n_1 = [0]*3
n_2 = [0]*3
n_3 = [0]*3
dfn = [n_1,n_2,n_3]
for i in range(n*3) :
  temp = []
  for centre, cov_inv, log_term in group_terms :
    dev = Y[i] - centre
    # .item() extracts the single element explicitly; float() on an array
    # with ndim > 0 is deprecated since NumPy 1.25.
    quad = np.asarray(dev @ cov_inv @ np.transpose(dev)).item()
    temp.append(log_term - 1/2*quad)
  # Rows are taken to be sorted by group in blocks of n, so row i belongs to
  # block i // n; assign to the group with the largest Q_i.
  dfn[i // n][temp.index(max(temp))] += 1

# Build the classification table (rows: actual group, columns: assigned group).
print("Tabel klasifikasi data football (Quadratic Classification)")
df = pd.DataFrame(dfn,columns = [1,2,3],index = [1,2,3])
print(df)
acc = accuracy_rate(df)
print("Accuracy rate = ",acc)

Tabel klasifikasi data football (Quadratic Classification)
    1   2   3
1  27   1   2
2   2  24   4
3   1   5  24
Accuracy rate =  0.8333333333333334


## 9.5 IMPROVED ESTIMATES OF ERROR RATES

### **Example 9.5.2**

In [20]:
# Load the three-group data set (file t8.3.xlsx) with the label column stored
# as a categorical, and keep the measurement columns as a NumPy matrix.
url = 'https://github.com/alzimna/Anmul/raw/main/data/t8.3.xlsx'
data = pd.read_excel(url).astype({"Group": "category"})
Y = np.asmatrix(data.iloc[:, 1:].to_numpy())

def holdout(i, frame=None) :
  """Classify observation i with linear functions fitted without it.

  The row at position i is removed, the group means and pooled covariance
  matrix are estimated from the remaining rows, and the removed row is then
  assigned to the group whose linear classification function
  L_k(y) = y_k_bar' S_pl^{-1} y - (1/2) y_k_bar' S_pl^{-1} y_k_bar is largest.

  Each remaining row is centred on the mean of the group named in its own
  "Group" label. Centring by fixed row positions (0-29, 30-59, 60+) is wrong
  once a row has been dropped: removing a row from group 1 or 2 shifts every
  later row up by one, so a boundary row would be centred on the mean of the
  wrong group.

  Args:
    i: positional index of the observation to hold out.
    frame: DataFrame with a "Group" column (labels 1, 2, 3) followed by the
      numeric measurement columns. Defaults to the notebook-level `data`.

  Returns:
    0, 1 or 2 - the zero-based index of the predicted group (first maximum
    wins on exact ties).
  """
  source = data if frame is None else frame
  y = source.iloc[:, 1:].to_numpy(dtype=float)[i]
  # Drop by the label found at position i so that label and position agree.
  temp = source.drop(source.index[i])

  groups = (1, 2, 3)
  p = int(temp.shape[1]-1)
  E = np.zeros((p, p))
  means = []
  for g in groups :
    rows = temp[temp["Group"]==g].iloc[:,1:]
    # Group mean rounded to one decimal, as in the other cells.
    centre = rows.mean().round(1).to_numpy(dtype=float)
    means.append(centre)
    dev = rows.to_numpy(dtype=float) - centre
    E += dev.T @ dev

  # Compute S_pl = E / (N - k) on the reduced sample, rounded to 3 decimals.
  S_pl = np.round(E/(len(temp)-len(groups)), 3)
  S_inv = np.linalg.inv(S_pl)

  L = []
  for centre in means :
    coef = centre @ S_inv
    L.append(float(coef @ y - (coef @ centre)/2))
  return(L.index(max(L)))

# One list per actual group (rows 0-29, 30-59, 60-89), one slot per group
# index returned by holdout().
n_1 = [0, 0, 0]
n_2 = [0, 0, 0]
n_3 = [0, 0, 0]
dfn = [n_1,n_2,n_3]
for j in range(90) :
  grup = holdout(j)
  dfn[j // 30][grup] += 1

# Build the classification table (rows: actual group, columns: assigned group).
print("Tabel klasifikasi data football (Linear Classification With holdout)")
df = pd.DataFrame(dfn,columns = [1,2,3],index = [1,2,3])
print(df)
acc = accuracy_rate(df)
print("Accuracy rate = ",acc)

Tabel klasifikasi data football (Linear Classification With holdout)
    1   2   3
1  26   1   3
2   1  16  13
3   2   8  20
Accuracy rate =  0.6888888888888889


## 9.6 SUBSET SELECTION




### **Example 9.6**

In [21]:
# Load the three-group data set (file t8.3.xlsx) and keep only the group
# label plus the four selected variables.
url = 'https://github.com/alzimna/Anmul/raw/main/data/t8.3.xlsx'
data = pd.read_excel(url)[["Group","EYEHD","WDIM","JAW","EARHD"]]
data["Group"] = data["Group"].astype('category')
Y = np.asmatrix(data.iloc[:,1:].values)

# Group mean vectors, rounded to one decimal.
y1_bar = round(data[data["Group"]==1].iloc[:,1:].mean(),1).values
y2_bar = round(data[data["Group"]==2].iloc[:,1:].mean(),1).values
y3_bar = round(data[data["Group"]==3].iloc[:,1:].mean(),1).values
means = (y1_bar, y2_bar, y3_bar)

# Compute E, the within-group sums-of-squares-and-products matrix.
# Rows are taken to be sorted by group in three equal blocks of n rows (the
# same assumption the fixed 30/60 thresholds made), so row j is in block j // n.
p = int(data.shape[1]-1)
n = int(data.shape[0]/3)
E = np.asmatrix(np.zeros((p,p)))
for j in range(n*3) :
  dev = Y[j] - means[j // n]
  E += np.transpose(dev) @ dev

# Compute S_pl = E / (N - k) with N = 3n observations and k = 3 groups,
# rounded to 3 decimals.
S_pl = np.matrix.round(E/(3*n-3),3)

# Loop-invariant parts of L_i(y) = c_i y - c_i y_i_bar / 2, where
# c_i = y_i_bar' S_pl^{-1}. They are computed once here instead of inverting
# S_pl six times for every observation.
S_inv = np.linalg.inv(S_pl)
coefs = [np.asmatrix(m @ S_inv) for m in means]
# .item() extracts the single element explicitly; float() on an array with
# ndim > 0 is deprecated since NumPy 1.25.
consts = [np.asarray(c @ m).item()/2 for c, m in zip(coefs, means)]

# One list per actual group, one slot per assigned group.
n_1 = [0]*3
n_2 = [0]*3
n_3 = [0]*3
dfn = [n_1,n_2,n_3]
for i in range(n*3) :
  temp = [(c @ np.transpose(Y[i])).item() - k for c, k in zip(coefs, consts)]
  # Assign to the group with the largest L_i (first one wins on exact ties).
  dfn[i // n][temp.index(max(temp))] += 1

# Build the classification table (rows: actual group, columns: assigned group).
print("Tabel klasifikasi data football (Linear Classification With Subset Selection)")
df = pd.DataFrame(dfn,columns = [1,2,3],index = [1,2,3])
print(df)
acc = accuracy_rate(df)
print("Accuracy rate = ",acc)

Tabel klasifikasi data football (Linear Classification With Subset Selection)
    1   2   3
1  26   1   3
2   1  21   8
3   2   7  21
Accuracy rate =  0.7555555555555555


## 9.7 NONPARAMETRIC PROCEDURES

### **Example 9.7.3**

In [33]:
# Load the three-group data set (file t8.3.xlsx): the first column is the
# group label, the remaining columns are the measurements.
url = 'https://github.com/alzimna/Anmul/raw/main/data/t8.3.xlsx'
data = pd.read_excel(url)
data["Group"] = data["Group"].astype('category')
Y = np.asmatrix(data.iloc[:,1:].values)

# Group mean vectors, rounded to one decimal.
y1_bar = round(data[data["Group"]==1].iloc[:,1:].mean(),1).values
y2_bar = round(data[data["Group"]==2].iloc[:,1:].mean(),1).values
y3_bar = round(data[data["Group"]==3].iloc[:,1:].mean(),1).values
means = (y1_bar, y2_bar, y3_bar)

# Compute E, the within-group sums-of-squares-and-products matrix.
# Rows are taken to be sorted by group in three equal blocks of n rows (the
# same assumption the fixed 30/60 thresholds made), so row j is in block j // n.
p = int(data.shape[1]-1)
n = int(data.shape[0]/3)
E = np.asmatrix(np.zeros((p,p)))
for j in range(n*3) :
  dev = Y[j] - means[j // n]
  E += np.transpose(dev) @ dev

# Compute S_pl = E / (N - k) with N = 3n observations and k = 3 groups,
# rounded to 3 decimals.
S_pl = np.matrix.round(E/(3*n-3),3)

# Invert once: the inverse does not change inside the double loop below.
S_inv = np.linalg.inv(S_pl)

# One list per actual group, one slot per assigned group.
n_1 = [0]*3
n_2 = [0]*3
n_3 = [0]*3
dfn = [n_1,n_2,n_3]
for i in range(n*3) :
  y = Y[i]
  # Squared distance (y - y_k)' S_pl^{-1} (y - y_k) to every observation.
  distance = []
  for k in range(n*3) :
    diff = y - Y[k]
    # .item() extracts the single element explicitly; float() on an array
    # with ndim > 0 is deprecated since NumPy 1.25.
    d = (diff @ S_inv @ np.transpose(diff)).item()
    distance.append(d)
  # Skip position 0 of the sorted order (distance 0 to the observation
  # itself) and take the next five; integer division by n turns each row
  # position into a group index 0..2. Named neighbour_groups rather than
  # `id` so the builtin id() is not shadowed for the rest of the session.
  neighbour_groups = np.array(distance).argsort()[1:6]//n
  vote = [0]*3
  for item in neighbour_groups :
    vote[item]+=1
  m = max(vote)
  if vote.count(m)==1 :
    dfn[i // n][vote.index(m)] += 1
  else :
    # Tied vote: the observation is left unclassified and only reported.
    print(i,vote)

# Build the classification table (rows: actual group, columns: assigned group).
print("Tabel klasifikasi data football (knn)")
df = pd.DataFrame(dfn,columns = [1,2,3],index = [1,2,3])
print(df)
acc = accuracy_rate(df)
print("Accuracy rate = ",acc)

0 [2, 2, 1]
18 [2, 1, 2]
20 [2, 1, 2]
42 [1, 2, 2]
46 [1, 2, 2]
48 [1, 2, 2]
67 [1, 2, 2]
79 [2, 1, 2]
82 [1, 2, 2]
88 [1, 2, 2]
Tabel klasifikasi data football (knn)
    1   2   3
1  23   0   4
2   1  14  12
3   2   6  18
Accuracy rate =  0.6875


In [23]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbour classifier on the full data set and predict the
# same rows again. This cell reuses `data` from the cell above.
# NOTE(review): no metric is passed, so the library default is used
# (presumably Euclidean rather than the S_pl-based distance of the manual
# version), and predicting on the training rows presumably lets each point
# count as its own neighbour — confirm before comparing the two tables.
knn_model = KNeighborsClassifier(n_neighbors=5)
X = data.iloc[:,1:].values
y = data.iloc[:,0].values
knn_model.fit(X, y)
train_preds = knn_model.predict(X)

# One list per actual group (rows 0-29, 30-59, 60-89); predicted labels
# 1..3 are shifted to list positions 0..2.
n_1 = [0, 0, 0]
n_2 = [0, 0, 0]
n_3 = [0, 0, 0]
dfn = [n_1,n_2,n_3]
for i in range(90) :
  dfn[i // 30][train_preds[i]-1] += 1

# Build the classification table (rows: actual group, columns: assigned group).
print("Tabel klasifikasi data football (Package KNN)")
df = pd.DataFrame(dfn,columns = [1,2,3],index = [1,2,3])
print(df)
acc = accuracy_rate(df)
print("Accuracy rate = ",acc)

Tabel klasifikasi data football (Package KNN)
    1   2   3
1  28   1   1
2   1  21   8
3   3   9  18
Accuracy rate =  0.7444444444444445
