# In [89]:
################################################
## EE559 HW Wk2, Prof. Jenkins, Spring 2018
## Created by Arindam Jati, TA
## Tested in Python 3.6.3, OSX El Capitan
################################################

import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

def plotDecBoundaries(training, label_train, sample_mean, inc=0.005):
    """Plot decision regions and data for a minimum-distance-to-class-mean classifier.

    The feature plane is sampled on a regular grid, every grid point is
    assigned to the class whose mean is nearest (Euclidean distance via
    ``cdist``), and the resulting label image is shown as the plot
    background.  The training points and the class means are drawn on top.

    Only two or three classes are drawn: classes 1 and 2 always, class 3
    only when the largest label equals 3.

    Args:
        training: 2-D array of training data; column 0 is used as the x
            feature and column 1 as the y feature.
        label_train: class labels corresponding to the rows of ``training``
            (compared against 1, 2 and 3).
        sample_mean: array with one (x, y) mean per row; row 0 is drawn as
            the class 1 mean, row 1 as class 2, row 2 as class 3.
        inc: grid step used to sample the decision regions.  The number of
            grid points grows with (x range / inc) * (y range / inc), so
            pass a larger step for widely spread features.

    Raises:
        ValueError: if ``inc`` is not positive.
    """
    if inc <= 0:
        raise ValueError("inc must be positive")

    # Total number of classes, taken as the largest label value.
    nclass = max(np.unique(label_train))
    three_classes = nclass == 3

    # Feature range for plotting: the data extent padded by one unit per side.
    x_lim = (np.floor(training[:, 0].min()) - 1, np.ceil(training[:, 0].max()) + 1)
    y_lim = (np.floor(training[:, 1].min()) - 1, np.ceil(training[:, 1].max()) + 1)

    # Generate the grid coordinates that form the basis of the decision
    # boundary visualization.  The inc/100 slack keeps the upper limit in
    # the half-open range produced by np.arange.
    x, y = np.meshgrid(np.arange(x_lim[0], x_lim[1] + inc / 100, inc),
                       np.arange(y_lim[0], y_lim[1] + inc / 100, inc))

    # Size of the (x, y) image, which is also the size of the decision
    # boundary image used as the plot background.
    image_size = x.shape

    # Make (x, y) pairs as a bunch of row vectors (column-major order).
    xy = np.column_stack((x.ravel(order='F'), y.ravel(order='F')))

    # Distance from every grid point to every class mean; the nearest mean
    # gives the (0-based) predicted class index.
    dist_mat = cdist(xy, sample_mean)
    pred_label = np.argmin(dist_mat, axis=1)

    # Reshape the predicted class indices back into an image.
    decisionmap = pred_label.reshape(image_size, order='F')

    # Show the image, giving each coordinate a color according to its class.
    plt.imshow(decisionmap, extent=[x_lim[0], x_lim[1], y_lim[0], y_lim[1]], origin='lower')

    # Plot the training data of each class.
    plt.plot(training[label_train == 1, 0], training[label_train == 1, 1], 'rx')
    plt.plot(training[label_train == 2, 0], training[label_train == 2, 1], 'go')
    if three_classes:
        plt.plot(training[label_train == 3, 0], training[label_train == 3, 1], 'b*')

    # Legend for the training data.
    if three_classes:
        data_legend = plt.legend(('Class 1', 'Class 2', 'Class 3'), loc=2)
    else:
        data_legend = plt.legend(('Class 1', 'Class 2'), loc=2)
    # Re-add it as a plain artist so the second legend below does not replace it.
    plt.gca().add_artist(data_legend)

    # Plot the class mean vectors.
    m1, = plt.plot(sample_mean[0, 0], sample_mean[0, 1], 'rd', markersize=12, markerfacecolor='r', markeredgecolor='w')
    m2, = plt.plot(sample_mean[1, 0], sample_mean[1, 1], 'gd', markersize=12, markerfacecolor='g', markeredgecolor='w')
    if three_classes:
        m3, = plt.plot(sample_mean[2, 0], sample_mean[2, 1], 'bd', markersize=12, markerfacecolor='b', markeredgecolor='w')

    # Legend for the class mean vectors.
    if three_classes:
        mean_legend = plt.legend([m1, m2, m3], ['Class 1 Mean', 'Class 2 Mean', 'Class 3 Mean'], loc=4)
    else:
        mean_legend = plt.legend([m1, m2], ['Class 1 Mean', 'Class 2 Mean'], loc=4)

    plt.gca().add_artist(mean_legend)

    plt.show()



def Sample_Mean(xy_data, label):
    """Compute the per-class sample mean of the feature vectors.

    Args:
        xy_data: 2-D array-like with one feature vector per row.
        label: array-like of class labels, one per row of ``xy_data``.
            Labels are truncated to integers and must be >= 1; class ``k``
            is stored in row ``k - 1`` of the result.

    Returns:
        Float array of shape ``(nclass, n_features)`` where ``nclass`` is
        the largest label.  A class that has no samples gets a row of NaN.

    Raises:
        ValueError: if ``label`` is empty, if its length differs from the
            number of rows in ``xy_data``, or if any label is below 1.
    """
    features = np.asarray(xy_data, dtype=float)
    # 0-based row index of each sample's class (astype(int) truncates like int()).
    class_idx = np.asarray(label).astype(int).ravel() - 1

    if class_idx.size == 0:
        raise ValueError("label must contain at least one entry")
    if class_idx.shape[0] != features.shape[0]:
        raise ValueError("xy_data and label must have the same number of rows")
    if class_idx.min() < 0:
        # A negative index would silently wrap around to the last class.
        raise ValueError("class labels must be >= 1")

    nclass = int(class_idx.max()) + 1  # number of classes

    # Sum the feature vectors of each class.  np.add.at accumulates
    # unbuffered, so repeated class indices are all added.
    sums = np.zeros((nclass, features.shape[1]))
    np.add.at(sums, class_idx, features)
    counts = np.bincount(class_idx, minlength=nclass)

    # Average only the classes that have samples; the others stay NaN.
    sample_mean = np.full_like(sums, np.nan)
    present = counts > 0
    sample_mean[present] = sums[present] / counts[present, None]
    return sample_mean


# Train on the synthetic1 training set.
# The last CSV column is used as the class label, the other columns as features.
# dtype=float replaces the np.float alias, which was removed in NumPy 1.24.
csvFile = np.loadtxt("synthetic1_train.csv", dtype=float, delimiter=",")
sample_mean_1 = Sample_Mean(csvFile[:,:-1],csvFile[:,-1])

# NOTE: the printed index is the 0-based row of sample_mean_1, so "class 0"
# is the mean of the samples labelled 1.
for i in range(len(sample_mean_1)):
    print ("class %d mean is : %s"%(i,str(sample_mean_1[i])))
plotDecBoundaries(csvFile[:,:2],csvFile[:,-1],sample_mean_1)


def Error_Rate(xy_data, label, sample_mean):
    """Return the misclassification rate of the nearest-class-mean classifier.

    Each row of ``xy_data`` is assigned to the class whose mean is closest
    (Euclidean distance via ``cdist``); row ``k`` of ``sample_mean`` stands
    for class label ``k + 1``.

    Args:
        xy_data: 2-D array-like with one feature vector per row.
        label: array-like of true class labels, one per row of ``xy_data``.
        sample_mean: 2-D array-like with one class mean per row.

    Returns:
        Fraction of rows whose predicted label differs from ``label``,
        as a Python float.

    Raises:
        ValueError: if ``label`` and ``xy_data`` differ in length.
        ZeroDivisionError: if there are no samples.
    """
    golden_label = np.asarray(label).ravel()
    dist_mat = cdist(xy_data, sample_mean)
    pred_label = np.argmin(dist_mat, axis=1) + 1  # +1: labels are 1-based

    # Without this check, extra labels would be ignored in the comparison
    # but still counted in the denominator.
    if golden_label.shape[0] != pred_label.shape[0]:
        raise ValueError("xy_data and label must have the same number of rows")

    # Count the number of errors.
    error_num = int(np.count_nonzero(pred_label != golden_label))
    return error_num / float(len(golden_label))

# Output the error rate for the synthetic1 training and testing data.
# Both are classified with the class means learned from the training set.
error_rate_1=Error_Rate(csvFile[:,:-1],csvFile[:,-1],sample_mean_1)
print ("error rate for synthetic1_train data is %s%%"%(error_rate_1*100))

# dtype=float replaces the np.float alias, which was removed in NumPy 1.24.
testFile = np.loadtxt("synthetic1_test.csv", dtype=float, delimiter=",")
error_rate_test_1=Error_Rate(testFile[:,:-1],testFile[:,-1],sample_mean_1)
print ("error rate for synthetic1_test data is %s%%"%(error_rate_test_1*100))
print ("\n")

# Train on the synthetic2 data.
# The last CSV column is used as the class label, the other columns as features.
# dtype=float replaces the np.float alias, which was removed in NumPy 1.24.
training_data_2=np.loadtxt("synthetic2_train.csv", dtype=float, delimiter=",")
sample_mean_2 = Sample_Mean(training_data_2[:,:-1],training_data_2[:,-1])

# NOTE: the printed index is the 0-based row of sample_mean_2, so "class 0"
# is the mean of the samples labelled 1.
for i in range(len(sample_mean_2)):
    print ("class %d mean is : %s"%(i,str(sample_mean_2[i])))
plotDecBoundaries(training_data_2[:,:-1],training_data_2[:,-1],sample_mean_2)

# Output the error rate for the training and testing data, both classified
# with the class means learned from the training set.
error_rate = Error_Rate(training_data_2[:,:-1],training_data_2[:,-1],sample_mean_2)
print ("error rate for synthetic2_train data is %s%%"%(error_rate*100))

test_data_2=np.loadtxt("synthetic2_test.csv", dtype=float, delimiter=",")
error_rate = Error_Rate(test_data_2[:,:-1],test_data_2[:,-1],sample_mean_2)
print ("error rate for synthetic2_test data is %s%%"%(error_rate*100))
print ("\n")

# Train on the wine data, using just the first two columns as XY data.
# The last CSV column is used as the class label.
# dtype=float replaces the np.float alias, which was removed in NumPy 1.24.
wine_data=np.loadtxt("wine_train.csv", dtype=float, delimiter=",")
wine_xy=wine_data[:,:2]
wine_label=wine_data[:,-1]
sample_mean_wine = Sample_Mean(wine_xy,wine_label)

# NOTE: the printed index is the 0-based row of sample_mean_wine, so "class 0"
# is the mean of the samples labelled 1.
for i in range(len(sample_mean_wine)):
    print ("class %d mean is : %s"%(i,str(sample_mean_wine[i])))
plotDecBoundaries(wine_xy,wine_label,sample_mean_wine)

# Output the error rate for the training and testing data, both classified
# with the class means learned from the training set.
error_rate = Error_Rate(wine_xy,wine_label,sample_mean_wine)
print ("error rate for wine train data is %s%%"%(error_rate*100))

wine_test_data=np.loadtxt("wine_test.csv", dtype=float, delimiter=",")
wine_test_xy=wine_test_data[:,:2]
wine_test_label=wine_test_data[:,-1]
error_rate = Error_Rate(wine_test_xy,wine_test_label,sample_mean_wine)
print ("error rate for wine test data is %s%%"%(error_rate*100))
print ("\n")



# Go through all the possible feature pairs and calculate the error rate for
# each pair, keeping the pair with the lowest training error rate.

min_ij=[]
min_error_rate=float("inf")
n_features = len(wine_data[0]) - 1  # the last column is the class label
for i in range(n_features - 1):
    for j in range(i + 1, n_features):
        wine_x=wine_data[:,i]
        wine_y=wine_data[:,j]
        wine_new_xy=np.column_stack((wine_x,wine_y))
        pair_mean = Sample_Mean(wine_new_xy,wine_label)
        error_rate=Error_Rate(wine_new_xy,wine_label,pair_mean)

        # Evaluate the test set with the class means learned from the
        # training set; re-fitting the means on the test data would not
        # measure test error.
        wine_test_x=wine_test_data[:,i]
        wine_test_y=wine_test_data[:,j]
        wine_test_new_xy=np.column_stack((wine_test_x,wine_test_y))
        test_error_rate=Error_Rate(wine_test_new_xy,wine_test_label,pair_mean)

        # Uncomment to output every error rate:
        # print ([i,j])
        # print ("%s%%"%(error_rate*100))
        # print ("%s%%"%(test_error_rate*100))

        if error_rate < min_error_rate:
            min_ij=[i,j]
            min_error_rate= error_rate

print ("The minimum error_rate for wine training data is: %s%%"%(min_error_rate*100))
print ("Which means the error number is: %s"%(min_error_rate*len(wine_data)))
print ("And the index is: %s" %min_ij)
wine_x=wine_data[:,min_ij[0]]
wine_y=wine_data[:,min_ij[1]]
wine_new_xy=np.column_stack((wine_x,wine_y))

# Re-train on the selected pair and plot its decision regions.
# NOTE: the printed index is the 0-based row of sample_mean_wine_new, so
# "class 0" is the mean of the samples labelled 1.
sample_mean_wine_new = Sample_Mean(wine_new_xy,wine_label)
for i in range(len(sample_mean_wine_new)):
    print ("class %d mean is : %s"%(i,str(sample_mean_wine_new[i])))
plotDecBoundaries(wine_new_xy,wine_label,sample_mean_wine_new)

# Calculate the test error rate for the feature pair selected above.
# NOTE: this is the test error of the pair with the lowest training error,
# not necessarily the lowest test error over all pairs.
wine_x=wine_test_data[:,min_ij[0]]
wine_y=wine_test_data[:,min_ij[1]]
wine_new_xy=np.column_stack((wine_x,wine_y))
error_rate=Error_Rate(wine_new_xy,wine_test_label,sample_mean_wine_new)
print ("The minimum error rate for wine test data is %s%%"%(error_rate*100))

