In [11]:
import urllib
import numpy as np
import scipy.spatial.distance as ssd

def read_data(url=('https://archive.ics.uci.edu/ml/machine-learning-databases/'
                   'wine-quality/winequality-red.csv'),
              filename='winequality-red.csv'):
    """ Download the red wine data set and load it into a numpy array.

        url      -- location of the ';'-separated CSV file to download
        filename -- local path the download is written to and parsed from

        Returns a 2-D float array with one row per record; the header row
        of the file is skipped.
    """
    # urlretrieve lives in urllib on Python 2 and in urllib.request on
    # Python 3; importing locally keeps this function usable on both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # the file is fetched on every call, overwriting any previous copy
    urlretrieve(url, filename)
    # np.loadtxt opens and closes the file itself when given a path
    return np.loadtxt(filename, delimiter=';', skiprows=1)

def knn(k, X_train, X_test, q_train, dist=1):
    """ k-nearest neighbors

        Classifies every record of X_test by looking at its k closest
        records in X_train.

        k       -- number of neighbors taking part in the vote
        X_train -- training records (one feature vector per row)
        X_test  -- records to classify (one feature vector per row)
        q_train -- class labels of the training records
        dist    -- distance function selector passed on to calc_dist

        Returns a list with one predicted class per test record.
    """
    predictions = []
    for query in X_test:
        # pair every distance with the training row it belongs to;
        # sorting the pairs orders by distance first, row index second
        ranked = sorted(
            (calc_dist(query, sample, dist), row)
            for row, sample in enumerate(X_train)
        )
        # the k closest training records vote for the class
        predictions.append(classify(ranked[:k], q_train))
    return predictions
 
def calc_dist(di, dj, i=1):
    """ Distance between the vectors di and dj.

        i selects the distance function:
            1 -- Euclidean
            2 -- Manhattan (city block)
            3 -- Cosine distance

        Returns the distance computed by the matching scipy function.
        Raises ValueError when i is not one of the selectors above.
    """
    if i == 1:
        return ssd.euclidean(di, dj)
    if i == 2:
        return ssd.cityblock(di, dj)
    if i == 3:
        return ssd.cosine(di, dj)
    # Fail loudly: silently returning None here would let callers sort
    # and rank neighbors on meaningless "distances".
    raise ValueError(
        "unknown distance function %r (expected 1, 2 or 3)" % (i,))
 
def evaluate(result):
    """ Tally correct and wrong predictions.

        result holds one value per test record; a value equal to 0 is
        counted as a correct prediction, anything else as a wrong one.

        Returns an integer numpy array [n_correct, n_wrong].
    """
    tally = [0, 0]
    for diff in result:
        # slot 0 collects the correct predictions, slot 1 the wrong ones
        slot = 0 if diff == 0 else 1
        tally[slot] += 1
    return np.array(tally, int)

def classify(k_nn, q_train):
    """ Classify a test instance by majority vote of its neighbors.

        k_nn    -- sequence of (distance, index) pairs; only the index is
                   used, to look up the neighbor's label in q_train
        q_train -- class labels of the training records; labels must be
                   non-negative whole numbers (ints or floats like 5.0)

        Returns the most frequent label among the neighbors. On a tie the
        smallest label wins, because np.argmax reports the first maximum.
        Raises ValueError when k_nn is empty.
    """
    votes = [q_train[idx] for _, idx in k_nn]
    if not votes:
        raise ValueError("classify() needs at least one neighbor")

    # np.bincount only accepts integers, but labels may arrive as floats
    # (e.g. a column sliced out of a float data array), so cast first.
    counts = np.bincount(np.asarray(votes).astype(int))
    return np.argmax(counts)
 
def main():
    """ k-nearest neighbors classifier

        Loads the red wine data, shuffles it, splits it 70/30 into
        training and test records and prints, for every k and every
        distance function, how many test records were classified
        correctly / wrongly.
    """
    # the first 11 columns are used as features, column 11 as the class
    n_features = 11
    train_fraction = 0.7

    # read dataset of red wine
    data = read_data()
    np.random.shuffle(data)  # shuffles the rows in place before the split
    split = int(data.shape[0] * train_fraction)
    train, test = data[:split], data[split:]

    X_train = train[:, :n_features]
    X_test = test[:, :n_features]
    # classify() votes with np.bincount, which needs integer labels, while
    # the loaded data is a float array -- so store the labels as ints
    q_train = train[:, n_features].astype(int)
    q_test = test[:, n_features].astype(int)

    # initialize K
    K = [1, 3, 7, 11, 19]

    # distance function for euclidean (1), manhattan (2),
    # and cosine (3)
    dist_fn = [1, 2, 3]

    # print(...) with a single argument behaves the same on Python 2 and 3
    print("k-NN classification results for red wine data set:")
    print("")
    print("    Number of correct / wrong classified test records")
    print("k  | Euclidean dist | Manhattan dist | Cosine dist")

    row_format = ("%d  |      %d / %d"
                  "     |     %d / %d"
                  "      |     %d / %d")

    # run knn classifier for each k and distance function
    for k in K:
        # correct/wrong counts, two entries per distance function
        results = []
        for fn in dist_fn:
            # predict the class of every test record
            pred_class = knn(k, X_train, X_test, q_train, fn)
            # a zero difference marks a correct prediction
            correct, wrong = evaluate(np.asarray(pred_class) - q_test)
            results.extend((correct, wrong))

        # print the classification result for this k
        print(row_format % ((k,) + tuple(results)))

# Run the experiment only when this file is executed directly, so that
# importing it does not trigger the download and the long classification run.
if __name__ == "__main__":
    main()

k-NN classification results for red wine data set:

    Number of correct / wrong classified test records
k  | Euclidean dist | Manhattan dist | Cosine dist
1  |      269 / 211     |     271 / 209      |     268 / 212
3  |      227 / 253     |     235 / 245      |     241 / 239
7  |      231 / 249     |     238 / 242      |     241 / 239
11  |      229 / 251     |     249 / 231      |     237 / 243
19  |      238 / 242     |     252 / 228      |     238 / 242

Runtime: 208.673616193
