In [64]:
import math
class Classifier:
    """Nearest-neighbor classifier trained from a tab-separated file.

    The first line of the file names the kind of each column: 'num' columns
    form the feature vector, 'comment' columns are kept as extra information
    and the 'class' column holds the label. Columns of any other kind are
    skipped. Every following non-blank line describes one training instance.

    Attributes set by the constructor:
        format             -- list of column kinds read from the header line
        data               -- list of (class, normalized vector, comments)
        rawData            -- same rows as data, with the un-normalized vectors
        vlen               -- number of numeric features per instance
        medianAndDeviation -- one (median, asd) pair per numeric column
    """

    def __init__(self, filename):
        """Read the training file `filename` and normalize each numeric column.

        Raises IndexError if the file is empty or contains no data rows.
        """
        self.medianAndDeviation = []

        # `with` guarantees the handle is closed even if reading fails
        with open(filename) as f:
            lines = f.readlines()
        self.format = lines[0].strip().split('\t')
        self.data = []
        for line in lines[1:]:
            stripped = line.strip()
            if not stripped:
                # a blank line would otherwise yield a row with an empty vector
                continue
            ignore = []
            vector = []
            # reset per row so a row without a class field cannot silently
            # inherit the label of the previous row
            classification = None
            for kind, field in zip(self.format, stripped.split('\t')):
                if kind == 'num':
                    vector.append(self._parseNumber(field))
                elif kind == 'comment':
                    ignore.append(field)
                elif kind == 'class':
                    classification = field
            self.data.append((classification, vector, ignore))
        # Copy the inner lists: normalizeColumn rewrites the vectors in
        # self.data in place, so sharing them would overwrite the raw values.
        self.rawData = [(label, list(vector), list(ignore))
                        for (label, vector, ignore) in self.data]
        self.vlen = len(self.data[0][1])
        for i in range(self.vlen):
            self.normalizeColumn(i)

    @staticmethod
    def _parseNumber(text):
        """Convert a field to int when possible, otherwise to float."""
        try:
            return int(text)
        except ValueError:
            return float(text)

    def getMedian(self, alist):
        """Return the median of alist.

        For an even number of items the mean of the two middle items is
        returned as a float. An empty input returns [] (kept for backward
        compatibility with existing callers).
        """
        if len(alist) == 0:
            return []
        sortedList = sorted(alist)
        length = len(sortedList)
        middle = length // 2
        if length % 2 == 1:
            return sortedList[middle]
        return (sortedList[middle - 1] + sortedList[middle]) / 2.0

    def getAbsoluteStandardDeviation(self, alist, median):
        """Return the mean absolute distance of the items of alist from median.

        Returns 0.0 for an empty alist instead of dividing by zero.
        """
        if len(alist) == 0:
            return 0.0
        isum = 0
        for item in alist:
            isum += abs(item - median)
        return isum / len(alist)

    def normalizeColumn(self, columnNumber):
        """Normalize column columnNumber of self.data in place using the
        Modified Standard Score, (value - median) / asd.

        The (median, asd) pair is appended to self.medianAndDeviation, so the
        constructor calls this exactly once per column, in column order.
        """
        col = [row[1][columnNumber] for row in self.data]
        median = self.getMedian(col)
        asd = self.getAbsoluteStandardDeviation(col, median)
        self.medianAndDeviation.append((median, asd))
        # asd is 0 when every value equals the median; dividing by 1 instead
        # maps the whole column to 0 rather than raising ZeroDivisionError
        scale = asd if asd else 1.0
        for row in self.data:
            row[1][columnNumber] = (row[1][columnNumber] - median) / scale

    def normalizeVector(self, v):
        """Return a normalized copy of v, using the median and asd stored
        for each column by normalizeColumn. v itself is not modified."""
        vector = list(v)
        for i in range(len(vector)):
            (median, asd) = self.medianAndDeviation[i]
            # same zero-deviation guard as normalizeColumn
            scale = asd if asd else 1.0
            vector[i] = (vector[i] - median) / scale
        return vector

    def manhattan(self, vector1, vector2):
        """Return the Manhattan distance between vector1 and vector2."""
        return sum(abs(v1 - v2) for v1, v2 in zip(vector1, vector2))

    def nearestNeighbor(self, itemvector):
        """Return (distance, item) for the training item closest to
        itemvector, which must already be normalized."""
        # Tuples are compared so that equal distances are still broken by the
        # item itself, exactly as before.
        return min((self.manhattan(itemvector, item[1]), item)
                   for item in self.data)

    def classify(self, itemvector):
        """Return the class of the training item nearest to the raw
        (un-normalized) itemvector."""
        return self.nearestNeighbor(self.normalizeVector(itemvector))[1][0]

In [38]:
def unitTest():
    """Check getMedian, getAbsoluteStandardDeviation and the column
    normalization performed by the Classifier constructor.

    Needs 'athletesTrainingSet.txt' in the working directory; raises
    AssertionError on the first failed check.
    """
    clf = Classifier('athletesTrainingSet.txt')

    # --- median and absolute standard deviation -------------------------
    values1 = [54, 72, 78, 49, 65, 63, 75, 67, 54, 76, 68,
               61, 58, 70, 70, 70, 63, 65, 66, 61]
    values2 = [66, 162, 204, 90, 99, 106, 175, 123, 68,
               200, 163, 95, 77, 108, 155, 155, 108, 106, 97, 76]
    median1 = clf.getMedian(values1)
    assert round(median1, 3) == 65.5
    median2 = clf.getMedian(values2)
    assert round(median2, 3) == 107
    deviation1 = clf.getAbsoluteStandardDeviation(values1, median1)
    assert round(deviation1, 3) == 5.95
    deviation2 = clf.getAbsoluteStandardDeviation(values2, median2)
    assert round(deviation2, 3) == 33.65
    print("getMedian and getAbsoluteStandardDeviation are OK")

    # --- normalizeColumn --------------------------------------------------
    # Expected normalized vectors, rounded to four decimals, one per
    # training row in file order.
    expectedRows = [
        [-1.9328, -1.2184], [1.0924, 1.6345], [2.1008, 2.8826],
        [-2.7731, -0.5052], [-0.084, -0.2377], [-0.4202, -0.0297],
        [1.5966, 2.0208], [0.2521, 0.4755], [-1.9328, -1.159],
        [1.7647, 2.7637], [0.4202, 1.6642], [-0.7563, -0.3566],
        [-1.2605, -0.8915], [0.7563, 0.0297], [0.7563, 1.4264],
        [0.7563, 1.4264], [-0.4202, 0.0297], [-0.084, -0.0297],
        [0.084, -0.2972], [-0.7563, -0.9212],
    ]

    # The constructor has already normalized every column, so the stored
    # data is compared directly instead of calling normalizeColumn again.
    for index, expected in enumerate(expectedRows):
        actual = clf.data[index][1]
        assert round(actual[0], 4) == expected[0]
        assert round(actual[1], 4) == expected[1]
    print("normalizeColumn is OK")
    

In [39]:
# Run the checks for getMedian, getAbsoluteStandardDeviation and column normalization.
unitTest()

getMedian and getAbsoluteStandardDeviation are OK
normalizeColumn is OK


In [62]:
def unitTest2():
    """Check normalizeVector, manhattan, nearestNeighbor and classify.

    Needs 'athletesTrainingSet.txt' in the working directory; raises
    AssertionError on the first failed check.
    """
    clf = Classifier('athletesTrainingSet.txt')

    # Reference athletes as (class, raw vector, comment fields).
    raven = ('Basketball', [72, 162], ['Brittainey Raven'])
    komova = ('Gymnastics', [61, 76], ['Viktoria Komova'])
    langhorne = ('Basketball', [74, 190], ['Crystal Langhorne'])

    secondRow = clf.data[1][1]
    lastRow = clf.data[-1][1]

    # --- normalizeVector --------------------------------------------------
    ravenNorm = clf.normalizeVector(raven[1])
    komovaNorm = clf.normalizeVector(komova[1])
    langhorneNorm = clf.normalizeVector(langhorne[1])
    # Raven and Komova are compared with the second and the last stored row.
    assert ravenNorm == secondRow
    assert komovaNorm == lastRow
    print(lastRow)
    print('normalizeVector fn OK')

    # --- Manhattan distance -----------------------------------------------
    assert round(clf.manhattan(langhorneNorm, secondRow), 5) == 1.16823
    assert clf.manhattan(ravenNorm, secondRow) == 0
    assert clf.manhattan(komovaNorm, lastRow) == 0
    print('Manhattan distance fn OK')

    # --- nearest neighbor ---------------------------------------------------
    # Brittainey Raven's and Viktoria Komova's nearest neighbors should be
    # themselves.
    for athlete, normalized in ((raven, ravenNorm), (komova, komovaNorm)):
        neighbor = clf.nearestNeighbor(normalized)
        assert neighbor[1][2] == athlete[2]
    # Crystal Langhorne's nearest neighbor is Jennifer Lacy.
    assert clf.nearestNeighbor(langhorneNorm)[1][2][0] == "Jennifer Lacy"
    print("Nearest Neighbor fn OK")

    # --- classify -----------------------------------------------------------
    # Each raw vector must be classified as the athlete's own sport.
    for athlete in (raven, langhorne, komova):
        assert clf.classify(athlete[1]) == athlete[0]
    print('Classify fn OK')

In [65]:
# Run the checks for normalizeVector, manhattan, nearestNeighbor and classify.
unitTest2()

0 [54, 66]
0 [72, 162]
0 [78, 204]
0 [49, 90]
0 [65, 99]
0 [63, 106]
0 [75, 175]
0 [67, 123]
0 [54, 68]
0 [76, 200]
0 [68, 163]
0 [61, 95]
0 [58, 77]
0 [70, 108]
0 [70, 155]
0 [70, 155]
0 [63, 108]
0 [65, 106]
0 [66, 97]
0 [61, 76]
1 [-1.9327731092436975, 66]
1 [1.0924369747899159, 162]
1 [2.100840336134454, 204]
1 [-2.773109243697479, 90]
1 [-0.08403361344537814, 99]
1 [-0.42016806722689076, 106]
1 [1.5966386554621848, 175]
1 [0.25210084033613445, 123]
1 [-1.9327731092436975, 68]
1 [1.7647058823529411, 200]
1 [0.42016806722689076, 163]
1 [-0.7563025210084033, 95]
1 [-1.2605042016806722, 77]
1 [0.7563025210084033, 108]
1 [0.7563025210084033, 155]
1 [0.7563025210084033, 155]
1 [-0.42016806722689076, 108]
1 [-0.08403361344537814, 106]
1 [0.08403361344537814, 97]
1 [-0.7563025210084033, 76]
[-0.7563025210084033, -0.9212481426448738]
normalizeVector fn OK
Manhattan distance fn OK
Nearest Neighbor fn OK
Classify fn OK


In [49]:
def test(training_filename, test_filename):
    """Test the classifier on a test set of data.

    Builds a Classifier from training_filename, classifies every non-blank
    line of test_filename and prints one result line per instance ('+' for a
    correct prediction, '-' for a wrong one) followed by the overall
    percentage of correct predictions.

    The test file is expected to use the column layout named in the header
    of the training file, but to have no header line of its own.
    """
    classifier = Classifier(training_filename)
    with open(test_filename) as f:
        # Blank lines carry no instance; keeping them would raise IndexError
        # below and inflate the denominator of the accuracy.
        lines = [line.rstrip('\r\n') for line in f if line.strip()]

    if not lines:
        # nothing to classify; avoid dividing by zero in the summary
        print("No test instances found in %s" % test_filename)
        return

    # The column layout is the same for every line, so work it out once.
    numColumns = []
    classInColumn = -1  # falls back to the last field when no 'class' column exists
    for i, kind in enumerate(classifier.format):
        if kind == 'num':
            numColumns.append(i)
        elif kind == 'class':
            classInColumn = i

    numCorrect = 0.0
    for line in lines:
        data = line.strip().split('\t')
        vector = [float(data[i]) for i in numColumns]
        theClass = classifier.classify(vector)
        prefix = '-'
        if theClass == data[classInColumn]:
            # it is correct
            numCorrect += 1
            prefix = '+'
        # line has no trailing newline here, so print does not emit an
        # empty line after every result
        print("%s  %12s  %s" % (prefix, theClass, line))
    print("%4.2f%% correct" % (numCorrect * 100 / len(lines)))

In [50]:
# Train on the athletes training set and report the accuracy on the athletes test set.
test("athletesTrainingSet.txt", "athletesTestSet.txt")

-         Track  Aly Raisman	Gymnastics	62	115

+    Basketball  Crystal Langhorne	Basketball	74	190

+    Basketball  Diana Taurasi	Basketball	72	163

+    Basketball  Erin Thorn	Basketball	69	144

-         Track  Hannah Whelan	Gymnastics	63	117

+    Gymnastics  Jaycie Phelps	Gymnastics	60	97

+    Basketball  Kelly Miller	Basketball	70	140

+    Gymnastics  Kerri Strug	Gymnastics	57	87

+    Gymnastics  Koko Tsurumi	Gymnastics	55	75

-         Track  Li Shanshan	Gymnastics	64	101

+    Basketball  Lindsay Whalen	Basketball	69	169

-    Gymnastics  Lisa Jane Weightman	Track	62	97

+    Basketball  Maya Moore	Basketball	72	174

+         Track  Paula Radcliffe	Track	68	120

+    Basketball  Penny Taylor	Basketball	73	165

+         Track  Priscah Jeptoo	Track	65	108

+         Track  Shalane Flanagan	Track	65	106

+         Track  Xiaolin Zhu	Track	67	121

+         Track  Xueqin Wang	Track	64	110

+         Track  Zhu Xiaolin	Track	67	123

80.00% correct
