### Naive bayes classifier

In [39]:
class BayesClassifier:
    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        """a classifier will be built from files with the bucketPrefix
        excluding the file with textBucketNumber. dataFormat is a
        string that describes how to interpret each line of the data
        files. For example, for the iHealth data the format is:
        "attr    attr    attr    attr    class"
        """
        total = 0
        classes = {}
        counts = {}
        
        # reading the data in from the file
        self.format = dataFormat.strip().split('\t')
        self.prior = {}
        self.conditional = {}
        # for each of the buckets numbered 1 through 10:
        for i in range(1,11):
            if i != testBucketNumber:
                filename = "%s-%02i" % (bucketPrefix, i)
                f = open(filename)
                lines = f.readlines()
                f.close()
                for line in lines:
                    fields = line.strip().split('\t')
                    ignore = []
                    vector = []
                    for i in range(len(fields)):
                        if self.format[i] == 'num':
                            vector.append(float(fields[i]))
                        elif self.format[i] == 'attr':
                            vector.append(fields[i])
                        elif self.format[i] == 'comment':
                            ignore.append(fields[i])
                        elif self.format[i] == 'class':
                            category = fields[i]
                    # now process this instance
                    total += 1
                    classes.setdefault(category, 0)
                    counts.setdefault(category, {})
                    classes[category] += 1
                    # process each attribute of the instance
                    col = 0
                    for columnValue in vector:
                        col += 1
                        counts[category].setdefault(col, {})
                        counts[category][col].setdefault(columnValue, 0)
                        counts[category][col][columnValue] += 1
                    
                    # compute probabilites
            for (category, count) in classes.items():
                self.prior[category] = count/ total
                
            # compute the conditional probabilites p(h|D)
            for (category, columns) in counts.items():
                self.conditional.setdefault(category, {})
                for (col, valueCounts) in columns.items():
                    self.conditional[category].setdefault(col, {})
                    for (attrValue, count) in valueCounts.items():
                        print(category, col, attrValue, count, classes[category], count/classes[category])
                        self.conditional[category][col][attrValue] = count / classes[category]
            
    def classify(self, itemVector):
        """Return class we think item Vecotr is in"""
        results = []
        for (category, prior) in self.prior.items():
            prob = prior
            col = 1
            for attrValue in itemVector:
                if not attrValue in self.conditional[category][col]:
                    # if no any instances of this value
                    prob = 0
                else:
                    prob = prob * self.conditional[category][col][attrValue]
                    col += 1
            results.append((prob,category))
        # return the category with the highest probability
        return max(results)[1]
        

In [40]:
c = BayesClassifier("C:\\Users\\Howard\\Desktop\\Learnbyself\\python\\A Programmer's Guide to Data Mining\\iHealth\\i", 
               10, "attr\tattr\tattr\tattr\tclass")
print(c.classify(['health', 'moderate', 'moderate', 'yes']))

i100 1 both 3 6 0.5
i100 1 appearance 2 6 0.3333333333333333
i100 1 health 1 6 0.16666666666666666
i100 2 sedentary 3 6 0.5
i100 2 moderate 1 6 0.16666666666666666
i100 2 active 2 6 0.3333333333333333
i100 3 moderate 5 6 0.8333333333333334
i100 3 aggressive 1 6 0.16666666666666666
i100 4 yes 2 6 0.3333333333333333
i100 4 no 4 6 0.6666666666666666
i500 1 health 4 9 0.4444444444444444
i500 1 appearance 3 9 0.3333333333333333
i500 1 both 2 9 0.2222222222222222
i500 2 sedentary 2 9 0.2222222222222222
i500 2 active 4 9 0.4444444444444444
i500 2 moderate 3 9 0.3333333333333333
i500 3 moderate 3 9 0.3333333333333333
i500 3 aggressive 6 9 0.6666666666666666
i500 4 yes 6 9 0.6666666666666666
i500 4 no 3 9 0.3333333333333333
i100 1 both 3 6 0.5
i100 1 appearance 2 6 0.3333333333333333
i100 1 health 1 6 0.16666666666666666
i100 2 sedentary 3 6 0.5
i100 2 moderate 1 6 0.16666666666666666
i100 2 active 2 6 0.3333333333333333
i100 3 moderate 5 6 0.8333333333333334
i100 3 aggressive 1 6 0.16666666666

In [None]:
"C:\\Users\\Howard\\Desktop\\Learnbyself\\python\\A Programmer's Guide to Data Mining\\pimaSmall\\pimaSmall"

###  Gaussian Distribution

In [3]:
import math

def pdf(mean, ssd, x):
    """Probability Density Function computing P(x|y)
    input is the mean, sample standard deviation for all the items in y and x."""
    ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2))
    return (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart

In [4]:
pdf(106.111, 21.327, 100)

0.017953602706962717

### Case 2: president election in United State 

In [16]:
import math

class Classifier:
    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):

        """ a classifier will be built from files with the bucketPrefix
        excluding the file with textBucketNumber. dataFormat is a string that
        describes how to interpret each line of the data files. For example,
        for the iHealth data the format is:
        "attr	attr	attr	attr	class"
        """
   
        total = 0
        classes = {}
        # counts used for attributes that are not numeric
        counts = {}
        # totals used for attributes that are numereric
        # we will use these to compute the mean and sample standard deviation for
        # each attribute - class pair.
        totals = {}
        numericValues = {}
        
        
        # reading the data in from the file
        
        self.format = dataFormat.strip().split('\t')
        # 
        self.prior = {}
        self.conditional = {}
 
        # for each of the buckets numbered 1 through 10:
        for i in range(1, 11):
            # if it is not the bucket we should ignore, read in the data
            if i != testBucketNumber:
                filename = "%s-%02i" % (bucketPrefix, i)
                f = open(filename)
                lines = f.readlines()
                f.close()
                for line in lines:
                    fields = line.strip().split('\t')
                    ignore = []
                    vector = []
                    nums = []
                    for i in range(len(fields)):
                        if self.format[i] == 'num':
                            nums.append(float(fields[i]))
                        elif self.format[i] == 'attr':
                            vector.append(fields[i])                           
                        elif self.format[i] == 'comment':
                            ignore.append(fields[i])
                        elif self.format[i] == 'class':
                            category = fields[i]
                    # now process this instance
                    total += 1
                    classes.setdefault(category, 0)
                    counts.setdefault(category, {})
                    totals.setdefault(category, {})
                    numericValues.setdefault(category, {})
                    classes[category] += 1
                    # now process each non-numeric attribute of the instance
                    col = 0
                    for columnValue in vector:
                        col += 1
                        counts[category].setdefault(col, {})
                        counts[category][col].setdefault(columnValue, 0)
                        counts[category][col][columnValue] += 1
                    # process numeric attributes
                    col = 0
                    for columnValue in nums:
                        col += 1
                        totals[category].setdefault(col, 0)
                        #totals[category][col].setdefault(columnValue, 0)
                        totals[category][col] += columnValue
                        numericValues[category].setdefault(col, [])
                        numericValues[category][col].append(columnValue)
                    
        
        #
        # ok done counting. now compute probabilities
        #
        # first prior probabilities p(h)
        #
        for (category, count) in classes.items():
            self.prior[category] = count / total
        #
        # now compute conditional probabilities p(h|D)
        #
        for (category, columns) in counts.items():
            self.conditional.setdefault(category, {})
            for (col, valueCounts) in columns.items():
                self.conditional[category].setdefault(col, {})
                for (attrValue, count) in valueCounts.items():
                    self.conditional[category][col][attrValue] = (count / classes[category])
        self.tmp =  counts               
        #
        # now compute mean and sample standard deviation
        #
        self.means = {}
        self.ssd = {}
        self.totals = totals
        # ADD YOUR CODE HERE
        for (category, columns) in totals.items():
            self.means.setdefault(category, {})
            for (col, cTotal) in columns.items():
                self.means[category][col] = cTotal / classes[category]
        # standard deviation
        
        for (category, columns) in numericValues.items():
            self.ssd.setdefault(category, {})
            for (col, values) in columns.items():
                SumOfSquareDifferences = 0
                theMean = self.means[category][col]
                for value in values:
                    SumOfSquareDifferences += (value - theMean)**2
                columns[col] = 0
                self.ssd[category][col] = math.sqrt(SumOfSquareDifferences / (classes[category]  - 1)) 
                
    def classify(self, itemVector, numVector):
        """Return class we think item Vecotr is in"""
        results = []
        for (category, prior) in self.prior.items():
            prob = prior
            col = 1
            for attrValue in itemVector:
                if not attrValue in self.conditional[category][col]:
                    # if no any instances of this value
                    prob = 0
                else:
                    prob = prob * self.conditional[category][col][attrValue]
                    col += 1

            col = 1
            for x in numVector:
                mean = self.means[category][col]
                ssd = self.ssd[category][col]
                ePart = math.pow(math.e, -(x - mean)**2/(2*ssd**2))
                prob = prob * (1.0 / (math.sqrt(2*math.pi)*ssd)) * ePart
                col += 1
            results.append((prob,category))
        # return the category with the highest probability
        return max(results)[1]

In [15]:
c = Classifier("C:\\Users\\Howard\\Desktop\\Learnbyself\\python\\A Programmer's Guide to Data Mining\\pimaSmall\\pimaSmall",
               1, "num	num	num	num	num	num	num	num	class")

# test means computation
# test means computation
assert(c.means['1'][1] == 5.25)
assert(round(c.means['1'][2], 4) == 146.0556)
assert(round(c.means['0'][2], 4) == 111.9057)

# test standard deviation
assert(round(c.ssd['0'][1], 4) == 2.5469)
assert(round(c.ssd['1'][8], 4) == 10.9218)
print("Means and SSD computation appears OK")

Means and SSD computation appears OK
