In [1]:
%%writefile chineseExample.txt
D1	1	Chinese Beijing	Chinese
D2	1	Chinese Chinese	Shanghai
D3	1	Chinese	Macao
D4	0	Tokyo Japan	Chinese
D5	0	Chinese Chinese	Chinese Tokyo Japan

Overwriting chineseExample.txt


In [2]:
%%writefile MRNaiveBayesTrainer.py

"""An implementation of a multinomial Naive Bayes learner as an MRJob.
   This is meant as an example of why mapper_final is useful.
   
   This learning algorithm implementation can be further optimised. HOW?
   
   Use a cool pattern to do this!

"""
from mrjob.job import MRJob

class MRNaiveBayesTrainer(MRJob):

    def __init__(self, *args, **kwargs):
        super(MRNaiveBayesTrainer, self).__init__(*args, **kwargs)
        self.modelStats = {}
        self.classTotalFreq = [0, 0]
        self.vocab=0

    def jobconf(self):
        
        orig_jobconf = super(MRNaiveBayesTrainer, self).jobconf()        
        custom_jobconf = {
            'mapred.output.key.comparator.class': 'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options': '-k1rn',
            'mapred.reduce.tasks': '1',
        }
        combined_jobconf = orig_jobconf
        combined_jobconf.update(custom_jobconf)
        self.jobconf = combined_jobconf
        return combined_jobconf

    def mapper(self, _, line):
        
        docID, docClass,text = line.split("\t",2)   
        words = text.split()
        vocab = {}
        if docID != "D5":
            if docClass == "1":
                yield("PriorProbs", "0,1")
                yield("*classTotalFreq", ("0, " + str(len(words))))
                for word in words:
                    vocab[word] = 1
                    yield(word, "0,1")
            else:
                yield("PriorProbs", "1,0")
                yield("*classTotalFreq", (str(len(words)) + ", 0"))
                for word in words:
                    vocab[word] = 1
                    yield(word, "1,0")
        for k in vocab.keys():
            yield "*!"+k, "1,0"

    def reducer(self, word, values):    
        #aggregate counts for Pr(Word|Class)
        #yield("number of values for "+word, str(values))
        w0Total=0
        w1Total=0
        c0Total=0
        c1Total=1
        for value in values:
            w0, w1 =  value.split(",")
            w0Total += float(w0)
            w1Total += float(w1)  
        if word == "*classTotalFreq":
            self.modelStats[word] = [w0Total, w1Total]
        elif word.startswith("*!"):
            self.vocab += 1
        elif word == "TomsPriors":
            yield("TomsPriors", ','.join(str(j) for j in [w0Total,w1Total,w0Total/(w0Total+w1Total),w1Total/(w0Total+w1Total)]))
        else:
            yield(word, ','.join(str(j) for j in [w0Total,w1Total,(w0Total+1)/(self.modelStats["*classTotalFreq"][0] + self.vocab),(w1Total+1)/(self.modelStats["*classTotalFreq"][1] + self.vocab)]))
        #yield("JIMI "+word, [w0Total, w1Total])"""
        
if __name__ == '__main__':
    MRNaiveBayesTrainer.run()

Overwriting MRNaiveBayesTrainer.py


In [3]:
#------------------------------------------------------------------------------------
# We have two ways to run the Naive Bayes algorithm
# 1. Run using the command line (shown Above)
# 2. Run using a MRJob Runner from python (very sweet way to do business). See Here
#------------------------------------------------------------------------------------
#HW 1.3
%reload_ext autoreload
%autoreload 2


from MRNaiveBayesTrainer import MRNaiveBayesTrainer 

# STEP 1: Train a mulitnomial Naive Bayes      
trainingData = 'chineseExample.txt'

# create an instance of the Trainer class
# and initiatialize it
mr_job = MRNaiveBayesTrainer(args=[trainingData])
modelStats={}
with mr_job.make_runner() as runner: 
    runner.run()
    # stream_output: get access to the output reducer/reducer_final of 
    # the last step in MRNaiveBayesTrainer
    for line in runner.stream_output():
        key,value =  mr_job.parse_output_line(line)
        print key, value
        modelStats[key] = value            
    # Store model locally
    with open('StatelessModel1.txt', 'w') as f:
        for k in modelStats.keys():
            f.writelines( k + "\t"+ str(modelStats[k]) +"\n")
print modelStats

Beijing 0.0,1.0,0.111111111111,0.142857142857
Chinese 1.0,5.0,0.222222222222,0.428571428571
Japan 1.0,0.0,0.222222222222,0.0714285714286
Macao 0.0,1.0,0.111111111111,0.142857142857
PriorProbs 1.0,3.0,0.222222222222,0.285714285714
Shanghai 0.0,1.0,0.111111111111,0.142857142857
Tokyo 1.0,0.0,0.222222222222,0.0714285714286
{'Beijing': '0.0,1.0,0.111111111111,0.142857142857', 'Chinese': '1.0,5.0,0.222222222222,0.428571428571', 'Tokyo': '1.0,0.0,0.222222222222,0.0714285714286', 'Shanghai': '0.0,1.0,0.111111111111,0.142857142857', 'PriorProbs': '1.0,3.0,0.222222222222,0.285714285714', 'Japan': '1.0,0.0,0.222222222222,0.0714285714286', 'Macao': '0.0,1.0,0.111111111111,0.142857142857'}


In [4]:
%%writefile MRNaiveBayesClassifier.py
 
from mrjob.job import MRJob
import sys, re, string, operator, math, os


regex = re.compile('[%s]' % re.escape(string.punctuation))

class MRNaiveBayesClassifier(MRJob):


    def __init__(self, *args, **kwargs):
        super(MRNaiveBayesClassifier, self).__init__(*args, **kwargs)
        self.zeroProb = 0

    def jobconf(self):
        orig_jobconf = super(MRNaiveBayesClassifier, self).jobconf()        
        custom_jobconf = {
            'mapred.output.key.comparator.class': 'org.apache.hadoop.mapred.lib.KeyFieldBasedComparator',
            'mapred.text.key.comparator.options': '-k1rn',
            'mapred.reduce.tasks': '1',
        }
        combined_jobconf = orig_jobconf
        combined_jobconf.update(custom_jobconf)
        self.jobconf = combined_jobconf
        return combined_jobconf    

    def mapper_init(self):
        self.modelStats = {}
        
        recordStrs = [s.split('\n')[0].split('\t') for s in open("StatelessModel1.txt").readlines()]
        for word, statsStr in recordStrs:
            self.modelStats[word] = map(float, statsStr.split(","))
        
        self.prC0 = math.log(self.modelStats["PriorProbs"][2])
        self.prC1 = math.log(self.modelStats["PriorProbs"][3])
        

    
    def mapper(self, _, line):
        
        docID, docClass,text = line.split("\t",2)
        text = text.strip()
        text = regex.sub(' ', text.lower())
        text = re.sub( '\s+', ' ', text )
        words = text.split()
        
        for word in words:
            p0 = self.modelStats[word][2]
            if self.modelStats[word][2] == 0.0:
                self.zeroProb += 1
            p1 = self.modelStats[word][3]
            if self.modelStats[word][3] == 0.0:
                self.zeroProb += 1
            wordGivenHam = math.log(p0) if p0>0.0 else math.log(1)
            wordGivenSpam = math.log(p1) if p1>0.0 else math.log(1)
            prHAMGivenDoc = self.prC0 + wordGivenHam
            prSPAMGivenDoc = self.prC1 + wordGivenSpam
        
        predictedClass = 1 #SPAM
        if(prHAMGivenDoc > prSPAMGivenDoc):
            predictedClass = 0 #HAM
        if int(docClass) == predictedClass:
            yield (docID, 0)  #no error
        else: 
            yield (docID, 1) # error    
        yield("zero", self.zeroProb)
    
    def combiner(self, word, values):
        for value in values:
            yield ("t", value)
            
    def reducer(self, word, values):
        zero = 0
        numberOfRecords = 0
        numberWrong = 0
        for value in values:
            if value > 1:
                zero = value
            else:    
                numberOfRecords += 1
                numberWrong += value
        #print (numberOfRecords, numberWrong)
        print ('Error rate: %.4f' %(1.0*numberWrong/float(numberOfRecords)))
        print ('Number Wrong %d, Total Records %d'  %(numberWrong, numberOfRecords))
        print ('number of word|class with 0 probability: %d' %(zero))

if __name__ == '__main__':
    MRNaiveBayesClassifier.run()

Overwriting MRNaiveBayesClassifier.py


In [6]:
%reload_ext autoreload
%autoreload 2

from numpy import random
from MRNaiveBayesClassifier import MRNaiveBayesClassifier 

trainingData = 'chineseExample.txt'      
#make the model file available to the workers as "model"
modelFile   = 'StatelessModel1.txt'  

mr_job = MRNaiveBayesClassifier(args=[trainingData, modelFile])
modelStats={}
with mr_job.make_runner() as runner: 
    runner.run()
        # stream_output: get access of the output 
    for line in runner.stream_output():
        key,value =  mr_job.parse_output_line(line)
        print key, value

IOError: [Errno 2] No such file or directory: 'StatelessModel1.txt'