### Multinomial Naive Bayes

In [223]:
import scipy.io
import numpy as np
from scipy.sparse import csc_matrix

In [35]:
# Load the dataset from a MATLAB .mat file; loadmat returns a dict keyed by
# the variable names stored in the file.
mdict = scipy.io.loadmat('20newsgroup.mat')
# Feature matrices (later code calls .multiply/.toarray on them, so they are
# expected to be scipy sparse matrices).
X_train = mdict['X_train']
X_test = mdict['X_test']
# Labels are transposed on load -- presumably stored as row vectors and
# wanted as (n_obs, 1) columns; TODO confirm against the .mat file.
y_train = mdict['y_train'].T
y_test = mdict['y_test'].T
word_list = mdict['word_list']

In [36]:
# function of generating dummy (one-hot) response variables
def dummy_response(y,nclass):
    """Return the one-hot (dummy) encoding of the class labels in ``y``.

    Parameters
    ----------
    y : array-like of class indices, shape (n_obs,) or (n_obs, 1)
        Every entry is truncated to an integer and used as a column index.
    nclass : int
        Number of columns of the result.

    Returns
    -------
    numpy.ndarray of float, shape (n_obs, nclass)
        Row ``i`` holds a 1 in column ``y[i]`` and 0 everywhere else.

    Raises
    ------
    IndexError
        If a label does not address a valid column.
    """
    # Flatten first: calling int() on each 1-element row of an (n_obs, 1)
    # array is deprecated in recent NumPy, and a Python loop is not needed.
    labels = np.asarray(y).ravel().astype(int)

    z = np.zeros((labels.size, nclass))
    # one assignment sets the (row, label) cell of every observation
    z[np.arange(labels.size), labels] = 1

    return z

In [110]:
# function of calculating p(x|y)
def bayes_prob_x_y(x_train,y_train,n_class,alpha):
    """Estimate the smoothed per-class feature probabilities p(x_j | y = k).

    For class ``k`` and feature ``j``::

        prob[k, j] = (count[k, j] + alpha) / (sum_j count[k, j] + alpha * n_var)

    where ``count[k, j]`` is the sum of column ``j`` of ``x_train`` over the
    rows labelled ``k``.

    Parameters
    ----------
    x_train : scipy sparse matrix or dense 2-D array, shape (n_obs, n_var)
    y_train : array-like of class indices, shape (n_obs,) or (n_obs, 1)
    n_class : int
        Number of classes (rows of the result).
    alpha : float
        Additive smoothing constant.

    Returns
    -------
    Dense 2-D matrix of shape (n_class, n_var), as produced by ``todense``;
    every row sums to 1 when its denominator is non-zero.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows of ``x_train``.
    """
    # csc_matrix() also accepts dense input, so both kinds of x_train work
    x_sparse = csc_matrix(x_train)
    n_obs, n_var = x_sparse.shape

    labels = np.asarray(y_train).ravel().astype(int)
    if labels.size != n_obs:
        raise ValueError(
            'x_train has %d rows but y_train has %d labels' % (n_obs, labels.size)
        )

    # sparse (n_obs, n_class) indicator: entry (i, k) is 1 when row i has label k
    indicator = csc_matrix(
        (np.ones(n_obs), (np.arange(n_obs), labels)),
        shape=(n_obs, n_class),
    )

    # a single sparse product gives the count vector of every class at once,
    # instead of one masked copy of x_train per class stacked row by row
    count = indicator.T.dot(x_sparse).todense()

    # smoothed normaliser of each class, shape (n_class, 1)
    sum_count = count.sum(axis=1) + alpha * n_var

    return (count + alpha) / sum_count

In [38]:
# function of calculating p(y=k)
def bayes_prob_y(y_train,n_class):
    """Estimate the class priors p(y = k) as relative label frequencies.

    Parameters
    ----------
    y_train : array-like of class indices, shape (n_obs,) or (n_obs, 1)
        Every entry is truncated to an integer class index.
    n_class : int
        Number of classes.

    Returns
    -------
    numpy.ndarray of float, shape (n_class, 1)
        Entry ``k`` is the fraction of observations labelled ``k``.

    Raises
    ------
    IndexError
        If a label does not address a valid class.
    """
    # Flatten first: int() on each 1-element row of an (n_obs, 1) array is
    # deprecated in recent NumPy, and the counting does not need a Python loop.
    labels = np.asarray(y_train).ravel().astype(int)

    # unbuffered add so that repeated labels are each counted
    count_y = np.zeros(n_class)
    np.add.at(count_y, labels, 1)

    # column vector of relative frequencies
    return count_y.reshape(n_class, 1) / labels.size

In [116]:
# function of fitting the multinomial Naive Bayes model
def train_bayes_multinomial(x_train,y_train,n_class,alpha):
    """Fit the model and return it as the list ``[prob_y, prob_x_y, n_class]``.

    ``prob_y`` is the result of ``bayes_prob_y``; ``prob_x_y`` is the result
    of ``bayes_prob_x_y`` converted to a plain ndarray; ``n_class`` is passed
    through unchanged.
    """
    class_prior = bayes_prob_y(y_train,n_class)
    # np.asarray strips any matrix subclass from the likelihood table
    feature_given_class = np.asarray(bayes_prob_x_y(x_train,y_train,n_class,alpha))

    return [class_prior, feature_given_class, n_class]

In [255]:
# function of predicting (log) class scores of unlabelled x
# model_prob is the list of [prob_y,prob_x_y,n_class]
# x_test is a single observation (one row); sparse or dense
def bayes_predict_prob(x_test,model_prob):
    """Return the unnormalised log-posterior of every class for ``x_test``.

    For class ``k``: ``log p(y=k) + sum_j x_test[j] * log p(x_j | y=k)``.

    Parameters
    ----------
    x_test : scipy sparse matrix or dense array holding one observation,
        shape (1, n_var)
    model_prob : list ``[prob_y, prob_x_y, n_class]``
        ``prob_y`` has n_class entries, ``prob_x_y`` has shape
        (n_class, n_var).

    Returns
    -------
    numpy.ndarray of shape (n_class, 1) with one log score per class.
    """
    # get trained probs
    prob_y = np.asarray(model_prob[0])
    log_prob_x_y = np.log(np.asarray(model_prob[1]))
    n_class = model_prob[2]

    # Keep the observation sparse: the product below then only visits the
    # features that are stored in x_test.  Densifying and multiplying the
    # whole table instead evaluates 0 * log(0) = nan for every absent feature
    # whose class probability is 0 (e.g. alpha = 0), turning all scores to nan.
    x_sparse = csc_matrix(x_test)
    feature_score = np.asarray(x_sparse.dot(log_prob_x_y.T))  # (1, n_class)

    # calculate log scores of classes as a column vector
    log_prob_y_x = np.log(prob_y).reshape(n_class, 1) + feature_score.T

    return log_prob_y_x

In [233]:
# function of picking the predicted class from the class scores
def bayes_predict_class(pre_prob):
    """Return the flat index of the largest entry of ``pre_prob``.

    On ties the first maximal entry wins.
    """
    return np.argmax(pre_prob, axis=None)

In [262]:
# function of measuring accuracy on a labelled test set
def accuracy(x_test,y_test,model_prob):
    """Return the fraction of rows of ``x_test`` whose predicted class equals
    the matching entry of ``y_test``.

    Each row is scored with ``bayes_predict_prob`` and classified with
    ``bayes_predict_class``.  A progress line ('10%......', '20%......', ...)
    is printed as the loop advances.  Raises ZeroDivisionError when
    ``x_test`` has no rows.
    """
    n_obs = np.shape(x_test)[0]
    n_correct = 0
    # next tenth of the data whose completion has not been reported yet
    next_tenth = 1

    for row in range(n_obs):
        # slice keeps the row two-dimensional for the predictor
        scores = bayes_predict_prob(x_test[row:row+1,:],model_prob)
        if y_test[row]==bayes_predict_class(scores):
            n_correct += 1

        # at most one progress line per processed row
        if (row/n_obs)>=(next_tenth/10):
            print(str(next_tenth*10)+'%......')
            next_tenth += 1

    return n_correct / n_obs

In [185]:
# The number of classes is read from the second dimension of word_list.
# NOTE(review): presumably word_list has one column per class -- confirm;
# deriving the class count from the labels would be more direct.
n_class = np.shape(word_list)[1]
# Fit the model on the raw training matrix with smoothing constant alpha = 1.
model_train = train_bayes_multinomial(X_train,y_train,n_class,1)

In [263]:
# Test-set accuracy of the model fitted on the raw matrix; the bare name on
# the last line makes the notebook display the value.
acc = accuracy(X_test,y_test,model_train)
acc

10%......
20%......
30%......
40%......
50%......
60%......
70%......
80%......
90%......


0.7728359001593202

In [286]:
# Rescale every row (observation) by the reciprocal of its row total, so the
# transformed rows hold relative frequencies instead of raw values.
# NOTE(review): a row whose total is 0 makes the reciprocal infinite --
# confirm that the data contains no empty rows.
train_nword = X_train.sum(axis=1)
test_nword = X_test.sum(axis=1)
train_div = 1/train_nword
test_div = 1/test_nword
# multiply() by the column of reciprocals scales each row element-wise
X_train_t = X_train.multiply(csc_matrix(train_div))
X_test_t = X_test.multiply(csc_matrix(test_div))

In [308]:
# Smoothing constant for the row-normalised data: the reciprocal of the grand
# total of the training matrix (sum of all row totals).
alpha_t = 1/train_nword.sum(axis=0)[0,0]

2.9676470090718004e-07

In [309]:
# Fit a second model on the row-normalised training matrix, smoothed by alpha_t.
model_train_t = train_bayes_multinomial(X_train_t,y_train,n_class,alpha_t)

In [310]:
# Test-set accuracy of the model fitted on the row-normalised matrix; the
# bare name on the last line makes the notebook display the value.
acc_t = accuracy(X_test_t,y_test,model_train_t)
acc_t

10%......
20%......
30%......
40%......
50%......
60%......
70%......
80%......
90%......


0.8062931492299522