In [1]:
from __future__ import print_function
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

In [2]:
def read_data(data_fn, label_fn, nwords):
    
    with open(nwords, 'r', encoding = 'utf-8') as f:
        content = f.readlines()
    nwords = [int(x.strip()) for x in content][0]
    ## read label_fn
    with open(label_fn, 'r', encoding = 'utf-8') as f:
        content = f.readlines()
    label = [int(x.strip()) for x in content]
    ## read data_fn
    with open (data_fn, 'r', encoding = 'utf-8') as f:
        content = f.readlines()
    # remove ’\n’ at the end of each line
    content = [x.strip() for x in content]
    dat = np.zeros((len(content), 3), dtype = int)
    for i, line in enumerate(content):
        a = line.split(' ')
        dat[i, :] = np.array([int(a[0]), int(a[1]), int(a[2])])
    # remember to -1 at coordinate since we’re in Python
    data = coo_matrix((dat[:, 2], (dat[:, 0] , dat[:, 1] )),shape=(len(label), nwords))
    return (data, label)

In [3]:
sum_acc_test = 0
aver_acc_test = 0
sum_acc_train = 0
aver_acc_train = 0
for i in range(1, 6):
    train_data_fn = 'data_thua/train_fn{}.txt'.format(i)
    train_label_fn = 'data_thua/train_label{}.txt'.format(i)
    test_data_fn = 'data_thua/test_fn{}.txt'.format(i)
    test_label_fn = 'data_thua/test_label{}.txt'.format(i)
    nwords = 'data_thua/nword{}.txt'.format(i)
    (train_data, train_label) = read_data(train_data_fn, train_label_fn, nwords)
    (test_data, test_label) = read_data(test_data_fn, test_label_fn, nwords)
    clf = MultinomialNB()
    clf.fit(train_data, train_label)
    y_pred_test = clf.predict(test_data)
    
    y_pred_train = clf.predict(train_data)
    
    acc_test = accuracy_score(y_pred_test,test_label)
    
    acc_train = accuracy_score(y_pred_train,train_label)
    
    sum_acc_test = sum_acc_test + acc_test
    sum_acc_train = sum_acc_train + acc_train
    
    print('Length dict =',train_data.shape[1],'. accuracy_test_folder{} ='.format(i), acc_test*100)
    
    print('Length dict =',train_data.shape[1],'. accuracy_train_folder{} ='.format(i), acc_train*100)
    
    print('----------------------------------------')

aver_acc_train = sum_acc_train/5
aver_acc_test = sum_acc_test/5
print('average accurracy_train : ', aver_acc_train)
print('average accurracy_test : ', aver_acc_test)

Length dict = 1888 . accuracy_test_folder1 = 67.5
Length dict = 1888 . accuracy_train_folder1 = 89.38906752411575
----------------------------------------
Length dict = 1967 . accuracy_test_folder2 = 67.5
Length dict = 1967 . accuracy_train_folder2 = 91.31832797427653
----------------------------------------
Length dict = 1875 . accuracy_test_folder3 = 62.5
Length dict = 1875 . accuracy_train_folder3 = 89.06752411575563
----------------------------------------
Length dict = 1867 . accuracy_test_folder4 = 56.25
Length dict = 1867 . accuracy_train_folder4 = 91.31832797427653
----------------------------------------
Length dict = 1929 . accuracy_test_folder5 = 66.19718309859155
Length dict = 1929 . accuracy_train_folder5 = 90.34267912772586
----------------------------------------
average accurracy_train :  0.9028718534323005
average accurracy_test :  0.6398943661971831


In [4]:
import numpy as np
import math

In [5]:
def training(X_train, y_train):
    N,D = X_train.shape
    classes, noElementOfClass = np.unique(y_train, return_counts=True)
    K=classes.size
    theta = np.ones((K,1))
    mu_jk = np.ones((K,D))
    sigma_jk = np.ones((K,D))
    for k in range(0,K):
        theta[k]= noElementOfClass[k] 
        ide = np.where(y_train==classes[k])[-1]
        X=[]
        for i in ide:
             X.append(X_train[i][:])
        X=np.array(X)
        mu_jk[k][:]= np.sum(X,0)/X.shape[0]
        sigma_jk[k][:]= np.square(X[k][:]-mu_jk[k][:])/X.shape[0]
    theta/=N
    return theta, mu_jk, sigma_jk

In [6]:
def classify(theta, mu_jk, sigma_jk, x):
    K,D = mu_jk.shape
    p= np.ones((K,1))
    for k in range(0, K):
        l=np.log(1/np.sqrt(2*np.pi*sigma_jk[k][:]))-(np.square(x[:]-mu_jk[k][:])/(2*sigma_jk[k][:]))
        p[k] = np.log(theta[k])+np.sum(l)
    return np.argmax(p)

In [7]:
# loss function
def loss(w, X, y, lam):
    z = prob(w,X)
    return -np.mean(y*np.log(z) + (1-y)*np.log(1-z) + 0.5*lam/X.shape[0]*np.sum(w*w))

In [8]:
def accuracy(theta, mu_jk, sigma_jk,X, y):
    K, D = sigma_jk.shape
    N, D  = X.shape
    y_pre = np.ones((N,1))
    count =0;
    for j in range(N):
        y_pre[j] = classify(theta, mu_jk, sigma_jk, X[j][:])
        if y_pre[j]==y[j]:
            count+=1
    return count/N

In [9]:
sum_acc_test = 0
aver_acc_test = 0
sum_acc_train = 0
aver_acc_train = 0
for i in range(1, 6):
    A = np.loadtxt('data_thua/train{}.txt'.format(i))
    X_train = np.array(A[:, 1: ])
    y_train = np.array(A[:, 0])
    print('len of dict folder {}: '.format(i), X_train.shape[1])
    T = np.loadtxt('data_thua/test{},txt'.format(i))
    X_test=np.array(T[:, 1: ])
    y_test=np.array(T[:, 0])
    theta, mu_jk, sigma_jk=training(X_train, y_train)
    
    acc_test = accuracy(theta, mu_jk, sigma_jk,X_test, y_test)
    
    acc_train = accuracy(theta, mu_jk, sigma_jk,X_train, y_train)
    
    sum_acc_test = sum_acc_test + acc_test
    sum_acc_train = sum_acc_train + acc_train
    
    print('accurracy_test of folder {}: '.format(i), acc_test, '')
    print('accurracy_train of folder {}: '.format(i), acc_train)
    
aver_acc_train = sum_acc_train/5
aver_acc_test = sum_acc_test/5

print('average accurracy_train : ', aver_acc_train)
print('average accurracy_test : ', aver_acc_test)

len of dict folder 1:  1888


  """
  """
  """


accurracy_test of folder 1:  0.5125 
accurracy_train of folder 1:  0.5176848874598071
len of dict folder 2:  1967
accurracy_test of folder 2:  0.55 
accurracy_train of folder 2:  0.5112540192926045
len of dict folder 3:  1875
accurracy_test of folder 3:  0.4 
accurracy_train of folder 3:  0.5434083601286174
len of dict folder 4:  1867
accurracy_test of folder 4:  0.65 
accurracy_train of folder 4:  0.4855305466237942
len of dict folder 5:  1929
accurracy_test of folder 5:  0.4647887323943662 
accurracy_train of folder 5:  0.5264797507788161
average accurracy_train :  0.5168715128567278
average accurracy_test :  0.5154577464788732
