In [1]:
# import stuff
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk import FreqDist
import numpy as np

In [2]:
# Load the four pre-split arrays stored under imdb/ (x_* inputs, then y_* targets).
# NOTE(review): if the x_* files hold object arrays (e.g. ragged Python lists),
# NumPy >= 1.16.3 only loads them with allow_pickle=True -- confirm against the files.
x_train_all, x_test_all, y_train, y_test = [
    np.load(npy_path)
    for npy_path in (
        'imdb/x_train.npy',
        'imdb/x_test.npy',
        'imdb/y_train.npy',
        'imdb/y_test.npy',
    )
]

In [3]:
# Function to build one FreqDist (item -> count) per row of the input

def freqdist(x_all):
    """Build one frequency distribution per row of ``x_all``.

    Each row is counted from a sorted *copy*, so the caller's data is left
    untouched (sorting the rows in place would silently reorder the loaded
    dataset).  Counting the sorted copy keeps the key insertion order of
    every ``FreqDist`` ascending, and a single pass over ``x_all`` means
    one-shot iterables are handled correctly as well.

    Parameters
    ----------
    x_all : iterable of iterables
        One collection of hashable, mutually orderable items per row.
        NOTE(review): presumably lists of integer word indices -- confirm
        against the loaded arrays.

    Returns
    -------
    list of FreqDist
        Element ``i`` holds the item counts of row ``i`` of ``x_all``.
    """
    return [FreqDist(sorted(row)) for row in x_all]

In [4]:
# Function to obtain the top-K features: the counts stored under keys 1..K of each frequency distribution

def topk(k,fdistx):
    """Turn per-document frequency distributions into a dense count matrix.

    Column ``j`` of the result holds, for every document, the count stored
    under the integer key ``j + 1``; keys outside ``1..k`` are ignored and
    keys that are absent from a document count as 0.

    The matrix is preallocated and filled from the entries each document
    actually contains, instead of performing ``k`` lookups per document and
    building a nested Python list first.  As a consequence an empty
    ``fdistx`` yields an array of shape ``(0, k)`` that keeps the feature
    axis.

    Parameters
    ----------
    k : int
        Number of feature columns (keys ``1..k`` are kept).
        NOTE(review): presumably keys are frequency-ranked word indices, so
        ``1..k`` are the k most frequent words -- confirm against how the
        dataset was encoded.
    fdistx : iterable of mapping
        One ``FreqDist``/``Counter``-like mapping of key -> integer count per
        document.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(number of documents, k)``.
    """
    fdists = list(fdistx)      # accept any iterable; its length is needed up front
    n_features = max(k, 0)     # a non-positive k simply produces zero columns
    features = np.zeros((len(fdists), n_features), dtype=int)

    for doc_idx, fdist in enumerate(fdists):
        for key, count in fdist.items():
            # Only integer keys can address a column; anything else was never
            # looked up by the 1..k scan either.
            if isinstance(key, (int, np.integer)) and 1 <= key <= k:
                features[doc_idx, key - 1] = count

    return features


In [5]:
def GaussianNaiveBayes(train_x=None, train_y=None, test_x=None, test_y=None):
    """Fit a Gaussian naive Bayes classifier and score it on a test split.

    Every argument is optional.  An argument left as ``None`` falls back to
    the notebook-level variable of the matching role (``x_train``,
    ``y_train``, ``x_test``, ``y_test``), so a bare ``GaussianNaiveBayes()``
    call keeps working.  Passing the data explicitly removes the dependency
    on hidden notebook state.

    Parameters
    ----------
    train_x, train_y : array-like, optional
        Inputs and targets handed to ``GaussianNB.fit``.
    test_x, test_y : array-like, optional
        Inputs handed to ``GaussianNB.predict`` and the true targets the
        predictions are scored against.
        NOTE(review): ``precision_score`` / ``recall_score`` are called with
        their defaults, which presumably requires binary labels -- confirm.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)`` of the predictions on the test
        split.
    """
    # Fall back to the notebook globals for anything the caller did not pass.
    if train_x is None:
        train_x = x_train
    if train_y is None:
        train_y = y_train
    if test_x is None:
        test_x = x_test
    if test_y is None:
        test_y = y_test

    model = GaussianNB()
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)

    return (
        accuracy_score(test_y, predictions),
        precision_score(test_y, predictions),
        recall_score(test_y, predictions),
    )

In [None]:
# Count the items of every row once per split; the counts are reused for each K.
# Progress messages that end without a newline are flushed so they show up
# before the slow step that follows them.
print('Calculating freqdist of x_train & x_test...',end='',flush=True)
fd_xtrain = freqdist(x_train_all)
fd_xtest = freqdist(x_test_all)
print('done.')

# Feature counts (K) to compare.
TOP_K_VALUES = [100,1000,10000]

for k in TOP_K_VALUES:

    print('~' * 10, ' K = %d '% k, '~' * 10)

    print('Obtaining frequency of top-%d words in x_train...' % k, end='', flush=True)
    x_train = topk(k,fd_xtrain)
    print('done.')

    print('Obtaining frequency of top-%d words in x_test...' % k,end='', flush=True)
    x_test = topk(k,fd_xtest)
    print('done.')

    # GaussianNaiveBayes() is called without arguments and picks up the
    # notebook-level x_train / x_test assigned just above together with
    # y_train / y_test, so those names must stay at notebook level.
    print('Training gnb model...',end='', flush=True)
    accuracy,precision,recall = GaussianNaiveBayes()
    print('done.')
    print('Accuracy = %.5f, Precision = %.5f, Recall = %.5f\n' % (accuracy,precision,recall))

# The trailing input('Press any key to exit.') was removed: inside a notebook
# it blocks "Restart & Run All" until someone answers the prompt, and it raises
# StdinNotImplementedError when the notebook is executed without a stdin channel.

Calculating freqdist of x_train & x_test...done.
~~~~~~~~~~  K = 100  ~~~~~~~~~~
Obtaining frequency of top-100 words in x_train


100%|█████████████████████████████████████████████████████████████████████████| 25000/25000 [00:00<00:00, 46512.61it/s]


Obtaining frequency of top-100 words in x_test


100%|█████████████████████████████████████████████████████████████████████████| 25000/25000 [00:00<00:00, 52933.95it/s]


Training gnb model...done.
Accuracy = 0.69, Precision = 0.71, Recall = 0.66

~~~~~~~~~~  K = 1000  ~~~~~~~~~~
Obtaining frequency of top-1000 words in x_train


100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:05<00:00, 4378.90it/s]


Obtaining frequency of top-1000 words in x_test


100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:05<00:00, 4182.05it/s]


Training gnb model...done.
Accuracy = 0.81, Precision = 0.82, Recall = 0.79

~~~~~~~~~~  K = 10000  ~~~~~~~~~~
Obtaining frequency of top-10000 words in x_train


100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:59<00:00, 422.48it/s]


Obtaining frequency of top-10000 words in x_test


100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [01:03<00:00, 395.20it/s]


Training gnb model...done.
Accuracy = 0.66, Precision = 0.77, Recall = 0.46

