In [1]:
# NLTK provides word_tokenize, which json_reader uses to split review text into tokens.
import nltk
# Download the 'punkt' NLTK data package (presumably the tokenizer models that
# word_tokenize relies on — confirm against the NLTK docs). The captured log shows
# this is a no-op when the package is already up to date.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /home/arpanmangal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import sys
import json
import time
import pickle
import numpy as np
import json

def json_reader(fname, max_docs=100001, tokenizer=None):
    """
        Stream documents from a JSON-lines file (one JSON object per line)
        Args:
            fname: str: input file
            max_docs: int or None: maximum number of documents to yield;
                      None removes the cap. The default of 100001 matches the
                      number of documents the original hard-coded limit yielded.
            tokenizer: callable or None: maps the review text to a sequence of
                       tokens; defaults to nltk's word_tokenize
        Returns:
            generator: iterator over documents, each a dict with
                       'rating' (int, from the 'stars' field) and
                       'review' (numpy array of tokens, from the 'text' field)
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    # The context manager guarantees the handle is closed when the generator is
    # exhausted, closed early, or garbage collected.
    with open(fname, mode="r", encoding="utf-8") as handle:
        yielded = 0
        for line in handle:
            if max_docs is not None and yielded >= max_docs:
                break
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads raise on them.
            if not line.strip():
                continue

            data = json.loads(line)
            rating = int(data['stars'])
            review = np.array(tokenizer(data['text']))
            yield {'rating': rating, 'review': review}
            yielded += 1
        

In [3]:
# Path to the JSON-lines review file that every json_reader pass below consumes.
# NOTE(review): the variable is called `trainset` but points at test.json, and the
# accuracy cell reads this same path — confirm this is the intended file.
trainset = './dataset/NB/test.json'

In [4]:
# Build the vocabulary: map every distinct token to an integer id,
# numbered in order of first appearance in the corpus.

tick = time.time()
dictionary = {}
for data in json_reader(trainset):
    for word in data['review']:
        # setdefault only inserts unseen tokens, so the current size of the
        # mapping is exactly the next free id.
        dictionary.setdefault(word, len(dictionary))

print (len(dictionary))

print ("Time Taken: ", time.time() - tick)

143206
Time Taken:  173.86873245239258


In [5]:
# Class priors: count the documents per rating (ratings 1..5 map to slots 0..4),
# normalise by the number of documents, then move to log space.
Phi = np.zeros(5)

totalRatings = 0
for totalRatings, data in enumerate(json_reader(trainset), start=1):
    Phi[data['rating'] - 1] += 1

Phi = Phi / totalRatings
print (Phi, totalRatings)

Phi = np.log(Phi)
print (Phi)

[0.15221848 0.08171918 0.10862891 0.21806782 0.43936561] 100001
[-1.88243844 -2.50446651 -2.21981767 -1.52294917 -0.8224234 ]


In [6]:
# Word-given-class tables with add-one smoothing: every numerator cell starts
# at 1 and every denominator cell starts at the vocabulary size V.
V = len(dictionary)
print (V)

tick = time.time()

ThetaNum = np.ones((V, 5))
ThetaDeno = np.full((V, 5), V, dtype=float)

def computeFreq (doc, rating):
    """
        Fold one document into the global ThetaNum / ThetaDeno count tables
        Args:
            doc: iterable of tokens, each present in the global `dictionary`
            rating: int: 1-based class label of the document
        Returns:
            int: always 0
    """
    col = rating - 1

    # Every vocabulary row of this class shares the same total-token denominator.
    ThetaDeno[:, col] += len(doc)

    for token in doc:
        ThetaNum[dictionary[token], col] += 1

    return 0
    
# Accumulate the counts of every document, then turn the smoothed ratios into
# log-likelihoods.
for data in json_reader(trainset):
    computeFreq (data['review'], data['rating'])

Theta = np.log(np.divide(ThetaNum, ThetaDeno))

np.set_printoptions(precision=2)
print (ThetaNum, ThetaDeno, Theta, sep='\n')

print ("Time Taken: ", time.time() - tick)


143206
[[4.10e+01 4.20e+01 5.90e+01 1.17e+02 1.95e+02]
 [5.36e+03 3.26e+03 4.58e+03 9.03e+03 1.41e+04]
 [9.27e+02 3.45e+02 2.76e+02 5.00e+02 1.22e+03]
 ...
 [1.00e+00 2.00e+00 1.00e+00 1.00e+00 1.00e+00]
 [1.00e+00 2.00e+00 1.00e+00 1.00e+00 1.00e+00]
 [1.00e+00 1.00e+00 1.00e+00 2.00e+00 1.00e+00]]
[[2611316. 1469889. 1792062. 3023462. 4555988.]
 [2611316. 1469889. 1792062. 3023462. 4555988.]
 [2611316. 1469889. 1792062. 3023462. 4555988.]
 ...
 [2611316. 1469889. 1792062. 3023462. 4555988.]
 [2611316. 1469889. 1792062. 3023462. 4555988.]
 [2611316. 1469889. 1792062. 3023462. 4555988.]]
[[-11.06 -10.46 -10.32 -10.16 -10.06]
 [ -6.19  -6.11  -5.97  -5.81  -5.78]
 [ -7.94  -8.36  -8.78  -8.71  -8.23]
 ...
 [-14.78 -13.51 -14.4  -14.92 -15.33]
 [-14.78 -13.51 -14.4  -14.92 -15.33]
 [-14.78 -14.2  -14.4  -14.23 -15.33]]
Time Taken:  283.6357834339142


In [7]:
# Predictions
def predictClass (doc, phi=None, theta=None, vocab=None):
    """
        Predict the rating of a tokenized review with the Naive Bayes model
        Args:
            doc: iterable of tokens of the review
            phi: array of per-class log priors; defaults to the global `Phi`
            theta: 2-D array where theta[w][k] is the log-likelihood of word
                   id w under class k; defaults to the global `Theta`
            vocab: dict: token -> row index into theta; defaults to the
                   global `dictionary`
        Returns:
            int: 1-based index of the class with the highest log score
    """
    if phi is None:
        phi = Phi
    if theta is None:
        theta = Theta
    if vocab is None:
        vocab = dictionary

    # Start every class at its log prior; the copy keeps the caller's array intact.
    probs = np.array(phi, dtype=float)
    for word in doc:
        w = vocab.get(word)
        if w is None:
            # Tokens never seen when the vocabulary was built carry no evidence
            # in this model; skip them instead of raising KeyError.
            continue
        # One row add updates all classes, with a single vocabulary lookup per
        # token instead of one per class.
        probs += theta[w]
    return np.argmax(probs) + 1

# Score the classifier on every document yielded by json_reader(trainset).
# NOTE(review): this is the same file the model parameters were estimated from,
# so the figure is not a held-out accuracy — confirm intent.
tick = time.time()

totalPredictions = 0
correctPredictions = 0
for data in json_reader(trainset):
    prediction = predictClass (data['review'])
    totalPredictions += 1
    if prediction == data['rating']:
        correctPredictions += 1

print (totalPredictions)
print ("Accuracy: %.2f%%" % (100 * correctPredictions / totalPredictions))
print ("Time Taken: ", time.time() - tick)

100001
Accuracy: 70.16%
Time Taken:  263.9955232143402
