In [1]:
import re
from tqdm import tqdm
from sklearn.utils import shuffle
import numpy as np
from tqdm import tqdm
import bz2
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
import pandas as pd

In [2]:
def splitReviewsLabels(lines):
    reviews = []
    labels = []
    for review in tqdm(lines):
        rev = reviewToX(review)
        label = reviewToY(review)
        reviews.append(rev[:512])
        labels.append(label)
    return reviews, labels

In [3]:
def reviewToY(review):
    return 0 if review.split(' ')[0] == '__label__1' else 1

In [4]:
def reviewToX(review):
    review = review.split(' ', 1)[1][:-1].lower()
    review = re.sub('\d','0',review)
    if 'www.' in review or 'http:' in review or 'https:' in review or '.com' in review:
        review = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", review)
    return review

In [6]:
train_file = bz2.BZ2File('train.ft.txt.bz2')
test_file = bz2.BZ2File('test.ft.txt.bz2')

In [7]:
train_lines = train_file.readlines()
test_lines = test_file.readlines()

In [8]:
train_lines = [x.decode('utf-8') for x in train_lines[:20000]]
test_lines = [x.decode('utf-8') for x in test_lines[:2000]]

In [9]:
train_x, train_y = splitReviewsLabels(train_lines)
test_x,test_y = splitReviewsLabels(test_lines)

100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 85975.28it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 74185.57it/s]


In [10]:
train_x[8]
print(train_y[8])

1


In [11]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [12]:
print(np.unique(train_y))

[0 1]


In [13]:
pi=np.array([sum(train_y==0)/len(train_y),sum(train_y==1)/len(train_y)])
pi

array([0.48715, 0.51285])

In [14]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(train_x)
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x).todense()
xtest_count =  count_vect.transform(test_x).todense()

In [16]:
wordFreq = pd.DataFrame(columns=['words','class1','class2'])
wordFreq['words'] = count_vect.get_feature_names()

x_train_class1 = xtrain_count[train_y==0]
x_train_class2 = xtrain_count[train_y==1]

count_class1 = np.sum(x_train_class1,axis=0)
count_class2 = np.sum(x_train_class2,axis=0)

vocab_size1 = len(np.where(count_class1==0)[1])
vocab_size2 = len(np.where(count_class2==0)[1])

alpha=10
count_class1 = np.array( (count_class1+alpha) /(np.sum(count_class1)+vocab_size1 +1))
count_class2 = np.array( (count_class2+alpha) /(np.sum(count_class2)+vocab_size2 +1))

wordFreq['class1'] = pd.Series(count_class1.ravel())
wordFreq['class2'] = pd.Series(count_class2.ravel())

In [17]:
train_preds = np.zeros(len(xtrain_count))
for i in range(len(xtrain_count)):
    idx = np.where(xtrain_count[i,:]!=0)[1]
    lh1 = wordFreq['class1'].iloc[idx].prod()
    lh2 = wordFreq['class2'].iloc[idx].prod()
    posterior1 = lh1*pi[0]
    posterior2 = lh2 * pi[1]

    if posterior1>posterior2:
        train_preds[i] = 0
    else:
        train_preds[i] = 1


matches = np.sum(train_y==train_preds)
print('Train accuracy is: '+str(matches/len(train_preds)))

Train accuracy is: 0.8814


In [18]:
test_preds = np.zeros(len(xtest_count))
for i in range(len(xtest_count)):
    idx = np.where(xtest_count[i,:]!=0)[1]
    lh1 = wordFreq['class1'].iloc[idx].prod()
    lh2 = wordFreq['class2'].iloc[idx].prod()
    posterior1 = lh1*pi[0]
    posterior2 = lh2 * pi[1]

    if posterior1>posterior2:
        test_preds[i] = 0
    else:
        test_preds[i] = 1

    temp = 1

matches = np.sum(test_y==test_preds)
print('Validation accuracy is: '+str(matches/len(test_preds)))

Validation accuracy is: 0.8385
