# Data preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
train_df = pd.read_csv("reddit-comment-classification-comp-551/reddit_train.csv")

In [3]:
test_df = pd.read_csv("reddit-comment-classification-comp-551/reddit_test.csv")

In [4]:
# label each categories
train_df.subreddits = pd.Categorical(train_df.subreddits)
train_df['label'] = train_df.subreddits.cat.codes
label_mapping = dict( enumerate(train_df['subreddits'].cat.categories ) )
train_df

Unnamed: 0,id,comments,subreddits,label
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,11
1,1,Ah yes way could have been :( remember when he...,nba,14
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,12
3,3,He wouldn't have been a bad signing if we woul...,soccer,16
4,4,Easy. You use the piss and dry technique. Let ...,funny,9
...,...,...,...,...
69995,69995,"Thank you, you confirm Spain does have nice pe...",europe,8
69996,69996,Imagine how many he would have killed with a r...,leagueoflegends,12
69997,69997,Yes. Only. As in the guy I was replying to was...,canada,6
69998,69998,Looking for something light-hearted or has a v...,anime,4


In [5]:
# split test and train data
X_train, X_test, y_train, y_test = train_test_split(train_df['comments'], train_df['label'], random_state=1)

In [None]:
# init vectorizer
cv = CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')

In [6]:
# vectorize the words
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)


<52500x61620 sparse matrix of type '<class 'numpy.int64'>'
	with 960433 stored elements in Compressed Sparse Row format>

<17500x61620 sparse matrix of type '<class 'numpy.int64'>'
	with 308971 stored elements in Compressed Sparse Row format>

# Implementing BNB

In [69]:
from sklearn.preprocessing import Binarizer
from sklearn.utils.validation import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from scipy import sparse

In [141]:
class BernoulliNB:
    def __init__(self):
        self.prob_class = None
        self.prob_x_c = None
        self.prob_x_cp = None

    def fit(self, X, y):
        X = self.binarize(X)
        classes = np.unique(y)
        probs_class = []
        probs_x_c = []
        probs_x_cp = []

        for c in classes:
            prob_class, prob_x_c, prob_x_cp = self.populate_class_stats(X, y, c)
            probs_class.append(prob_class)
            probs_x_c.append(prob_x_c)
            probs_x_cp.append(prob_x_cp)
        
        self.prob_class = np.asarray(probs_class)
        self.prob_x_c = np.asarray(probs_x_c)
        self.prob_x_cp = np.asarray(probs_x_cp)

    def binarize(self, X):
        binarizer = Binarizer().fit(X)
        return binarizer.transform(X)
    
    def populate_class_stats(self, X, y, c):
        X, y = check_X_y(X, y, accept_sparse="csr")
        Xc = X[y == c, :]
        Xcp = X[y != c, :]
        
        # P(c)
        prob_class = Xc.shape[0] / X.shape[0]
        
        # number of word x in class c
        Nxc = np.asarray(Xc.sum(axis=0))
        
        # number of word x not in class c
        Nxcp = np.asarray(Xcp.sum(axis=0))
        
        n_feat = Nxc.shape[1]
        
        # number of word x' in class c
        s_nxc = Nxc.sum()
        Nxpc = np.zeros(n_feat)
        for i in range(n_feat):
            Nxpc[i] = s_nxc - Nxc[0][i]
            
        # number of word x' in class c'
        s_nxcp = Nxcp.sum()
        Nxpcp = np.zeros(n_feat)
        for i in range(n_feat):
            Nxpcp[i] = s_nxcp - Nxc[0][i]
        
        # P(x | c)
        prob_x_c = (Nxc + 1) / (Nxpc + 2)
        
        # P(x | c')
        prob_x_cp = (Nxcp + 1) / (Nxpcp  + 2)

        return prob_class, prob_x_c.reshape(n_feat), prob_x_cp.reshape(n_feat)

    def predict(self, X):
        X = check_array(X, accept_sparse="csr")
        X = self.binarize(X)
        preds = []
        one = np.ones(X.shape[1])
        l1 = np.log(self.prob_x_c/self.prob_x_cp).T
        l2 = np.log((1-self.prob_x_c)/(1-self.prob_x_cp)).T
        for i in range(X.shape[0]):
            x = X.getrow(i)
            pred = np.argmax(np.log(self.prob_class) + \
                + np.sum(safe_sparse_dot(x, l1)) \
                + (one - x) @ l2)
            preds.append(pred)
        return np.asarray(preds)

# Running Bernoulli Naive Bayes

In [142]:
# from bnb.model import BernoulliNB
cls = BernoulliNB1()
cls.fit(X_train_cv, y_train)

In [143]:
cls.prob_class.shape
pred_y = cls.predict(X_test_cv)

In [144]:
pred_y

array([2, 2, 2, ..., 2, 2, 2])

In [145]:
print('Accuracy score: ', accuracy_score(y_test, pred_y))

Accuracy score:  0.05217142857142857


6670     Yeah but euron's about to bring cersei tyrion ...
49567    All of his videos are sarcastic and funny...hi...
50796    I love those scenes but it wouldn't have made ...
22310    You do get a smidge of hp for every point of c...
54037    New MMORPG lets you play as someone playing a ...
                               ...                        
32138    Don't want to get into a fight here, but all I...
53648    Its the key that opens the gate to the Obsidia...
64554    That Barry Zito would be a key part to the Gia...
33812    Inb4 triggered Lijang Tower suckers come and c...
30231    Is eating in public illegal too?\n\nIt is a my...
Name: comments, Length: 17500, dtype: object

# Running BNB

In [8]:
# from bnb.model import ClassStatistic
# cs = ClassStatistic()

In [8]:
from bnb.model import BernoulliNB
cls = BernoulliNB()
cls.fit(X_train_cv, y_train)

In [9]:
cls.prob_class

array([0.05049524, 0.04994286, 0.04927619, 0.05011429, 0.04954286,
       0.05066667, 0.05045714, 0.05028571, 0.05028571, 0.0499619 ,
       0.05064762, 0.05030476, 0.04942857, 0.04965714, 0.04948571,
       0.04990476, 0.04927619, 0.0495619 , 0.05038095, 0.05032381])

In [11]:
cls.prob_x_c.shape

(20, 1, 61620)

In [12]:
cls.prob_x_cp.shape

(20, 1, 61620)

In [20]:
from sklearn.utils.validation import check_X_y, check_array
# for i in range(X_test_cv.shape[0]):
#     x = X_test_cv.getrow(1)
#     print(x)

X_test_cv.getrow(1)

<1x61620 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [43]:
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_X_y
X, y = check_X_y(b_X, y_train, accept_sparse="csr")

<52500x61620 sparse matrix of type '<class 'numpy.int64'>'
	with 960433 stored elements in Compressed Sparse Row format>

In [44]:
X_i = X[y == 0, :]
X_i

<2651x61620 sparse matrix of type '<class 'numpy.int64'>'
	with 48968 stored elements in Compressed Sparse Row format>

In [42]:
X_train[y_train == 0]

4183     I saw a (I am guessing) 10 year old get hit in...
19950    Yes same here I was just bored and decided to ...
23570    Nothing in that Wikipedia article says anythin...
22072    Not a doctor or nurse, but I have a kind of we...
18008    "Well that just creams my corn" -almost father...
                               ...                        
37774    If you're gonna do some fruit-based drink at l...
64337    I think men age so much better than women. A g...
9552     **PLEASE READ THIS MESSAGE IN ITS ENTIRETY BEF...
1844     If the food police would arrest people who eat...
8444     Instructions unclear, could not locate the pen...
Name: comments, Length: 2651, dtype: object

In [None]:
def prob_class(X, y, c):
    t = X.shape[0]
    Xc = X[[y=c]]
    d = Xc.shape[0]
    return d/t

In [78]:
clf = BernoulliNB()

In [79]:
clf.fit(X_train_cv, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [80]:
preds = clf.predict(X_test_cv)

In [85]:
print('Accuracy score: ', accuracy_score(y_test, preds))

Accuracy score:  0.4948571428571429
