In [1]:
import numpy as np
from scipy.sparse import coo_matrix # for sparse matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score # for evaluating results

- Number of words in the vocabulary (the number of feature columns)

In [31]:
# Vocabulary size: passed as the column count when the sparse feature matrices are built.
NWORDS = 2500

- Read labels 

In [9]:
def read_label_file(path):
    """Load integer labels from a text file that holds one label per line.

    Parameters
    ----------
    path : str
        Location of the label file.

    Returns
    -------
    list of int
        One entry per line of the file, in file order.

    Raises
    ------
    ValueError
        If any line (after stripping surrounding whitespace) is not an integer.
    """
    with open(path, "r") as handle:
        raw_lines = handle.readlines()

    labels = []
    for raw_line in raw_lines:
        # Surrounding whitespace, including the trailing newline, is dropped before parsing.
        labels.append(int(raw_line.strip()))
    return labels


In [14]:
# Load the labels of both splits (each file holds one integer per line).
train_labels = read_label_file("../../data/spam-datas/train-labels.txt")
test_labels = read_label_file("../../data/spam-datas/test-labels.txt")

# Sanity check: number of labels in the train split, then in the test split.
print(len(train_labels), len(test_labels), sep="\n")

700
260


- Read features

In [20]:
def read_feature_file(path):
    """Parse a whitespace-separated feature file into an ``(n, 3)`` integer array.

    The first three fields of every non-blank line are converted to ``int``
    and become one row of the result; any further fields on a line are
    ignored. Blank or whitespace-only lines (for example a trailing empty
    line at the end of the file) are skipped instead of aborting the load.

    The columns are presumably (document id, word id, count), which is how
    ``construct_coordinate_matrix`` consumes them -- confirm against the
    data set's own description.

    Parameters
    ----------
    path : str
        Location of the feature file.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(number_of_non_blank_lines, 3)``. An empty
        file yields an array of shape ``(0, 3)``.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three fields.
    ValueError
        If one of the first three fields is not an integer.
    """
    rows = []
    with open(path, "r") as f:
        # Stream the file line by line; no need to hold the raw text in memory.
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: carries no triple, so it contributes no row.
                continue
            rows.append((int(fields[0]), int(fields[1]), int(fields[2])))

    # reshape keeps the (n, 3) contract even when no rows were read.
    return np.array(rows, dtype=int).reshape(-1, 3)

In [29]:
# Load the feature triples of both splits as integer arrays (one row per file line).
train_features = read_feature_file("../../data/spam-datas/train-features.txt")
test_features = read_feature_file("../../data/spam-datas/test-features.txt")

# Sanity check: number of rows read for the train split, then for the test split.
print(len(train_features), len(test_features), sep="\n")


80248
27979


- Coordinate matrix:

![Alt text](image.png)

example:

![Alt text](image-1.png)


<img src="image-2.png"/>



In [36]:
def construct_coordinate_matrix(matrix, row_size, col_size):
    """Build a sparse COO matrix from an array of (row, column, value) triples.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array whose column 0 holds 1-based row ids, column 1 holds
        1-based column ids and column 2 holds the values to store.
    row_size : int
        Number of rows of the resulting matrix.
    col_size : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix of shape ``(row_size, col_size)``. In SciPy's COO format,
        repeated coordinates are summed when the matrix is converted to
        another format or to a dense array.
    """
    values = matrix[:, 2]

    # The ids in the data start at 1, whereas array positions start at 0,
    # so both coordinate columns are shifted down by one.
    coordinates = (matrix[:, 0] - 1, matrix[:, 1] - 1)

    return coo_matrix((values, coordinates), shape=(row_size, col_size))



In [56]:
# Build one sparse matrix per split: one row per label, NWORDS columns.
feat_train_vectors = construct_coordinate_matrix(
    matrix=train_features,
    row_size=len(train_labels),
    col_size=NWORDS,
)
feat_test_vectors = construct_coordinate_matrix(
    matrix=test_features,
    row_size=len(test_labels),
    col_size=NWORDS,
)

In [54]:
# Inspect the first training row: print the count stored for every word index.
# Only that single row is converted to a dense array; the rest of the matrix
# stays sparse. COO matrices cannot be indexed, hence the conversion to CSR.
first_row_counts = feat_train_vectors.tocsr()[0].toarray().ravel()

# enumerate supplies the word index, so no hand-maintained counter is needed,
# and a real name is used instead of `_` (which IPython reserves for the
# previous cell's output).
for word_idx, count in enumerate(first_row_counts):
    print(f"word index: {word_idx} ||| count: {count}")

word index: 0 ||| count: 0
word index: 1 ||| count: 0
word index: 2 ||| count: 0
word index: 3 ||| count: 0
word index: 4 ||| count: 0
word index: 5 ||| count: 0
word index: 6 ||| count: 0
word index: 7 ||| count: 0
word index: 8 ||| count: 0
word index: 9 ||| count: 0
word index: 10 ||| count: 0
word index: 11 ||| count: 0
word index: 12 ||| count: 0
word index: 13 ||| count: 0
word index: 14 ||| count: 0
word index: 15 ||| count: 0
word index: 16 ||| count: 0
word index: 17 ||| count: 0
word index: 18 ||| count: 2
word index: 19 ||| count: 0
word index: 20 ||| count: 0
word index: 21 ||| count: 0
word index: 22 ||| count: 0
word index: 23 ||| count: 0
word index: 24 ||| count: 0
word index: 25 ||| count: 0
word index: 26 ||| count: 0
word index: 27 ||| count: 0
word index: 28 ||| count: 0
word index: 29 ||| count: 0
word index: 30 ||| count: 0
word index: 31 ||| count: 0
word index: 32 ||| count: 0
word index: 33 ||| count: 0
word index: 34 ||| count: 0
word index: 35 ||| count: 0
wo

- Classifier

In [58]:
# Multinomial naive Bayes with scikit-learn's default settings; this model is
# intended for discrete count features such as word counts.
classifier = MultinomialNB()

In [60]:
# Train on the sparse training matrix; row i of the matrix is paired with train_labels[i].
classifier.fit(feat_train_vectors, train_labels)

In [62]:
# Predict one label per row of the test matrix; as the cell's last expression,
# the resulting array is displayed as the cell output.
classifier.predict(feat_test_vectors)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])