# Build Matrix of Token Counts

In [1]:
# Toy corpus: four short documents used to illustrate how a
# document-term count matrix is built.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

## Custom Count Vectorizer

In [2]:
import re

def get_words(doc):
    """Split a document into lowercase word tokens.

    The text is lowercased, then every maximal run of one or more word
    characters (letters, digits and underscore) becomes a token, so
    punctuation and whitespace act as separators.

    Note: single-character tokens are kept. scikit-learn's CountVectorizer
    default token pattern only keeps tokens of two or more characters, so
    the two can disagree on corpora containing one-letter words.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens in order of appearance (duplicates preserved); an empty
        list when ``doc`` contains no word characters.
    """
    # Raw string literal: '\w' inside a plain string is an invalid escape
    # sequence (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12+).
    # `\w+` is equivalent to the previous `\w{1,}`.
    return re.findall(r'\w+', doc.lower())

# Quick check: tokenize the first document of the corpus.
get_words(corpus[0])

['this', 'is', 'the', 'first', 'document']

In [3]:
# Tokenize every document (one token list per document), then gather the
# distinct tokens across the whole corpus in sorted order.
corpus_words = list(map(get_words, corpus))
corpus_vocab = sorted({token for tokens in corpus_words for token in tokens})
corpus_vocab

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [4]:
# Map each vocabulary word to its position in `corpus_vocab`.
corpus_dict = dict(zip(corpus_vocab, range(len(corpus_vocab))))
corpus_dict

{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'one': 4,
 'second': 5,
 'the': 6,
 'third': 7,
 'this': 8}

In [5]:
from scipy.sparse import lil_matrix
import numpy as np

# One row per document, one column per vocabulary word.
num_samples, num_features = len(corpus_words), len(corpus_vocab)

# Start from an all-zero sparse matrix and bump the (document, word) cell
# once for every token occurrence.
X1 = lil_matrix((num_samples, num_features), dtype=np.int64)
for row, tokens in enumerate(corpus_words):
    for token in tokens:
        col = corpus_dict[token]
        X1[row, col] += 1

# Densify only for display.
X1.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

## Scikit-learn Count Vectorizer

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit scikit-learn's CountVectorizer (default settings) on the same toy
# corpus and transform it into a document-term count matrix.
vectorizer = CountVectorizer()
X2 = vectorizer.fit_transform(corpus)
X2.toarray()  # densify only for display

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

Check that the custom matrix `X1` and the scikit-learn matrix `X2` are identical: the norm of their difference should be exactly 0.

In [7]:
from numpy.linalg import norm

# Frobenius norm of the element-wise difference between the two count
# matrices; 0.0 means every entry agrees.
norm(X1.toarray() - X2.toarray())

0.0

## Spam Corpus

In [8]:
import sys
sys.path.append('..')

from spam_corpus import read_spam_corpus
# Load the spam corpus and drop rows containing missing values in one
# expression. Using the frame returned by `dropna()` instead of
# `inplace=True` avoids mutating whatever object `read_spam_corpus()`
# handed back and keeps this cell safe to re-run.
spam_corpus = read_spam_corpus().dropna()
spam_corpus.head()

Unnamed: 0,text,label
0,\n> From: fork-admin@xent.com [mailto:fork-adm...,0
1,"Hi,\n\nOn Sun, 01 Sep 2002 00:05:03 MDT Reg Cl...",0
2,"On Fri Sep 13 2002 at 02:03, Robert Elz wrote:...",0
3,"On Thu, 2002-08-29 at 01:06, Matthias Saou wro...",0
4,"Hi, I'm building an rpm for the resin webserve...",0


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a fresh CountVectorizer on the spam corpus text column.
# NOTE: this rebinds `vectorizer`, replacing the one fitted on the toy corpus.
vectorizer = CountVectorizer()

# Sparse document-term count matrix: one row per message.
X = vectorizer.fit_transform(spam_corpus["text"])
X

<10606x81895 sparse matrix of type '<class 'numpy.int64'>'
	with 1609110 stored elements in Compressed Sparse Row format>