In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import scipy
%matplotlib inline
plt.style.use('seaborn')

In [2]:
import sklearn
import matplotlib
import sys
# (display name, imported module) pairs whose versions are reported below.
libraries = (
    ('Matplotlib', matplotlib),
    ('Numpy', np),
    ('Pandas', pd),
    ('Scipy', scipy),
    ('Sklearn', sklearn),
)

# Interpreter version first, then one line per library.
print("Python Version:", sys.version, '\n')
for label, module in libraries:
    print('{0} Version: {1}'.format(label, module.__version__))

Python Version: 3.6.2 |Anaconda custom (64-bit)| (default, Sep 21 2017, 18:29:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] 

Matplotlib Version: 2.0.2
Numpy Version: 1.12.1
Pandas Version: 0.20.3
Scipy Version: 0.19.1
Sklearn Version: 0.19.0


In [125]:
import numpy as np
from collections import Counter
from string import punctuation

class count_vectorizer:
    """Bag-of-n-grams count vectorizer for an iterable of text documents.

    Documents are lower-cased, stripped of ASCII punctuation and split on
    whitespace. Every n-gram of length 1..``ngrams`` is counted; n-grams are
    the constituent tokens joined by a single space.

    Attributes set by ``fit``:
        vocabulary:  dict mapping n-gram -> number of occurrences in the
                     fitted corpus (limited to ``max_features`` entries).
        token_to_id: dict mapping n-gram -> column index; columns follow the
                     sorted order of the vocabulary keys.
        id_to_token: inverse mapping of ``token_to_id``.
    """

    # Translation table that deletes every character in string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

    def __init__(self, max_features=None, ngrams=1):
        """
        Parameters
        ----------
        max_features : int or None
            If not None, keep only this many most frequent n-grams.
        ngrams : int
            Largest n-gram length to generate (1 means unigrams only).
        """
        self.max_features = max_features
        self.ngrams = ngrams
        self.vocabulary = {}
        # Defined up front so transform() on an unfitted instance sees
        # consistent (empty) mappings.
        self.token_to_id = {}
        self.id_to_token = {}

    def token_generator(self, X):
        """Yield, for each position in the token sequence ``X``, the list of
        n-grams starting there: the token itself, then the 2-gram, 3-gram, ...
        up to ``self.ngrams`` tokens, skipping lengths that would run past the
        end of ``X``.
        """
        n_tokens = len(X)
        for ix, val in enumerate(X):
            ngrams_to_return = [val]
            # Longest n-gram that still fits between ix and the end of X.
            longest = min(self.ngrams, n_tokens - ix)
            for size in range(2, longest + 1):
                ngrams_to_return.append(' '.join(X[ix:ix + size]))
            yield ngrams_to_return

    def tokenizer(self, X):
        """Return the list of lower-cased, whitespace-separated tokens of the
        string ``X``. Punctuation characters are deleted, not replaced by
        whitespace, so ``"a,b"`` becomes the single token ``"ab"``.
        """
        return X.translate(self._PUNCTUATION_TABLE).lower().split()

    def fit(self, X):
        """Learn the n-gram vocabulary from the iterable of documents ``X``.

        Any previously learned vocabulary is discarded, so calling ``fit``
        again on the same instance does not accumulate counts.
        """
        counts = Counter()
        for document in X:
            tokens = self.tokenizer(document)
            for token_grams in self.token_generator(tokens):
                counts.update(token_grams)

        if self.max_features is not None:
            kept = counts.most_common(self.max_features)
        else:
            kept = counts.items()
        self.vocabulary = dict(kept)

        # Column order is the sorted order of the retained n-grams.
        ordered_tokens = sorted(self.vocabulary)
        self.token_to_id = {ky: ix for ix, ky in enumerate(ordered_tokens)}
        self.id_to_token = dict(enumerate(ordered_tokens))

    def transform(self, X):
        """Return a float array of shape ``(n_documents, n_vocabulary)`` whose
        entry ``[d, j]`` is the number of times n-gram ``id_to_token[j]``
        occurs in document ``d``. N-grams outside the vocabulary are ignored.
        An empty ``X`` yields an array with zero rows.
        """
        # Materialise first so generators are supported and the row count is
        # known before allocating the result once (no per-document vstack).
        documents = list(X)
        vectorized_docs = np.zeros((len(documents), len(self.vocabulary)))
        for row, document in enumerate(documents):
            tokens = self.tokenizer(document)
            for token_grams in self.token_generator(tokens):
                for token in token_grams:
                    word_id = self.token_to_id.get(token)
                    if word_id is not None:
                        vectorized_docs[row, word_id] += 1
        return vectorized_docs

    def fit_transform(self, X):
        """Fit the vocabulary on ``X`` and return the count matrix for ``X``.

        ``X`` is iterated twice (once by ``fit``, once by ``transform``), so
        it should be a re-iterable collection rather than a one-shot iterator.
        """
        self.fit(X)
        return self.transform(X)
                        

In [126]:
# NOTE(review): `cv` is first assigned in a later cell (In [139]); on a fresh
# kernel this cell raises NameError. The recorded output (20) also does not
# match the 24-entry vocabulary displayed further down, so it appears to be
# left over from an earlier session.
len(cv.vocabulary.keys())

20

In [139]:
# Two-document toy corpus; the second document contains a comma, which
# tokenizer() deletes before splitting.
data = [
    'bob went to the store',
    'dana, did not go to the store',
]
# Count unigrams through trigrams and keep the whole vocabulary.
cv = count_vectorizer(max_features=None, ngrams=3)
cv.fit(data)

In [140]:
# Inspect the fitted vocabulary: n-gram -> number of occurrences in `data`.
cv.vocabulary

{'bob': 1,
 'bob went': 1,
 'bob went to': 1,
 'dana': 1,
 'dana did': 1,
 'dana did not': 1,
 'did': 1,
 'did not': 1,
 'did not go': 1,
 'go': 1,
 'go to': 1,
 'go to the': 1,
 'not': 1,
 'not go': 1,
 'not go to': 1,
 'store': 2,
 'the': 2,
 'the store': 2,
 'to': 2,
 'to the': 2,
 'to the store': 2,
 'went': 1,
 'went to': 1,
 'went to the': 1}

In [141]:
# Count matrix for the two documents; columns follow the sorted vocabulary keys.
cv.transform(data)

array([[ 1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
         1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.]])

In [142]:
# Displayed again after transform(): transform() reads the vocabulary but does
# not write to it, so the counts are the same as after fit().
cv.vocabulary

{'bob': 1,
 'bob went': 1,
 'bob went to': 1,
 'dana': 1,
 'dana did': 1,
 'dana did not': 1,
 'did': 1,
 'did not': 1,
 'did not go': 1,
 'go': 1,
 'go to': 1,
 'go to the': 1,
 'not': 1,
 'not go': 1,
 'not go to': 1,
 'store': 2,
 'the': 2,
 'the store': 2,
 'to': 2,
 'to the': 2,
 'to the store': 2,
 'went': 1,
 'went to': 1,
 'went to the': 1}

In [143]:
from sklearn.feature_extraction.text import CountVectorizer

# Reference result: scikit-learn's vectorizer over the same 1- to 3-gram range.
# fit() returns the estimator itself, so it can be chained onto the constructor.
cv_sk = CountVectorizer(ngram_range=(1, 3), max_features=None).fit(data)
cv_sk.transform(data).todense()

matrix([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0]])

In [144]:
# Sanity check: the hand-written vectorizer should produce the same count
# matrix as scikit-learn's CountVectorizer on this corpus.
np.allclose(cv_sk.transform(data).todense(), cv.transform(data))

True