# Term Frequency - Inverse Document Frequency Vectorizer

In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import scipy
%matplotlib inline
plt.style.use('seaborn')

In [2]:
import sklearn
import matplotlib
import sys

# (display name, module) pairs whose versions are reported below.
libraries = (
    ('Matplotlib', matplotlib),
    ('Numpy', np),
    ('Pandas', pd),
    ('Scipy', scipy),
    ('Sklearn', sklearn),
)

# Record the interpreter and library versions this notebook was run with.
print("Python Version:", sys.version, '\n')
for lib_name, lib_module in libraries:
    print('{0} Version: {1}'.format(lib_name, lib_module.__version__))

Python Version: 3.6.2 |Anaconda custom (64-bit)| (default, Sep 21 2017, 18:29:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] 

Matplotlib Version: 2.0.2
Numpy Version: 1.12.1
Pandas Version: 0.20.3
Scipy Version: 0.19.1
Sklearn Version: 0.19.0


In [94]:
import numpy as np
from collections import Counter
from string import punctuation

class tfidf_vectorizer:
    """
    Minimal TF-IDF vectorizer: learns an ngram vocabulary with `fit`
    and turns documents into TF-IDF weighted vectors with `transform`.
    """

    # Translation table that deletes every character in string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

    def __init__(self, max_features=None, ngrams=(1, 1), tokenizer=None, remove_stopwords=False):
        """
        Term frequency, inverse document frequency vectorizer
        reads the text provided, tokenizes it with the provided
        tokenizer (or the default), then generates ngrams keeping
        track of all ngrams as the vocabulary. Then it takes provided
        texts and converts them into vectors by counting the
        appearance of each ngram and tracking that for every document.
        The counts are then scaled by the max term frequency and the
        inverse document frequency (see convert_counts_to_tf_idf).
        ---
        KWargs:
        max_features: how many ngrams to allow in the vector, using the
        most common features first. If None, defaults to using all
        ngrams (int)
        ngrams: how many tokens to combine to form features. First element
        of tuple is the smallest ngram size, second is the largest
        (both inclusive).
        tokenizer: what function to use to create tokens (must return
        list of tokens). If None, the built-in `tokenize` method is used.
        remove_stopwords: if True, the built-in tokenizer drops very
        common english words (and integer tokens). It has no effect on
        a user-supplied tokenizer.
        """
        self.max_features = max_features
        self.vocabulary = {}
        # Defined up front so transform() before fit() yields zero-width
        # vectors instead of raising AttributeError.
        self.token_to_id = {}
        self.id_to_token = {}
        self.ngrams = ngrams
        self.tokenizer = self.tokenize if tokenizer is None else tokenizer
        self.remove_stopwords = remove_stopwords
        self.stopwords = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 
                          'there', 'about', 'once', 'during', 'out', 'very', 'having', 
                          'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 
                          'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 
                          'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 
                          'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 
                          'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 
                          'himself', 'this', 'down', 'should', 'our', 'their', 'while', 
                          'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 
                          'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 
                          'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 
                          'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 
                          'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 
                          'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 
                          'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 
                          'was', 'here', 'than'}

    def token_generator(self, X):
        """
        Generator that returns joined tokens as a single
        string to act as a feature. For every allowed ngram size
        (smallest first) it slides a window of that size over the
        token list X and yields the window joined with spaces.
        """
        for size in range(self.ngrams[0], self.ngrams[1] + 1):
            for start in range(len(X)):
                # Once the window runs past the end, every later start does too.
                if start + size > len(X):
                    break
                yield ' '.join(X[start:start + size])

    @staticmethod
    def _is_integer(token):
        """Return True when `token` can be parsed by int()."""
        try:
            int(token)
        except ValueError:
            return False
        return True

    def tokenize(self, X):
        """
        Simple tokenizer that removes punctuation,
        lowercases the text, and breaks on whitespace.
        When remove_stopwords is True it additionally drops
        stopwords and tokens that parse as integers; otherwise
        every token is kept.
        ---
        Input: X (str)
        Output: list of token strings
        """
        tokens = X.translate(self._PUNCTUATION_TABLE).lower().split()
        if not self.remove_stopwords:
            return tokens
        return [token for token in tokens
                if not self.check_stopwords(token) and not self._is_integer(token)]

    def check_stopwords(self, token):
        """
        Checks if the token is in our list of common
        stopwords, and returns a boolean.
        """
        return token in self.stopwords

    def fit(self, X):
        """
        Go through all provided training documents and
        create the vocabulary by looking at all ngrams and
        tracking how often those ngrams appear. If max_features
        is defined, only keep the most common ngrams. Afterward,
        generate a token_to_id mapper and an id_to_token
        mapper, with ids assigned in sorted ngram order.

        Any vocabulary from a previous call is discarded, so fitting
        the same instance again does not accumulate counts.
        ---
        Input: X (iterable of documents accepted by the tokenizer)
        """
        counts = {}
        for document in X:
            tokens = self.tokenizer(document)
            for token in self.token_generator(tokens):
                counts[token] = counts.get(token, 0) + 1

        if self.max_features is not None:
            counts = dict(Counter(counts).most_common(self.max_features))

        self.vocabulary = counts
        ordered_tokens = sorted(counts)
        self.token_to_id = {token: ix for ix, token in enumerate(ordered_tokens)}
        self.id_to_token = dict(enumerate(ordered_tokens))

    def transform(self, X):
        """
        Go through all provided documents and use the known
        vocabulary to count how often each ngram appears in
        each document, then convert the counts to tf-idf.

        An all-zero row is placed ahead of the document rows and
        dropped again before returning. Because it is present while
        the weights are computed, it counts as one extra document, so
        the effective inverse document frequency is
        log((1 + number of documents)/(1 + documents containing ngram)),
        which is never negative. Both the column maxima and the
        document frequencies are taken from the documents passed to
        this call, not from the documents seen by fit.
        ---
        Input: X (iterable of documents accepted by the tokenizer)
        Output: array of shape (number of documents, vocabulary size)
        """
        number_of_features = len(self.vocabulary)
        # Collect rows and stack once; stacking inside the loop copies
        # the whole matrix for every document.
        rows = [np.zeros(number_of_features)]
        for document in X:
            vectorized_doc = np.zeros(number_of_features)
            for token in self.token_generator(self.tokenizer(document)):
                word_id = self.token_to_id.get(token)
                if word_id is not None:
                    vectorized_doc[word_id] += 1
            rows.append(vectorized_doc)
        return self.convert_counts_to_tf_idf(np.vstack(rows))[1:]

    def convert_counts_to_tf_idf(self, docs):
        """
        To convert from counts to TF-IDF, we first scale
        each value by the maximum in its own column. Then we calculate
        log(number of rows/(1+rows containing this ngram)).
        This is the inverse document frequency (the one is there to
        avoid division by 0). Each value is scaled as:
        term_frequency*inverse_document_frequency.

        Columns whose maximum is 0 (ngram absent from every row) are
        left as zeros rather than being divided by zero.
        ---
        Input: docs (2-D array of counts, one row per document)
        Output: new array of the same shape; docs is not modified
        """
        docs = np.asarray(docs, dtype=float)
        number_of_docs = docs.shape[0]
        if number_of_docs == 0 or docs.shape[1] == 0:
            # Nothing to weight; a max over zero rows would raise.
            return docs.copy()

        frequency_scalers = docs.max(axis=0)
        # Avoid 0/0 -> NaN for ngrams that never occur in these rows.
        frequency_scalers[frequency_scalers == 0] = 1.0
        number_of_docs_containing = (docs > 0).sum(axis=0)
        idf_terms = np.log(number_of_docs / (1 + number_of_docs_containing))

        return docs / frequency_scalers * idf_terms

    def fit_transform(self, X):
        """
        Fit on X and then transform X and return it as vectors.
        """
        self.fit(X)
        return self.transform(X)
                        

In [95]:
# Toy corpus: three short sentences, two of which share the phrase "to the store".
data = [
    'bob went to the store',
    'dana, did not go to the store',
    'the dog ran quickly toward the stoplight',
]

# Unigrams through trigrams, no cap on vocabulary size, stopwords kept (default).
cv = tfidf_vectorizer(ngrams=(1, 3), max_features=None)
cv.fit(data)

In [96]:
cv.vocabulary

{'bob': 1,
 'bob went': 1,
 'bob went to': 1,
 'dana': 1,
 'dana did': 1,
 'dana did not': 1,
 'did': 1,
 'did not': 1,
 'did not go': 1,
 'dog': 1,
 'dog ran': 1,
 'dog ran quickly': 1,
 'go': 1,
 'go to': 1,
 'go to the': 1,
 'not': 1,
 'not go': 1,
 'not go to': 1,
 'quickly': 1,
 'quickly toward': 1,
 'quickly toward the': 1,
 'ran': 1,
 'ran quickly': 1,
 'ran quickly toward': 1,
 'stoplight': 1,
 'store': 2,
 'the': 4,
 'the dog': 1,
 'the dog ran': 1,
 'the stoplight': 1,
 'the store': 2,
 'to': 2,
 'to the': 2,
 'to the store': 2,
 'toward': 1,
 'toward the': 1,
 'toward the stoplight': 1,
 'went': 1,
 'went to': 1,
 'went to the': 1}

In [97]:
cv.transform(data)

array([[ 0.69314718,  0.69314718,  0.69314718,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.28768207,  0.        ,  0.        ,  0.        ,  0.        ,
         0.28768207,  0.28768207,  0.28768207,  0.28768207,  0.        ,
         0.        ,  0.        ,  0.69314718,  0.69314718,  0.69314718],
       [ 0.        ,  0.        ,  0.        ,  0.69314718,  0.69314718,
         0.69314718,  0.69314718,  0.69314718,  0.69314718,  0.        ,
         0.        ,  0.        ,  0.69314718,  0.69314718,  0.69314718,
         0.69314718,  0.69314718,  0.69314718,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.28768207,  0.        ,  0.        ,  0.

# Now let's turn on stopword removal

In [98]:
# Same toy corpus as before, this time dropping stopwords during tokenization.
data = [
    'bob went to the store',
    'dana, did not go to the store',
    'the dog ran quickly toward the stoplight',
]

cv = tfidf_vectorizer(
    ngrams=(1, 3),
    max_features=None,
    remove_stopwords=True,
)
cv.fit(data)

In [99]:
cv.transform(data)

array([[ 0.69314718,  0.69314718,  0.69314718,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.28768207,  0.        ,
         0.        ,  0.69314718,  0.69314718],
       [ 0.        ,  0.        ,  0.        ,  0.69314718,  0.69314718,
         0.69314718,  0.        ,  0.        ,  0.        ,  0.69314718,
         0.69314718,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.28768207,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.69314718,  0.69314718,  0.69314718,  0.        ,
         0.        ,  0.69314718,  0.69314718,  0.69314718,  0.69314718,
         0.69314718,  0.69314718,  0.69314718,  0.        ,  0.69314718,
         0.69314718,  0.    

In [102]:
# Same toy corpus, now keeping only the 3 most frequent ngrams.
data = [
    'bob went to the store',
    'dana, did not go to the store',
    'the dog ran quickly toward the stoplight',
]

cv = tfidf_vectorizer(
    ngrams=(1, 3),
    max_features=3,
    remove_stopwords=True,
)
cv.fit(data)

In [103]:
cv.transform(data)

array([[ 0.69314718,  0.28768207,  0.69314718],
       [ 0.        ,  0.28768207,  0.        ],
       [ 0.        ,  0.        ,  0.        ]])

# Now a larger dataset

In [115]:
from sklearn import datasets

# Three newsgroups, training split only, with headers, footers and
# quoted replies stripped from each post.
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball']
ng_train = datasets.fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Raw post texts and their class labels.
data, targets = ng_train.data, ng_train.target

In [106]:
# Cap the vocabulary at the 100 most frequent ngrams (sizes 1-3), stopwords removed.
cv = tfidf_vectorizer(
    ngrams=(1, 3),
    max_features=100,
    remove_stopwords=True,
)
cv.fit(data)

In [109]:
# Label each column with its ngram, listed in column-id order.
feature_names = [cv.id_to_token[ix] for ix in range(len(cv.id_to_token))]
pd.DataFrame(cv.transform(data), columns=feature_names)

Unnamed: 0,3d,also,another,anyone,argument,atheists,available,back,believe,best,...,using,version,want,way,well,without,work,would,year,years
0,0.0,0.060062,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.156879,0.000000
1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.338281,...,0.000000,0.000000,0.000000,0.000000,0.107740,0.000000,0.000000,0.156603,0.000000,0.000000
5,0.0,0.000000,0.000000,0.677094,0.000000,0.000000,0.000000,0.000000,0.203246,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.338281,...,0.000000,0.000000,0.000000,0.168879,0.107740,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0.0,0.000000,0.000000,0.677094,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


We can see that this is a pretty sparse set of vectors. If we wanted to store it in a smaller format for transferring, we could convert it to a sparse matrix.

In [111]:
vectors = cv.transform(data)

In [112]:
from scipy import sparse

# Convert the dense TF-IDF array to SciPy's compressed sparse row (CSR)
# format, which stores only the non-zero entries.
sparse_matrix = sparse.csr_matrix(vectors)

In [114]:
print(sparse_matrix)

  (0, 1)	0.0600622718746
  (0, 30)	0.0676541449126
  (0, 44)	0.250110111277
  (0, 50)	0.190984416334
  (0, 53)	0.209208516415
  (0, 98)	0.156878796159
  (1, 58)	0.0373506549157
  (2, 32)	0.100777675501
  (2, 44)	0.250110111277
  (2, 80)	0.740940041508
  (3, 32)	0.0503888377504
  (3, 55)	0.145554801657
  (3, 73)	0.251050219698
  (4, 9)	0.338280846763
  (4, 12)	0.235298435836
  (4, 28)	0.139215386892
  (4, 45)	0.301699623291
  (4, 59)	0.0761633302385
  (4, 79)	1.60037511919
  (4, 84)	0.133016131743
  (4, 94)	0.107739634638
  (4, 97)	0.156602646202
  (5, 3)	0.677093970875
  (5, 8)	0.203246372177
  (5, 12)	0.235298435836
  :	:
  (1658, 70)	0.0938138908455
  (1658, 71)	0.215828160339
  (1658, 94)	0.215479269276
  (1658, 97)	0.156602646202
  (1659, 4)	0.112669626527
  (1659, 5)	0.0491388435469
  (1659, 8)	0.101623186089
  (1659, 46)	0.122745695774
  (1659, 51)	0.056707062465
  (1659, 55)	0.145554801657
  (1659, 58)	0.0373506549157
  (1659, 59)	0.0761633302385
  (1659, 65)	0.178638568468
  (1

##### Now let's use the output in a model.

In [120]:
import sys
# Extend the import search path with a directory given relative to the
# current working directory, so it only resolves when the notebook is run
# from its expected location. Presumably this is where the
# multinomial_naive_bayes module lives — verify against the repo layout.
sys.path.append('../../zwml/naive_bayes/')

In [122]:
# Project-local module, not part of scikit-learn; presumably resolved via
# the sys.path entry added earlier in the notebook — verify.
from multinomial_naive_bayes import multinomial_naive_bayes

# Created with its default arguments.
nb = multinomial_naive_bayes()

In [123]:
nb.fit(vectors, targets)

In [124]:
# Scored on the same vectors/targets the model was fit on, so this is a
# training-set score rather than a held-out estimate.
nb.score(vectors,targets)

0.7302829620710415

In [125]:
from sklearn.dummy import DummyClassifier

# Chance-level baseline: predictions are drawn at random according to the
# class distribution seen in fit. The strategy is spelled out because the
# library default differs between scikit-learn releases, and the seed is
# fixed so the baseline score is the same on every run.
dc = DummyClassifier(strategy='stratified', random_state=42)
dc.fit(vectors, targets)
dc.score(vectors, targets)

0.33955448524984949