# Hashing trick
The hashing trick works by applying a hash function to the features and using their hash values 
as indices directly, rather than building a dictionary.

In [1]:
import numpy as np
from stemming.porter2 import stem
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer

## Create a set of documents

In [2]:
# Toy corpus: two short sentences that share the word "sandwich".
docs = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich',
]

## Define a stemming tokenizer

In [3]:
def tokenizer_porter(doc):
    """Split ``doc`` on whitespace and return the list of stemmed tokens.

    Each whitespace-separated word is passed through ``stem`` (imported from
    ``stemming.porter2``); the results are returned in their original order.
    """
    return list(map(stem, doc.split()))

## Vectorize the documents with the hashing trick

In [4]:
# Hash every stemmed token directly into one of 10 columns (no vocabulary is built).
# `alternate_sign=False` replaces the former `non_negative=True` option, which was
# deprecated in scikit-learn 0.19 and removed in 0.21; it keeps every count >= 0.
# `norm=None` leaves the raw term counts un-normalized.
vectorizer = HashingVectorizer(stop_words='english', tokenizer=tokenizer_porter,
                               n_features=10, norm=None, alternate_sign=False)
# HashingVectorizer is stateless, so `transform` can be called without `fit`.
y = vectorizer.transform(docs).toarray()
y

array([[ 2.,  2.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.]])

The default value of `n_features` is $2^{20}$, which is adequate for most problems. 
Here, it is set to 10 so that the matrix is small enough to print while still displaying all of the nonzero features.

## Vectorize the documents with tf-idf

In [5]:
# Re-weight the hashed term counts in `y` with tf-idf.
tr = TfidfTransformer()
# Learn the idf weights from `y`, then apply them to the same matrix and densify for display.
z = tr.fit(y).transform(y).toarray()
z

array([[ 0.53689271,  0.75458397,  0.        ,  0.37729199,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.44943642,  0.        ,  0.        ,  0.        ,  0.6316672 ,
         0.6316672 ,  0.        ,  0.        ,  0.        ,  0.        ]])