# Tokenization and vectorization

This example demonstrates tokenization and vectorization with several different methods:

- nltk
- scikit-learn
- gensim

## With NLTK

In [34]:
import nltk
import string

def tokenize(text):
    """
    Yield the lowercased, Snowball-stemmed word tokens of ``text``.

    Tokens made up entirely of punctuation characters are skipped, whether
    they are a single character (``.``) or several (``...``).

    Parameters
    ----------
    text : str
        Raw document text.

    Yields
    ------
    str
        The English Snowball stem of each non-punctuation token, in the
        order the tokens appear in ``text``.
    """
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        # `token in string.punctuation` would be a substring test against one
        # long string, so multi-character punctuation tokens such as '...'
        # would not match and would be stemmed as if they were words.
        # Checking every character catches those as well.
        if all(char in string.punctuation for char in token):
            continue
        yield stem.stem(token)

In [35]:
from collections import defaultdict

def vectorize(doc):
    """
    Build a bag-of-words frequency mapping for a single document.

    The document is passed through ``tokenize`` and every resulting token is
    tallied. The tally is returned as a ``defaultdict(int)`` keyed by token.
    """
    counts = defaultdict(int)
    for term in tokenize(doc):
        counts[term] = counts[term] + 1
    return counts

In [36]:
# Three short example documents used as input throughout the notebook.
corpus = [
    'The elephant sneezed at the sight of potatoes.',
    'Bats can see via echolocation. See the bat sight sneeze!',
    'Wondering, she opened the door to the studio.'
]

# Build a real list instead of a `map` object: a map is a one-shot iterator,
# so displaying it with list(...) would leave `vectors` exhausted and any
# later use of it would silently see no data.
vectors = [vectorize(doc) for doc in corpus]
vectors

[defaultdict(int,
             {'the': 2,
              'eleph': 1,
              'sneez': 1,
              'at': 1,
              'sight': 1,
              'of': 1,
              'potato': 1}),
 defaultdict(int,
             {'bat': 2,
              'can': 1,
              'see': 2,
              'via': 1,
              'echoloc': 1,
              'the': 1,
              'sight': 1,
              'sneez': 1}),
 defaultdict(int,
             {'wonder': 1,
              'she': 1,
              'open': 1,
              'the': 2,
              'door': 1,
              'to': 1,
              'studio': 1})]

## With Scikit-learn

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a CountVectorizer (default settings) on the corpus and transform it in
# one step. The result is a sparse matrix with one row per document.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)
vectors

<3x20 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

## With Gensim

In [29]:
import sys

# Run pip through the interpreter that backs this kernel (sys.executable) so
# gensim is installed into the same environment the notebook imports from.
!{sys.executable} -m pip install gensim



In [37]:
import gensim

# Tokenize into a new name instead of rebinding `corpus`. Overwriting the raw
# strings with token lists would make this cell fail when run a second time
# (tokenize would receive lists) and would change what `corpus` means for any
# earlier cell that is re-run afterwards.
tokenized_corpus = [list(tokenize(doc)) for doc in corpus]

# Dictionary assigns an integer id to each distinct token in the corpus.
id2word = gensim.corpora.Dictionary(tokenized_corpus)

# Encode every document as a sparse list of (token_id, count) pairs.
vectors = [
    id2word.doc2bow(doc) for doc in tokenized_corpus
]
vectors

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)],
 [(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)],
 [(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]