# Stemming
Stemming is the process of transforming a word into its root form; this allows us to map related
words to the same stem.

In [1]:
import numpy as np
from stemming.porter2 import stem
from sklearn.feature_extraction.text import CountVectorizer

## Stemming example

In [2]:
# 'run' is already in its root form, so the stemmer returns it unchanged.
stem('run')

'run'

In [3]:
# The inflected form 'running' is reduced to the stem 'run'.
stem('running')

'run'

## Create a set of documents

In [4]:
# Toy corpus of three short sentences. The first contains two inflections of
# the same word ('shining' / 'shines'); the third reuses words from the other two.
docs = [
    'The sun is shining and thus it shines',
    'The weather is sweet',
    'The sun is shining and the weather is sweet',
]

## Define a stemming tokenizer

In [5]:
def tokenizer_porter(doc):
    """Split ``doc`` on whitespace and stem every resulting token.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list
        The result of ``stem(token)`` for each whitespace-separated token,
        in their original order. An empty or all-whitespace ``doc`` yields
        an empty list.

    Notes
    -----
    Tokens are exactly what ``doc.split()`` produces, so punctuation attached
    to a word is passed to the stemmer as part of that word.
    """
    stems = []
    for token in doc.split():
        stems.append(stem(token))
    return stems

## Vectorize the documents

In [6]:
# Count stemmed tokens, using `tokenizer_porter` in place of the default
# regex-based tokenizer. `token_pattern=None` states explicitly that the default
# pattern is not used, which avoids scikit-learn's "token_pattern will not be
# used since tokenizer is not None" warning on recent releases.
vectorizer = CountVectorizer(
    stop_words='english',
    tokenizer=tokenizer_porter,
    token_pattern=None,
)

In [7]:
# Learn the vocabulary from `docs` and build the document-term count matrix,
# then convert it to a dense array so it can be displayed directly.
bag = (
    vectorizer
    .fit_transform(docs)
    .toarray()
)
# One row per document, one column per vocabulary term.
bag

array([[2, 1, 0, 0],
       [0, 0, 1, 1],
       [1, 1, 1, 1]])

## Vocabulary

In [8]:
# Mapping from each (stemmed) term to its feature index, i.e. its column in `bag`.
vectorizer.vocabulary_

{u'shine': 0, u'sun': 1, u'sweet': 2, u'weather': 3}