Inspiration from: 
- https://www.datahubbs.com/tf-idf-starting-learning-text/
- https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

I decided to use the Porter Stemmer because it does more than just strip a trailing "s" (e.g. kids -> kid), while its output still remains pretty legible and it doesn't overstem too much.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time

In [7]:
# Baseline vectorizer with no stemming: drops English stop words and counts
# both unigrams and bigrams.
cv = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
)

In [19]:
# Two-document toy corpus. The first document repeats 'kids' ten million
# times so that the per-token cost of stemming dominates the timings.
testdoc = pd.Series(
    data=[
        'There are kids playing' + ' ' + ' '.join(['kids'] * 10 ** 7),
        'Oh to be a kid who played',
    ]
)
testdoc

0    There are kids playing kids kids kids kids kid...
1                            Oh to be a kid who played
dtype: object

In [21]:
# Time the baseline (no stemming) fit. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, unlike the
# wall-clock time.time().
start = time.perf_counter()
transformed_nostem = cv.fit_transform(testdoc)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on older releases. tolist() keeps the printed output
# a plain Python list, as before.
if hasattr(cv, 'get_feature_names_out'):
    print(cv.get_feature_names_out().tolist())
else:
    print(cv.get_feature_names())
# Only two documents, so densifying the count matrix for display is cheap.
print(transformed_nostem.toarray())
print(time.perf_counter() - start)

['kid', 'kid played', 'kids', 'kids kids', 'kids playing', 'oh', 'oh kid', 'played', 'playing', 'playing kids']
[[       0        0 10000001  9999999        1        0        0        0
         1        1]
 [       1        1        0        0        0        1        1        1
         0        0]]
12.208767890930176


In [22]:
class StemmedDict(dict):
    """Dictionary that maps a word to its stem, computing each stem only once.

    Looking up a word that is not yet stored stems it with the module-level
    ``port`` stemmer, stores the result under that word, and returns it.
    ``port`` is resolved at lookup time, so it must be defined before the
    first lookup is made.
    """

    def __missing__(self, key):
        # dict calls this hook from self[key] only when `key` is absent.
        stem = port.stem(key)
        self[key] = stem
        return stem


# Shared word -> stem cache used by stem_words_dict.
stemmed = StemmedDict()

In [23]:
from nltk.stem import PorterStemmer
# Single stemmer instance shared by StemmedDict and stem_words.
port = PorterStemmer()
# Callable that turns a raw document into its unigrams and bigrams after
# stop-word removal, using the same settings as the baseline `cv`.
analyzer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
).build_analyzer()

def stem_words_dict(doc):
    """Return the n-grams of `doc` with every word replaced by its cached stem.

    `analyzer` produces the n-grams; each one is split on whitespace, every
    word is looked up in the `stemmed` cache (which stems it on first sight),
    and the stems are joined back together with single spaces.
    """
    stemmed_ngrams = []
    for ngram in analyzer(doc):
        stems = [stemmed[token] for token in ngram.split()]
        stemmed_ngrams.append(' '.join(stems))
    return stemmed_ngrams

def stem_words(doc):
    """Return the n-grams of `doc` with every word stemmed by `port`.

    `analyzer` produces the n-grams; each one is split on whitespace, every
    word is passed to `port.stem` (no caching, so repeated words are stemmed
    again each time), and the stems are joined back with single spaces.
    """
    stemmed_ngrams = []
    for ngram in analyzer(doc):
        stems = [port.stem(token) for token in ngram.split()]
        stemmed_ngrams.append(' '.join(stems))
    return stemmed_ngrams

In [24]:
# Vectorizer that uses the uncached stemming analyzer: stop-word removal and
# n-gram generation happen inside stem_words (via `analyzer`).
cv_stem = CountVectorizer(analyzer=stem_words)

In [25]:
# Time the fit with uncached stemming. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, unlike the
# wall-clock time.time().
start = time.perf_counter()
transformed_withstem = cv_stem.fit_transform(testdoc)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on older releases. tolist() keeps the printed output
# a plain Python list, as before.
if hasattr(cv_stem, 'get_feature_names_out'):
    print(cv_stem.get_feature_names_out().tolist())
else:
    print(cv_stem.get_feature_names())
# Only two documents, so densifying the count matrix for display is cheap.
print(transformed_withstem.toarray())
print(time.perf_counter() - start)

['kid', 'kid kid', 'kid play', 'oh', 'oh kid', 'play', 'play kid']
[[10000001  9999999        1        0        0        1        1]
 [       1        0        1        1        1        1        0]]
638.0937504768372


In [26]:
# Same experiment as the uncached run, but the analyzer looks stems up in the
# `stemmed` cache so each distinct word is stemmed only once.
cv_stem = CountVectorizer(analyzer=stem_words_dict)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals, unlike the wall-clock time.time().
start = time.perf_counter()
transformed_withstem = cv_stem.fit_transform(testdoc)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on older releases. tolist() keeps the printed output
# a plain Python list, as before.
if hasattr(cv_stem, 'get_feature_names_out'):
    print(cv_stem.get_feature_names_out().tolist())
else:
    print(cv_stem.get_feature_names())
# Only two documents, so densifying the count matrix for display is cheap.
print(transformed_withstem.toarray())
print(time.perf_counter() - start)

['kid', 'kid kid', 'kid play', 'oh', 'oh kid', 'play', 'play kid']
[[10000001  9999999        1        0        0        1        1]
 [       1        0        1        1        1        1        0]]
26.224079132080078
