In [None]:
import pandas as pd

In [None]:
# Toy corpus: four short documents used to fit and inspect the vectorizer.
X_train = [
    "call you tonight",
    "call me a cab",
    "please call me... PLEASE",
    "he called the police",
]
X_train

We will use scikit-learn's `CountVectorizer` to convert text into a matrix of word counts.

In [None]:
# import CountVectorizer, which converts text into a matrix of word counts
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer() # instantiate with default parameters (nothing is overridden here)

In [None]:
# "learn the vocabulary" of the training documents; the fitted vocabulary is inspected in the next cell
vect.fit(X_train)

In [None]:
# examine the fitted vocabulary (the bare last expression is shown as the cell output)
vect.get_feature_names_out()

In [None]:
# convert the training data into a 'document-term matrix':
# one row per document in X_train, one column per term in the fitted vocabulary
X_train_dtm = vect.transform(X_train)
# presumably a sparse matrix (a later cell calls .toarray() to view the counts) -- confirm
X_train_dtm

In [None]:
# materialize the document-term matrix as a plain array so the individual counts are visible
X_train_dtm.toarray()

In [None]:
# Label the counts for readability: rows are the training documents,
# columns are the vocabulary terms learned by the vectorizer.
pd.DataFrame(
    data=X_train_dtm.toarray(),
    index=X_train,
    columns=vect.get_feature_names_out(),
)

In [None]:
# Transform an unseen document with the already-fitted vectorizer
# (transform only; the vectorizer is not re-fitted on the test text).
X_test = ["please don't call me"]
X_test_dtm = vect.transform(X_test)
pd.DataFrame(
    data=X_test_dtm.toarray(),
    index=X_test,
    columns=vect.get_feature_names_out(),
)

### Tuning the Vectorizer

**stop_words:** Stop words are words like [I, a, an, this, the, ...] that don't add much meaning to a sentence. We can remove them to reduce the number of features.

In [None]:
# Re-create the vectorizer with the built-in English stop-word list,
# fit it on the training documents, and show the resulting vocabulary.
vect = CountVectorizer(stop_words='english').fit(X_train)
vect.get_feature_names_out()

In [None]:
# list of scikit-learn's built-in English stop words
# (imported from the public `text` module instead of the private `_stop_words`
# module, whose location is an implementation detail and may change)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# sorted() accepts any iterable and returns a new list, so no list() copy is needed
sorted(ENGLISH_STOP_WORDS)

**ngram_range:** An n-gram is a sequence of n consecutive words. For example, "apple juice" is a 2-gram (aka a bigram), and "I love apple juice" is a 4-gram (aka a four-gram). The ngram_range parameter is a `(min_n, max_n)` tuple that specifies the range of n-gram sizes we want to include in our features. The examples above used the default, ngram_range=(1, 1), which keeps only unigrams; ngram_range=(2, 2) would keep only bigrams. The example below uses ngram_range=(1, 3) to include unigrams, bigrams, and trigrams.

In [14]:
# Build the vocabulary from every 1-, 2- and 3-word sequence in the training documents.
vect = CountVectorizer(ngram_range=(1, 3)).fit(X_train)
vect.get_feature_names_out()

array(['cab', 'call', 'call me', 'call me cab', 'call me please',
       'call you', 'call you tonight', 'called', 'called the',
       'called the police', 'he', 'he called', 'he called the', 'me',
       'me cab', 'me please', 'please', 'please call', 'please call me',
       'police', 'the', 'the police', 'tonight', 'you', 'you tonight'],
      dtype=object)

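**max_df / min_df:** When building the vocabulary, we can set the maximum document frequency (max_df) and minimum document frequency (min_df). A term's document frequency is the number of documents that contain it; a float threshold is interpreted as a proportion of the documents, an integer as an absolute count. If a term's document frequency is below min_df OR above max_df, the term is ignored. This allows us to exclude terms that are too rare or too common to be useful.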

In [None]:
# max_df=0.5: drop every term that occurs in more than 50% of the documents,
# then fit on the training documents and show what is left in the vocabulary.
vect = CountVectorizer(max_df=0.5).fit(X_train)
vect.get_feature_names_out()