## Setup


In [None]:
# import warnings
# warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib
import matplotlib.pyplot as plt

## Default Style Settings
matplotlib.rcParams['figure.dpi'] = 150
pd.options.display.max_colwidth = 200

In [None]:
# Downloads
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Bag-of-Words Representation
In `CountVectorizer()`, we can utilize its parameters:

- `min_df`: When building the vocabulary, the vectorizer will ignore terms that have a **document frequency** strictly lower than the given threshold. `float` = the parameter represents a proportion of documents; `integer` = absolute counts.

- `max_df`: When building the vocabulary, the vectorizer will ignore terms that have a **document frequency** strictly higher than the given threshold (corpus-specific stop words). `float` = the parameter represents a proportion of documents; `integer` = absolute counts.

- `max_features`: Build a vocabulary that only considers the top `max_features` terms, ordered by term frequency across the corpus.

The CountVectorizer will select the words/features/terms that occur most frequently. It takes an absolute number, so if you set `max_features=3`, it will select the 3 most common words in the data.

- `ngram_range` : The lower and upper boundary of the range of n-values for different word n-grams. `tuple` (min_n, max_n), default=(1, 1).

- `token_pattern`: Regular expression denoting what constitutes a "token" in the vocabulary. The default regexp selects tokens of 2 or more alphanumeric characters (Note: **punctuation** is completely ignored and always treated as a token separator).

- `lowercase`: Converts all characters to lowercase before tokenizing. It takes a boolean value; the default is `True`.

- `stop_words`: Use sklearn's built-in stop word list, e.g.
`CountVectorizer(stop_words='english')`

- `binary`: Binary Term Frequency
Binary Term Frequency captures presence (1) or absence (0) of term in document. 
By setting `binary=True`, the CountVectorizer no longer takes the frequency of the term/word into account. If the term occurs, it is set to 1, otherwise to 0. By default, `binary` is set to `False`. This is usually used when the count of the term/word does not provide useful information to the machine learning model.

### Word encodings

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
words = ['robot', 'woman', 'man']
# BOW features in sparse format
# Create vectorizer
cv = CountVectorizer(min_df=0., max_df=1.)

# Create vector by passing the text corpus into the vectorizer to get back counts
cv_matrix = cv.fit_transform(words)
cv_matrix

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [None]:
# Now, we can inspect how our vectorizer vectorized the text. Output a list of words used, and their index in the vectors
cv.vocabulary_

{'man': 0, 'robot': 1, 'woman': 2}

In [None]:
# Check out your vocabulary (the feature names learned by the vectorizer)
vocab = cv.get_feature_names_out()
vocab

array(['man', 'robot', 'woman'], dtype=object)

In [None]:
print(cv_matrix) # document, index, count
# Read as follows: The document zero, which is 'robot', has the vocabulary item 'robot' indexed by 1 once.

  (0, 1)	1
  (1, 2)	1
  (2, 0)	1


In [None]:
cv_matrix = cv_matrix.toarray()
print(cv_matrix)

[[0 1 0]
 [0 0 1]
 [1 0 0]]


In [None]:
pd.DataFrame(cv_matrix, columns=vocab) # rows are the documents, columns are the features

Unnamed: 0,man,robot,woman
0,0,1,0
1,0,0,1
2,1,0,0


In [None]:
# Try out changing the input words
# Add a new word
# Add the same word

In [None]:
# Refit on a corpus in which 'man' appears as two separate documents.
words_a = ['robot', 'woman', 'man', 'man']
cv_matrix = cv.fit_transform(words_a).toarray()
# Re-read the vocabulary from the freshly fitted vectorizer so the column
# labels always match the matrix, even when new words are added to the input.
vocab = cv.get_feature_names_out()
pd.DataFrame(cv_matrix, columns=vocab) # rows are the documents, columns are the features

Unnamed: 0,man,robot,woman
0,0,1,0
1,0,0,1
2,1,0,0
3,1,0,0


In [None]:
words_b = ['robot', 'woman', 'man, man']
#cv = CountVectorizer(min_df=0., max_df=1., binary=True)
cv_matrix = cv.fit_transform(words_b)
print(cv_matrix)

# Check out your vocab
vocab = cv.get_feature_names_out()
print(vocab)

  (0, 1)	1
  (1, 2)	1
  (2, 0)	2
['man' 'robot' 'woman']


In [None]:
# Inspect new dataframe
cv_matrix = cv.fit_transform(words_b).toarray()
pd.DataFrame(cv_matrix, columns=vocab)               # rows are the documents, columns are the features

Unnamed: 0,man,robot,woman
0,0,1,0
1,0,0,1
2,2,0,0


### Binarizer 
In Scikit-Learn, one-hot encoding is implemented with the Binarizer transformer in the preprocessing module. The Binarizer takes only numeric data, so the text data must be transformed into a numeric space using the CountVectorizer ahead of one-hot encoding. The Binarizer class uses a threshold value (0 by default) such that all values of the vector that are less than or equal to the threshold are set to zero, while those that are greater than the threshold are set to 1. Therefore, by default, the Binarizer converts all frequency values to 1 while maintaining the zero-valued frequencies.

In [None]:
from sklearn.preprocessing import Binarizer

cv   = CountVectorizer()

cv_matrix = cv.fit_transform(words_b)                 # words_b = ['robot', 'woman', 'man, man']
vocab = cv.get_feature_names_out()

onehot = Binarizer()
oh_matrix = onehot.fit_transform(cv_matrix.toarray())
print(oh_matrix)
pd.DataFrame(oh_matrix, columns=vocab)

[[0 1 0]
 [0 0 1]
 [1 0 0]]


Unnamed: 0,man,robot,woman
0,0,1,0
1,0,0,1
2,1,0,0


The .toarray() method is optional; it converts the sparse matrix representation to a dense one. In corpora with large vocabularies, the sparse matrix representation is much better. Note that we could also use CountVectorizer(binary=True) to achieve one-hot encoding in the above, obviating the Binarizer.

### Corpus
We use a small play corpus (list of sentences). Each sentence represents a document and has a label (topic). You already know these topics from earlier classes.


In [None]:
corpus = [
    'This is a ripe toasty wine.', 
    'A roasty, toasty wine with notes of mocha',
    'Spiced coconut chicken with coriander and salt.',
    'Coriander chicken pasta.',
    'Great dress for vacation.',
    'Perfect dress, perfect fit!'
]
labels = [
    'wine', 'wine', 'food', 'food', 'clothing', 'clothing']

corpus, labels

(['This is a ripe toasty wine.',
  'A roasty, toasty wine with notes of mocha',
  'Spiced coconut chicken with coriander and salt.',
  'Coriander chicken pasta.',
  'Great dress for vacation.',
  'Perfect dress, perfect fit!'],
 ['wine', 'wine', 'food', 'food', 'clothing', 'clothing'])

In [None]:
# List to np.array
corpus = np.array(corpus)

In [None]:
# Create dataframe
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})
corpus_df

Unnamed: 0,Document,Category
0,This is a ripe toasty wine.,wine
1,"A roasty, toasty wine with notes of mocha",wine
2,Spiced coconut chicken with coriander and salt.,food
3,Coriander chicken pasta.,food
4,Great dress for vacation.,clothing
5,"Perfect dress, perfect fit!",clothing


### Text Preprocessing

- Remove special characters
- Normalize letter case
- Remove redundant spaces
- Tokenize each document into word-tokens
- Remove stop words
- All these preprocessing steps are wrapped in one function, `preprocess_document()`.

In [None]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def preprocess_document(doc):
    """Normalize a raw document for bag-of-words vectorization.

    The text is stripped of every character that is not an ASCII letter or
    whitespace, lower-cased, trimmed, tokenized with the module-level
    tokenizer ``wpt`` and filtered against the module-level ``stop_words``.

    Args:
        doc: Raw document text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # remove special characters (anything that is not a letter or whitespace).
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub() is `count`, so a positional `re.I | re.A` would be read as
    # "replace at most 258 matches" and the flags would be ignored.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    # normalize letter case and trim surrounding whitespace
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [None]:
# Apply the preprocessing function with a random sentence first
doc_luzern = "Luzern is a beautiful town with a lot of nice places to visit."
preprocess_document(doc_luzern)

'luzern beautiful town lot nice places visit'

In [None]:
# Let's go back to our original play corpus now and preprocess the first document: 'This is a ripe toasty wine.'
preprocess_document(corpus[0])

'ripe toasty wine'

What elements have been removed via preprocessing? What other preprocessing steps can you notice here?

In [None]:
normalize_corpus = np.vectorize(preprocess_document) ## The `vectorize` function is provided primarily for convenience, not for performance. The implementation is essentially a for loop.

In [None]:
# Preprocess the small corpus
norm_corpus = normalize_corpus(corpus)
print(corpus)
print("="*50)
print(norm_corpus)

['This is a ripe toasty wine.' 'A roasty, toasty wine with notes of mocha'
 'Spiced coconut chicken with coriander and salt.'
 'Coriander chicken pasta.' 'Great dress for vacation.'
 'Perfect dress, perfect fit!']
['ripe toasty wine' 'roasty toasty wine notes mocha'
 'spiced coconut chicken coriander salt' 'coriander chicken pasta'
 'great dress vacation' 'perfect dress perfect fit']


### `CountVectorizer()` from `sklearn`

Learn:
https://towardsdatascience.com/natural-language-processing-count-vectorization-with-scikit-learn-e7804269bb5e

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# BOW features in sparse format
# Create vectorizer
cv = CountVectorizer(min_df=0., max_df=1.)

# Create vector by passing the text corpus into the vectorizer to get back counts
cv_matrix = cv.fit_transform(norm_corpus)

In [None]:
# Now, we can inspect how our vectorizer vectorized the text. Output a list of words used, and their index in the vectors
cv.vocabulary_

{'chicken': 0,
 'coconut': 1,
 'coriander': 2,
 'dress': 3,
 'fit': 4,
 'great': 5,
 'mocha': 6,
 'notes': 7,
 'pasta': 8,
 'perfect': 9,
 'ripe': 10,
 'roasty': 11,
 'salt': 12,
 'spiced': 13,
 'toasty': 14,
 'vacation': 15,
 'wine': 16}

In [None]:
len(norm_corpus)

6

In [None]:
# Check out your corpus' vocab only
vocab = cv.get_feature_names_out()
vocab

array(['chicken', 'coconut', 'coriander', 'dress', 'fit', 'great',
       'mocha', 'notes', 'pasta', 'perfect', 'ripe', 'roasty', 'salt',
       'spiced', 'toasty', 'vacation', 'wine'], dtype=object)

In [None]:
len(vocab)
# Dimensionality equals the size of vocab

17

In [None]:
# Non-zero feature positions in the sparse matrix. Which document has which vocabulary item and how many times.
print(cv_matrix)

  (0, 10)	1
  (0, 14)	1
  (0, 16)	1
  (1, 14)	1
  (1, 16)	1
  (1, 11)	1
  (1, 7)	1
  (1, 6)	1
  (2, 13)	1
  (2, 1)	1
  (2, 0)	1
  (2, 2)	1
  (2, 12)	1
  (3, 0)	1
  (3, 2)	1
  (3, 8)	1
  (4, 5)	1
  (4, 3)	1
  (4, 15)	1
  (5, 3)	1
  (5, 9)	2
  (5, 4)	1


In [None]:
# View representation
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Inspect document feature vectors
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,chicken,coconut,coriander,dress,fit,great,mocha,notes,pasta,perfect,ripe,roasty,salt,spiced,toasty,vacation,wine
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1
1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,0,1
2,1,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0
5,0,0,0,1,1,0,0,0,0,2,0,0,0,0,0,0,0


### Ngrams Representation

In [None]:
# ngram_range=(2, 2) extracts bigrams only; set ngram_range=(1, 2) to get
# unigrams as well as bigrams.
bv = CountVectorizer(ngram_range=(2, 2))
bv_matrix = bv.fit_transform(norm_corpus)

bv_matrix = bv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
vocab = bv.get_feature_names_out()
pd.DataFrame(bv_matrix, columns=vocab)



Unnamed: 0,chicken coriander,chicken pasta,coconut chicken,coriander chicken,coriander salt,dress perfect,dress vacation,great dress,notes mocha,perfect dress,perfect fit,ripe toasty,roasty toasty,spiced coconut,toasty wine,wine notes
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1
2,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
3,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0


## Tf-Idf



### `TfidfVectorizer()` from `sklearn`

In [None]:
# Once you have more documents, you can specify more parameters
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0.,
                     max_df=1.,
                     norm='l2',
                     use_idf=True,
                     smooth_idf=True,
                     ngram_range= (1,2))

tv_matrix = tv.fit_transform(norm_corpus)

tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names_out()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,chicken,chicken coriander,chicken pasta,coconut,coconut chicken,coriander,coriander chicken,coriander salt,dress,dress perfect,dress vacation,fit,great,great dress,mocha,notes,notes mocha,pasta,perfect,perfect dress,perfect fit,ripe,ripe toasty,roasty,roasty toasty,salt,spiced,spiced coconut,toasty,toasty wine,vacation,wine,wine notes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.41,0.41,0.0,0.41,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35,0.35,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.35,0.35,0.0,0.0,0.0,0.29,0.29,0.0,0.29,0.35
2,0.28,0.35,0.0,0.35,0.35,0.28,0.0,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35,0.35,0.35,0.0,0.0,0.0,0.0,0.0
3,0.39,0.0,0.48,0.0,0.0,0.39,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38,0.0,0.46,0.0,0.46,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28,0.34,0.0,0.34,0.0,0.0,0.0,0.0,0.0,0.0,0.68,0.34,0.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Step-by-Step of TF-IDF 🏠

The following shows the creation and computation of the tf-idf matrix step by step. Please go over the code on your own to practice the tf-idf calculation.

#### Create Vocabulary Dictionary of the Corpus

In [None]:
# Collect the distinct tokens of the normalized corpus; each one is a feature.
unique_words = list({token for text in norm_corpus for token in text.split()})

# Template feature dictionary: every vocabulary word starts with a count of 0.
def_feature_dict = dict.fromkeys(unique_words, 0)

print('Feature Names:', unique_words)
print('Default Feature Dict:', def_feature_dict)

Feature Names: ['dress', 'roasty', 'toasty', 'notes', 'chicken', 'perfect', 'fit', 'mocha', 'coriander', 'ripe', 'coconut', 'pasta', 'vacation', 'spiced', 'salt', 'great', 'wine']
Default Feature Dict: {'dress': 0, 'roasty': 0, 'toasty': 0, 'notes': 0, 'chicken': 0, 'perfect': 0, 'fit': 0, 'mocha': 0, 'coriander': 0, 'ripe': 0, 'coconut': 0, 'pasta': 0, 'vacation': 0, 'spiced': 0, 'salt': 0, 'great': 0, 'wine': 0}


#### Create Document-Word Matrix (Bag-of-Word Frequencies)

In [None]:
from collections import Counter
# Build one bag-of-words record (term frequencies) per document.
bow_features = []
for doc in norm_corpus:
    # count how often each token occurs in the current document
    bow_feature_doc = Counter(doc.split())
    # zero-count entry for every word of the corpus vocabulary
    all_features = Counter(def_feature_dict) 
    
    # Counter.update() adds counts: existing counts stay unchanged (+0) and
    # vocabulary words missing from this document are inserted with count 0.
    bow_feature_doc.update(all_features)
    
    # collect the completed record of the current document
    bow_features.append(bow_feature_doc)

# one row per document, one column per vocabulary word
bow_features = pd.DataFrame(bow_features)
bow_features

Unnamed: 0,ripe,toasty,wine,dress,roasty,notes,chicken,perfect,fit,mocha,coriander,coconut,pasta,vacation,spiced,salt,great
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,1,0
3,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1
5,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0


#### Compute Document Frequency of Words

In [None]:
import scipy.sparse as sp
feature_names = list(bow_features.columns)

# build the document frequency vector: number of documents containing each term
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
# `csc_matrix()` compresses `bow_features` into a column-oriented sparse matrix that stores only the non-zero entries
# `csc_matrix.indices` stores the row indices of the stored values, column by column
# `csc_matrix.indptr` stores the cumulative number of stored values from column 0 to the right-most column
# `np.diff(indptr)` is therefore the number of non-zero entries per column, i.e. the document frequency of each term

df = 1 + df  # adding 1 to smooth the idf later

# show smoothed document frequencies
pd.DataFrame([df], columns=feature_names)

Unnamed: 0,ripe,toasty,wine,dress,roasty,notes,chicken,perfect,fit,mocha,coriander,coconut,pasta,vacation,spiced,salt,great
0,2,3,3,3,2,2,3,2,2,2,3,2,2,2,2,2,2


#### Create Inverse Document Frequency of Words

In [None]:
# Smoothed inverse document frequency per term:
#   idf(t) = ln((1 + N) / (1 + df(t))) + 1
# `df` already carries its +1, so only the document count N is incremented here.
total_docs = len(norm_corpus) + 1
idf = np.log(total_docs / df) + 1.0

# Display the idf values rounded to two decimals.
pd.DataFrame([idf.round(2)], columns=feature_names)

Unnamed: 0,ripe,toasty,wine,dress,roasty,notes,chicken,perfect,fit,mocha,coriander,coconut,pasta,vacation,spiced,salt,great
0,2.25,1.85,1.85,1.85,2.25,2.25,1.85,2.25,2.25,2.25,1.85,2.25,2.25,2.25,2.25,2.25,2.25


#### Compute Raw TF-IDF for Each Document

In [None]:
# Term frequencies as a float matrix (rows = documents, columns = terms).
tf = np.array(bow_features, dtype='float64')
# Broadcasting scales each term column by that term's idf weight.
tfidf = idf * tf
# Display the raw tf-idf matrix rounded to two decimals.
pd.DataFrame(tfidf.round(2), columns=feature_names)

Unnamed: 0,ripe,toasty,wine,dress,roasty,notes,chicken,perfect,fit,mocha,coriander,coconut,pasta,vacation,spiced,salt,great
0,2.25,1.85,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.85,1.85,0.0,2.25,2.25,0.0,0.0,0.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.85,0.0,0.0,0.0,1.85,2.25,0.0,0.0,2.25,2.25,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.85,0.0,0.0,0.0,1.85,0.0,2.25,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.25,0.0,0.0,2.25
5,0.0,0.0,0.0,1.85,0.0,0.0,0.0,4.51,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
