# Week 5 - Text Feature Engineering

## Setup and Text Cleanup
Code starts on page 203.

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', None, 'display.max_columns', None)

# building a corpus of documents
corpus = [
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    'A king\'s breakfast has sausages, ham, bacon, eggs, toast, and beans.',
    'I love green eggs, ham, saussages, and bacon!',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today.',
    'The dog is lazy but the brown fox is quick!'
]
labels = ['weather', 'weather', 'animals', 'food', 'food', 'animals',
          'weather', 'animals']

corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
print(corpus_df, '\n')

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('English')

def normalize_document(doc):
    # lowercase and remove special characters\whitespace
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    #tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

norm_corpus = normalize_corpus(corpus)
print(norm_corpus, '\n')

                                            Document Category
0                     The sky is blue and beautiful.  weather
1                  Love this blue and beautiful sky!  weather
2       The quick brown fox jumps over the lazy dog.  animals
3  A king's breakfast has sausages, ham, bacon, e...     food
4      I love green eggs, ham, saussages, and bacon!     food
5   The brown fox is quick and the blue dog is lazy!  animals
6  The sky is very blue and the sky is very beaut...  weather
7        The dog is lazy but the brown fox is quick!  animals 

['sky blue beautiful' 'love blue beautiful sky'
 'quick brown fox jumps lazy dog'
 'kings breakfast sausages ham bacon eggs toast beans'
 'love green eggs ham saussages bacon' 'brown fox quick blue dog lazy'
 'sky blue sky beautiful today' 'dog lazy brown fox quick'] 



## Bag of Words Model
Starting on page 208

In [2]:
print('Bag of Words Model')
# starting on page 208
from sklearn.feature_extraction.text import CountVectorizer
# get bag of words features in sparse format
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
# view non-zero feature positions in the sparse matrix
print(cv_matrix, '\n')

# view dense representation
# warning - might give a memory error if the data is too big
cv_matrix = cv_matrix.toarray()
print(cv_matrix, '\n')

# get all unique words in the corpus
vocab = cv.get_feature_names()
#show document feature vectors
cv_df = pd.DataFrame(cv_matrix, columns=vocab)
print(cv_df, '\n')

# you can set the n-gram range to 1,2 to get unigrams as well as bigrams
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()
vocab = bv.get_feature_names()
bv_df = pd.DataFrame(bv_matrix, columns=vocab)
print(bv_df, '\n')

Bag of Words Model
  (0, 18)	1
  (0, 3)	1
  (0, 2)	1
  (1, 18)	1
  (1, 3)	1
  (1, 2)	1
  (1, 14)	1
  (2, 15)	1
  (2, 5)	1
  (2, 8)	1
  (2, 11)	1
  (2, 13)	1
  (2, 6)	1
  (3, 12)	1
  (3, 4)	1
  (3, 16)	1
  (3, 10)	1
  (3, 0)	1
  (3, 7)	1
  (3, 19)	1
  (3, 1)	1
  (4, 14)	1
  (4, 10)	1
  (4, 0)	1
  (4, 7)	1
  (4, 9)	1
  (4, 17)	1
  (5, 3)	1
  (5, 15)	1
  (5, 5)	1
  (5, 8)	1
  (5, 13)	1
  (5, 6)	1
  (6, 18)	2
  (6, 3)	1
  (6, 2)	1
  (6, 20)	1
  (7, 15)	1
  (7, 5)	1
  (7, 8)	1
  (7, 13)	1
  (7, 6)	1 

[[0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0]
 [0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0]
 [1 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 0]
 [0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0]
 [0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1]
 [0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0]] 

   bacon  beans  beautiful  blue  breakfast  brown  dog  eggs  fox  green  \
0      0      0          1     1          0      0    0   

## Tf-Idf Transformer - Starting on Page 213

In [3]:
print('tfidf transformer:')
from sklearn.feature_extraction.text import TfidfTransformer
tt = TfidfTransformer(norm = 'l2', use_idf=True)
tt_matrix = tt.fit_transform(cv_matrix)
tt_matrix = tt_matrix.toarray()
vocab = cv.get_feature_names()
print(pd.DataFrame(np.round(tt_matrix, 2), columns=vocab), '\n')

tfidf transformer:
   bacon  beans  beautiful  blue  breakfast  brown   dog  eggs   fox  green  \
0   0.00   0.00       0.60  0.53       0.00   0.00  0.00  0.00  0.00   0.00   
1   0.00   0.00       0.49  0.43       0.00   0.00  0.00  0.00  0.00   0.00   
2   0.00   0.00       0.00  0.00       0.00   0.38  0.38  0.00  0.38   0.00   
3   0.31   0.38       0.00  0.00       0.38   0.00  0.00  0.31  0.00   0.00   
4   0.38   0.00       0.00  0.00       0.00   0.00  0.00  0.38  0.00   0.46   
5   0.00   0.00       0.00  0.37       0.00   0.42  0.42  0.00  0.42   0.00   
6   0.00   0.00       0.36  0.32       0.00   0.00  0.00  0.00  0.00   0.00   
7   0.00   0.00       0.00  0.00       0.00   0.45  0.45  0.00  0.45   0.00   

    ham  jumps  kings  lazy  love  quick  sausages  saussages   sky  toast  \
0  0.00   0.00   0.00  0.00  0.00   0.00      0.00       0.00  0.60   0.00   
1  0.00   0.00   0.00  0.00  0.57   0.00      0.00       0.00  0.49   0.00   
2  0.00   0.53   0.00  0.38  0.00  

## tfidfvectorizer, page 214

In [6]:
print('tfidf vectorizer:')
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(min_df=0., max_df=1., norm='l2', use_idf=True, smooth_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()
# this part is not in the book - save the tv_matrix for use later on
import os
np.save('tv_matrix.npy', tv_matrix)

vocab = tv.get_feature_names()
print(pd.DataFrame(np.round(tv_matrix, 2), columns=vocab), '\n')

tfidf vectorizer:
   bacon  beans  beautiful  blue  breakfast  brown   dog  eggs   fox  green  \
0   0.00   0.00       0.60  0.53       0.00   0.00  0.00  0.00  0.00   0.00   
1   0.00   0.00       0.49  0.43       0.00   0.00  0.00  0.00  0.00   0.00   
2   0.00   0.00       0.00  0.00       0.00   0.38  0.38  0.00  0.38   0.00   
3   0.31   0.38       0.00  0.00       0.38   0.00  0.00  0.31  0.00   0.00   
4   0.38   0.00       0.00  0.00       0.00   0.00  0.00  0.38  0.00   0.46   
5   0.00   0.00       0.00  0.37       0.00   0.42  0.42  0.00  0.42   0.00   
6   0.00   0.00       0.36  0.32       0.00   0.00  0.00  0.00  0.00   0.00   
7   0.00   0.00       0.00  0.00       0.00   0.45  0.45  0.00  0.45   0.00   

    ham  jumps  kings  lazy  love  quick  sausages  saussages   sky  toast  \
0  0.00   0.00   0.00  0.00  0.00   0.00      0.00       0.00  0.60   0.00   
1  0.00   0.00   0.00  0.00  0.57   0.00      0.00       0.00  0.49   0.00   
2  0.00   0.53   0.00  0.38  0.00   

## Understanding the TF-DF Model - starting on page 215

In [9]:
# get unique words as feature names
# different output than book
unique_words = list(set([word for doc in [doc.split() for doc in norm_corpus] for word in doc]))
def_feature_dict = {w: 0 for w in unique_words}
print('Feature Names:', unique_words)
print('Default Feature Dict:', def_feature_dict, '\n')

# page 216
from collections import Counter
# build bag of words features for each document - term frequencies
bow_features = []
for doc in norm_corpus:
    bow_feature_doc = Counter(doc.split())
    all_features = Counter(def_feature_dict)
    bow_feature_doc.update(all_features)
    bow_features.append(bow_feature_doc)

bow_features = pd.DataFrame(bow_features)
print('BOW features:\n', bow_features)

Feature Names: ['blue', 'sky', 'beautiful', 'love', 'eggs', 'beans', 'jumps', 'lazy', 'quick', 'saussages', 'ham', 'today', 'green', 'bacon', 'kings', 'breakfast', 'toast', 'sausages', 'dog', 'fox', 'brown']
Default Feature Dict: {'blue': 0, 'sky': 0, 'beautiful': 0, 'love': 0, 'eggs': 0, 'beans': 0, 'jumps': 0, 'lazy': 0, 'quick': 0, 'saussages': 0, 'ham': 0, 'today': 0, 'green': 0, 'bacon': 0, 'kings': 0, 'breakfast': 0, 'toast': 0, 'sausages': 0, 'dog': 0, 'fox': 0, 'brown': 0} 

BOW features:
    sky  blue  beautiful  love  eggs  beans  jumps  lazy  quick  saussages  \
0    1     1          1     0     0      0      0     0      0          0   
1    1     1          1     1     0      0      0     0      0          0   
2    0     0          0     0     0      0      1     1      1          0   
3    0     0          0     0     1      1      0     0      0          0   
4    0     0          0     1     1      0      0     0      0          1   
5    0     1          0     0     0

## Document Frequencies - starting on page 216

In [10]:
import scipy.sparse as sp 
feature_names = list(bow_features.columns)

# build the document frequency matrix
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
df = 1 + df # adding 1 to smoothen idf later

# show smoothened document frequencies
print('Smooth DF:\n', pd.DataFrame([df], columns=feature_names), '\n')

Smooth DF:
    sky  blue  beautiful  love  eggs  beans  jumps  lazy  quick  saussages  \
0    4     5          4     3     3      2      2     4      4          2   

   ham  today  green  bacon  kings  breakfast  toast  sausages  dog  fox  \
0    3      2      2      3      2          2      2         2    4    4   

   brown  
0      4   



## IDF - page 217

In [11]:
# compute inverse document frequencies
total_docs = 1 + len(norm_corpus)
idf = 1.0 + np.log(float(total_docs) / df) 

# show smoothened IDFs
print('Smooth IDFs:\n', pd.DataFrame([np.round(idf, 2)], columns=feature_names), '\n')

Smooth IDFs:
     sky  blue  beautiful  love  eggs  beans  jumps  lazy  quick  saussages  \
0  1.81  1.59       1.81   2.1   2.1    2.5    2.5  1.81   1.81        2.5   

   ham  today  green  bacon  kings  breakfast  toast  sausages   dog   fox  \
0  2.1    2.5    2.5    2.1    2.5        2.5    2.5       2.5  1.81  1.81   

   brown  
0   1.81   



## Tdf-Idf

In [12]:
# compute idf diagonal matrix
total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
idf_dense = idf_diag.todense()

# print the idf diagonal matrix
print('Diagonal matrix:\n', pd.DataFrame(np.round(idf_dense, 2)), '\n')

# compute tfidf feature matrix - page 218
tf = np.array(bow_features, dtype='float64')
tfidf = tf * idf
# view raw tfidf feature matrix
print('Raw TF-IDF feature matrix\n', pd.DataFrame(np.round(tfidf, 2), columns=feature_names), '\n')

# computer l2 norms
from numpy.linalg import norm
norms = norm(tfidf, axis=1)

# print norms for each document
print('Norms:\n', np.round(norms, 3), '\n')

# compute normalized tfidf
norm_tfidf = tfidf / norms[:, None]

# show final tfidf feature matrix
print('Final TF-DF feature matrix:\n',  pd.DataFrame(np.round(norm_tfidf, 2), columns=feature_names), '\n')

# Extracting Features for New Documents - page 220
new_doc = 'the sky is green today'
print('New doc features:\n', pd.DataFrame(np.round(tv.transform([new_doc]).toarray(), 2), columns=tv.get_feature_names()), '\n')


Diagonal matrix:
       0     1     2    3    4    5    6     7     8    9    10   11   12   13  \
0   1.81  0.00  0.00  0.0  0.0  0.0  0.0  0.00  0.00  0.0  0.0  0.0  0.0  0.0   
1   0.00  1.59  0.00  0.0  0.0  0.0  0.0  0.00  0.00  0.0  0.0  0.0  0.0  0.0   
2   0.00  0.00  1.81  0.0  0.0  0.0  0.0  0.00  0.00  0.0  0.0  0.0  0.0  0.0   
3   0.00  0.00  0.00  2.1  0.0  0.0  0.0  0.00  0.00  0.0  0.0  0.0  0.0  0.0   
4   0.00  0.00  0.00  0.0  2.1  0.0  0.0  0.00  0.00  0.0  0.0  0.0  0.0  0.0   
5   0.00  0.00  0.00  0.0  0.0  2.5  0.0  0.00  0.00  0.0  0.0  0.0  0.0  0.0   
6   0.00  0.00  0.00  0.0  0.0  0.0  2.5  0.00  0.00  0.0  0.0  0.0  0.0  0.0   
7   0.00  0.00  0.00  0.0  0.0  0.0  0.0  1.81  0.00  0.0  0.0  0.0  0.0  0.0   
8   0.00  0.00  0.00  0.0  0.0  0.0  0.0  0.00  1.81  0.0  0.0  0.0  0.0  0.0   
9   0.00  0.00  0.00  0.0  0.0  0.0  0.0  0.00  0.00  2.5  0.0  0.0  0.0  0.0   
10  0.00  0.00  0.00  0.0  0.0  0.0  0.0  0.00  0.00  0.0  2.1  0.0  0.0  0.0   
11  0.00  

## Document Similarity

In [14]:
import os
import numpy as np
import pandas as pd

# load the tv_matrix save in the last file
tv_matrix = np.load('tv_matrix.npy')

# Document Similarity - staring on page 221
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
print('Similarity matrix DF:\n', similarity_df, '\n')

Similarity matrix DF:
           0         1         2         3         4         5         6  \
0  1.000000  0.820599  0.000000  0.000000  0.000000  0.192353  0.817246   
1  0.820599  1.000000  0.000000  0.000000  0.218401  0.157845  0.670631   
2  0.000000  0.000000  1.000000  0.000000  0.000000  0.791821  0.000000   
3  0.000000  0.000000  0.000000  1.000000  0.360407  0.000000  0.000000   
4  0.000000  0.218401  0.000000  0.360407  1.000000  0.000000  0.000000   
5  0.192353  0.157845  0.791821  0.000000  0.000000  1.000000  0.115488   
6  0.817246  0.670631  0.000000  0.000000  0.000000  0.115488  1.000000   
7  0.000000  0.000000  0.850516  0.000000  0.000000  0.930989  0.000000   

          7  
0  0.000000  
1  0.000000  
2  0.850516  
3  0.000000  
4  0.000000  
5  0.930989  
6  0.000000  
7  1.000000   

