In [20]:
import nltk

# Toy training corpus: the four documents passed to the extractors below.
CORPUS = [
'the sky is blue',
'sky is blue and sky is beautiful',
'the beautiful sky is so blue',
'i love blue cheese'
]

# A single unseen document, used to demonstrate transforming new text.
new_doc = ['loving this blue sky today']

from sklearn.feature_extraction.text import CountVectorizer

# Setting ngram_range to a wider range such as (1, 3) would build
# feature vectors consisting of all unigrams, bigrams, and trigrams.
def bow_extractor(corpus, ngram_range=(1,1)):
    """Fit a bag-of-words model on `corpus`.

    Returns a pair: the fitted CountVectorizer and the document-term
    count matrix it produced for `corpus`.
    """
    # min_df=1: minimum document frequency of 1
    count_vectorizer = CountVectorizer(ngram_range=ngram_range, min_df=1)
    term_counts = count_vectorizer.fit_transform(corpus)
    return count_vectorizer, term_counts

# Fit the bag-of-words model on the training corpus.
bow_vectorizer, bow_features = bow_extractor(CORPUS)
# Densify the count matrix so it prints as a full table.
features = bow_features.todense()
print (features)
    

[[0 0 1 0 1 0 1 0 1]
 [1 1 1 0 2 0 2 0 0]
 [0 1 1 0 1 0 1 1 1]
 [0 0 1 1 0 1 0 0 0]]


In [21]:
# Vectorize the unseen document with the vectorizer fitted on CORPUS.
new_doc_features = bow_vectorizer.transform(new_doc)
# toarray() yields a plain ndarray. todense() would return an np.matrix, which
# recent scikit-learn releases reject as estimator input (this value is later
# passed to tfidf_trans.transform).
new_doc_features = new_doc_features.toarray()
print (new_doc_features)

[[0 0 1 0 0 0 1 0 0]]


In [22]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases;
# list() keeps feature_names a plain Python list in both cases.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = list(bow_vectorizer.get_feature_names_out())
else:
    feature_names = bow_vectorizer.get_feature_names()
print (feature_names)

['and', 'beautiful', 'blue', 'cheese', 'is', 'love', 'sky', 'so', 'the']


In [23]:
# Function to display features as a data frame
import pandas as pd
def display_features(features, feature_names):
    """Print `features` as a table with one column per entry of `feature_names`."""
    table = pd.DataFrame(features, columns=feature_names)
    print(table)

# Show the bag-of-words counts of the training corpus as a labelled table.
display_features(features, feature_names)


   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   1     0    1   0    1
1    1          1     1       0   2     0    2   0    0
2    0          1     1       0   1     0    1   1    1
3    0          0     1       1   0     1    0   0    0


The Bag of Words model is useful, but its vectors are based entirely on the absolute
frequencies of word occurrences. This can be a problem: words that occur often across
all documents in the corpus get higher frequencies and tend to overshadow words that
occur less frequently but may be more interesting and effective as features for
identifying specific categories of documents. This is where TF-IDF comes into the
picture.

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
    """Fit a TF-IDF model on a bag-of-words count matrix.

    Returns a pair: the fitted TfidfTransformer (configured with norm='l2',
    smooth_idf=True, use_idf=True) and the TF-IDF matrix for `bow_matrix`.
    """
    tfidf_settings = dict(norm='l2', smooth_idf=True, use_idf=True)
    tfidf_model = TfidfTransformer(**tfidf_settings)
    weighted = tfidf_model.fit_transform(bow_matrix)
    return tfidf_model, weighted

import numpy as np
#from feature_extractors import tfidf_transformer
# Column labels for the tables below. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so use get_feature_names_out() whenever
# it exists and fall back only on older releases.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = list(bow_vectorizer.get_feature_names_out())
else:
    feature_names = bow_vectorizer.get_feature_names()
# build tfidf transformer and show train corpus tfidf features
tfidf_trans, tdidf_features = tfidf_transformer(bow_features)
features = np.round(tdidf_features.todense(), 2)
display_features(features, feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [25]:
# Weight the new document's counts with the transformer fitted on the corpus.
# np.asarray() guards against new_doc_features being an np.matrix (what
# todense() returns), which recent scikit-learn releases reject as input.
nd_tfidf = tfidf_trans.transform(np.asarray(new_doc_features))
# toarray() gives a plain ndarray for display instead of an np.matrix.
nd_features = np.round(nd_tfidf.toarray(), 2)
display_features(nd_features, feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


# The Maths behind TF-IDF

In [26]:
import scipy.sparse as sp
from numpy.linalg import norm
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = list(bow_vectorizer.get_feature_names_out())
else:
    feature_names = bow_vectorizer.get_feature_names()

In [27]:
# term frequencies: the dense bag-of-words counts as a float64 array
tf = np.array(bow_features.todense(), dtype='float64')
# display the term-frequency table
display_features(tf, feature_names)


   and  beautiful  blue  cheese   is  love  sky   so  the
0  0.0        0.0   1.0     0.0  1.0   0.0  1.0  0.0  1.0
1  1.0        1.0   1.0     0.0  2.0   0.0  2.0  0.0  0.0
2  0.0        1.0   1.0     0.0  1.0   0.0  1.0  1.0  1.0
3  0.0        0.0   1.0     1.0  0.0   1.0  0.0  0.0  0.0


In [28]:
# Document frequency of each term. In CSC format, consecutive differences of
# indptr give the number of stored entries per column, i.e. (presumably, as
# long as no explicit zeros are stored) the number of documents that contain
# the term.
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
df = 1 + df # to smoothen idf later

display_features([df], feature_names)

   and  beautiful  blue  cheese  is  love  sky  so  the
0    2          3     5       2   4     2    4   2    3


In [29]:
# compute inverse document frequencies
# idf = 1 + ln(total_docs / df); total_docs carries the same +1 smoothing
# that was already added to df.
total_docs = 1 + len(CORPUS)
idf = 1.0 + np.log(float(total_docs) / df)

display_features([np.round(idf, 2)], feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  1.92       1.51   1.0    1.92  1.22  1.92  1.22  1.92  1.51


In [30]:
# compute idf diagonal matrix
total_features = bow_features.shape[1]
# Place the idf values on the main diagonal of a square sparse matrix.
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
# NOTE(review): this rebinds `idf` from the 1-D idf vector to a dense 2-D
# matrix, so re-running this cell without first re-running the cell that
# computes the vector feeds the 2-D matrix back into spdiags.
idf = idf_diag.todense()

print (np.round(idf, 2))

[[1.92 0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   1.51 0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   1.92 0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   1.22 0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   1.92 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   1.22 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   1.92 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   1.51]]


In [31]:
# compute tfidf feature matrix
# Use `@` rather than `*`: `*` is a matrix product only when an operand is an
# np.matrix and is element-wise for plain ndarrays, whereas `@` is always a
# matrix product. np.asarray() keeps the result a plain ndarray.
tfidf = np.asarray(tf @ idf)
# show tfidf feature matrix
display_features(np.round(tfidf, 2), feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00   1.0    0.00  1.22  0.00  1.22  0.00  1.51
1  1.92       1.51   1.0    0.00  2.45  0.00  2.45  0.00  0.00
2  0.00       1.51   1.0    0.00  1.22  0.00  1.22  1.92  1.51
3  0.00       0.00   1.0    1.92  0.00  1.92  0.00  0.00  0.00


In [32]:
# compute L2 norms
# axis=1 yields one norm per row of tfidf, i.e. one per document
norms = norm(tfidf, axis=1)
# print norms for each document
print("L2 Normal form",np.round(norms, 2))

L2 Normal form [2.5  4.35 3.5  2.89]


In [33]:
# compute normalized tfidf
# norms[:, None] reshapes the norms into a column so that each row of tfidf
# is divided by its own norm
norm_tfidf = tfidf / norms[:, None]
# show final tfidf feature matrix
display_features(np.round(norm_tfidf, 2), feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [34]:
# term frequencies of the new document: its bag-of-words counts as float64
nd_tf = np.array(new_doc_features, dtype='float64')


In [35]:
# compute tfidf using idf matrix from train corpus
# Use `@` rather than `*`: `*` is a matrix product only when an operand is an
# np.matrix and is element-wise for plain ndarrays, whereas `@` is always a
# matrix product. np.asarray() keeps the result a plain ndarray.
nd_tfidf = np.asarray(nd_tf @ idf)
nd_norms = norm(nd_tfidf, axis=1)
# A document containing no known terms has norm 0; divide by 1 instead so its
# row stays all zeros rather than turning into NaN.
safe_nd_norms = np.where(nd_norms == 0, 1.0, nd_norms)
norm_nd_tfidf = nd_tfidf / safe_nd_norms[:, None]
# show new_doc tfidf feature vector
display_features(np.round(norm_nd_tfidf, 2), feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


In [36]:
# Implement a generic function that directly computes the tfidf-based feature vectors for documents from the
# raw documents themselves.

from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_extractor(corpus, ngram_range=(1,1)):
    """Fit a TF-IDF model directly on the documents in `corpus`.

    Returns a pair: the fitted TfidfVectorizer and the TF-IDF feature
    matrix it produced for `corpus`.
    """
    options = {
        'min_df': 1,
        'norm': 'l2',
        'smooth_idf': True,
        'use_idf': True,
        'ngram_range': ngram_range,
    }
    tfidf_model = TfidfVectorizer(**options)
    tfidf_matrix = tfidf_model.fit_transform(corpus)
    return tfidf_model, tfidf_matrix

# build tfidf vectorizer and get training corpus feature vectors
tfidf_vectorizer, tdidf_features = tfidf_extractor(CORPUS)
# NOTE(review): feature_names was taken from bow_vectorizer, not from
# tfidf_vectorizer; presumably both learn the same vocabulary from CORPUS — confirm.
display_features(np.round(tdidf_features.todense(), 2), feature_names)


    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [37]:
# get tfidf feature vector for the new document
# (uses the vectorizer already fitted on CORPUS; this rebinds nd_tfidf)
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), 2), feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


# Gensim

The word2vec framework is much faster than other neural network–based implementations
and does not require manual labels to create meaningful representations of words.

Internally, gensim's word2vec implementation constructs a vocabulary based on the input
text documents and learns vector representations for words based on various techniques
mentioned earlier. Once this is complete, it builds a model that can be used to
extract word vectors for each word in a document. Using techniques such as average
weighting or tfidf weighting, we can then compute an averaged vector representation of
a document from its word vectors.

In [38]:
from gensim.models import word2vec

# tokenize sentences in corpus
TOKENIZED_CORPUS = [nltk.word_tokenize(sentence)
                    for sentence in CORPUS]
tokenized_new_doc = [nltk.word_tokenize(sentence)
                    for sentence in new_doc]

# Set values for various parameters
feature_size = 10    # number of word-vector dimensions
window_context = 10  # length of the window of words that should be considered for the algorithm
min_word_count = 2   # Minimum word count for the word to be considered. Helps in removing
                     # very specific words because they occur very rarely in the documents
sample = 1e-3        # Downsample setting for frequent words

# NOTE(review): the `size` keyword was renamed `vector_size` in gensim 4.0, so
# this call appears to target gensim < 4 — confirm the installed version.
# NOTE(review): no seed is passed here, so the learned vectors (and every
# output derived from them) may differ between runs.
model = word2vec.Word2Vec(TOKENIZED_CORPUS, size=feature_size,
                          window=window_context, min_count=min_word_count,
                          sample=sample)

# view similar words based on gensim's model
# (for each search term, keep only the word from each (word, score) pair)
similar_words = {search_term: [item[0] for item in model.wv.most_similar([search_term], topn=5)]
                  for search_term in ['sky', 'blue']}
similar_words

{'sky': ['blue', 'beautiful', 'is', 'the'],
 'blue': ['sky', 'is', 'beautiful', 'the']}

In [40]:
from feature_extractors import averaged_word_vectorizer


# Document vectors from the project helper averaged_word_vectorizer
# (presumably the mean of each document's word vectors — see feature_extractors).
# num_features is tied to feature_size instead of a repeated literal 10, so it
# cannot drift from the dimensionality the model was trained with.
avg_word_vec_features = averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,
                                                 model=model,
                                                 num_features=feature_size)
print (np.round(avg_word_vec_features, 3))

[[-0.023  0.021  0.004  0.004  0.02  -0.02  -0.007  0.005  0.018 -0.008]
 [-0.012  0.025  0.008 -0.001  0.025 -0.014  0.004  0.022  0.04  -0.011]
 [-0.024  0.023  0.01   0.004  0.014 -0.011 -0.002  0.014  0.023 -0.008]
 [-0.046  0.026 -0.034 -0.041  0.026 -0.028 -0.036  0.031  0.035  0.012]]


In [41]:
# Same helper applied to the tokenized new document.
# num_features is tied to feature_size instead of a repeated literal 10, so it
# cannot drift from the dimensionality the model was trained with.
nd_avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_new_doc,
                                                    model=model,
                                                    num_features=feature_size)
print (np.round(nd_avg_word_vec_features, 3))

[[-0.001  0.019 -0.025 -0.018  0.037 -0.027 -0.016  0.038  0.035 -0.005]]


In [42]:
from feature_extractors import tfidf_weighted_averaged_word_vectorizer

# TF-IDF matrix and term -> column-index mapping from the fitted tfidf_vectorizer.
corpus_tfidf = tdidf_features
vocab = tfidf_vectorizer.vocabulary_
# Document vectors from the project helper tfidf_weighted_averaged_word_vectorizer
# (presumably word vectors averaged with TF-IDF weights — see feature_extractors).
# num_features is tied to feature_size instead of a repeated literal 10, so it
# cannot drift from the dimensionality the model was trained with.
wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,
                                                                     tfidf_vectors=corpus_tfidf,
                                                                     tfidf_vocabulary=vocab,
                                                                     model=model,
                                                                     num_features=feature_size)
print (np.round(wt_tfidf_word_vec_features, 3))

[[-0.023  0.02   0.007  0.008  0.018 -0.02  -0.007  0.001  0.014 -0.009]
 [-0.007  0.024  0.011  0.003  0.027 -0.015  0.008  0.019  0.04  -0.014]
 [-0.024  0.023  0.014  0.008  0.012 -0.009 -0.001  0.013  0.02  -0.008]
 [-0.046  0.026 -0.034 -0.041  0.026 -0.028 -0.036  0.031  0.035  0.012]]


In [43]:
# Same helper applied to the tokenized new document and its TF-IDF vector.
# num_features is tied to feature_size instead of a repeated literal 10, so it
# cannot drift from the dimensionality the model was trained with.
nd_wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_new_doc,
                                                                     tfidf_vectors=nd_tfidf,
                                                                     tfidf_vocabulary=vocab,
                                                                     model=model,
                                                                     num_features=feature_size)
print (np.round(nd_wt_tfidf_word_vec_features, 3) )

[[ 0.003  0.018 -0.024 -0.015  0.038 -0.027 -0.014  0.039  0.034 -0.007]]
