In [1]:
from collections import Counter
import numpy as np

## Compile Documents

In [2]:
# Four toy documents that make up the example corpus for this notebook.
doc1 = 'Wise people think they are foolish'
doc2 = 'Foolish foolish people think they are wise wise'
doc3 = 'I am definitely wise so this irritates me'
doc4 = 'Trump is for sure like definitely foolish'

## Create Corpus

In [3]:
# Corpus as an ordered list; the per-document vectors built later follow
# this same order.
documents = [doc1, doc2, doc3, doc4]

## Tokenize and Lowercase

In [4]:
from nltk.tokenize import word_tokenize
# Lower-case each document before tokenizing so that 'Wise' and 'wise'
# end up as the same token.
tokenized = [word_tokenize(doc.lower()) for doc in documents]
tokenized

[['wise', 'people', 'think', 'they', 'are', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'they', 'are', 'wise', 'wise'],
 ['i', 'am', 'definitely', 'wise', 'so', 'this', 'irritates', 'me'],
 ['trump', 'is', 'for', 'sure', 'like', 'definitely', 'foolish']]

In [5]:
# NOTE(review): repeats the tokenization already performed in the previous
# cell; it only reads `documents`, so re-running it yields the same result.
tokenized = [word_tokenize(doc.lower()) for doc in documents]

In [6]:
tokenized

[['wise', 'people', 'think', 'they', 'are', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'they', 'are', 'wise', 'wise'],
 ['i', 'am', 'definitely', 'wise', 'so', 'this', 'irritates', 'me'],
 ['trump', 'is', 'for', 'sure', 'like', 'definitely', 'foolish']]

## Remove Stop Words

In [7]:
from nltk.corpus import stopwords

# Hold the stop words in a set so each membership test below is a hash lookup.
stop = set(stopwords.words('english'))

# Keep only the tokens that are not English stop words.
# Stemming and lemmatization are done in their own section further down;
# the verbatim copy of that code that used to sit in this cell was redundant.
docs = [[word for word in words if word not in stop]
        for words in tokenized]

In [8]:
# NOTE(review): same stop-word filter as in the previous cell; it only reads
# `tokenized` and `stop`, so re-running it yields the same `docs`.
docs = [[word for word in words if word not in stop] 
        for words in tokenized]

In [9]:
docs

[['wise', 'people', 'think', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'wise', 'wise'],
 ['definitely', 'wise', 'irritates'],
 ['trump', 'sure', 'like', 'definitely', 'foolish']]

## Stemming and Lemmatization

In [10]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# One instance of each normalizer, shared by the two comprehensions below.
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

# Stem every remaining token of every document with the Porter stemmer.
docs_stem = [[porter.stem(token) for token in tokens] for tokens in docs]

# Lemmatize every remaining token of every document with WordNet.
docs_lemma = [[wordnet.lemmatize(token) for token in tokens] for tokens in docs]

In [11]:
# Contrast the two normalizers on an irregular plural: as the output shows,
# the stemmer leaves 'mice' unchanged while the lemmatizer returns 'mouse'.
print(porter.stem('mice'))
print(wordnet.lemmatize('mice'))

mice
mouse


In [12]:
docs_stem

[['wise', u'peopl', 'think', 'foolish'],
 ['foolish', 'foolish', u'peopl', 'think', 'wise', 'wise'],
 [u'definit', 'wise', u'irrit'],
 ['trump', 'sure', 'like', u'definit', 'foolish']]

In [13]:
docs_lemma

[['wise', 'people', 'think', 'foolish'],
 ['foolish', 'foolish', 'people', 'think', 'wise', 'wise'],
 ['definitely', 'wise', 'irritates'],
 ['trump', 'sure', 'like', 'definitely', 'foolish']]

## Vocabulary for our Corpus

In [14]:
# Flatten the lemmatized documents into one token list; duplicates are
# still present at this point.
vocabulary = []
for doc in docs_lemma:
    vocabulary.extend(doc)

In [15]:
vocabulary

['wise',
 'people',
 'think',
 'foolish',
 'foolish',
 'foolish',
 'people',
 'think',
 'wise',
 'wise',
 'definitely',
 'wise',
 'irritates',
 'trump',
 'sure',
 'like',
 'definitely',
 'foolish']

In [16]:
# De-duplicate the tokens and sort them so the feature order is deterministic.
vocabulary = sorted(set(vocabulary))

In [17]:
print('Vocabulary (features):', vocabulary)

('Vocabulary (features):', ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise'])


## Bag of Words

In [18]:
from collections import Counter

In [19]:
 # Counter tallies how often each item occurs; here both items occur once.
 c = Counter(['eggs', 'ham'])

In [20]:
c

Counter({'eggs': 1, 'ham': 1})

In [21]:
def bow_vectorize(doc, vocabulary):
    """Return the bag-of-words count vector of a tokenized document.

    Parameters
    ----------
    doc : iterable of hashable tokens
        The document, already tokenized.
    vocabulary : sequence
        Ordered feature names; element ``i`` of the result is the number of
        times ``vocabulary[i]`` occurs in ``doc``.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(vocabulary)``. Tokens of ``doc`` that are
        not in ``vocabulary`` are ignored.
    """
    counts = Counter(doc)
    # A Counter reports 0 for tokens it has never seen, so no membership
    # test is needed before the lookup.
    return np.array([counts[term] for term in vocabulary], dtype=float)

In [22]:
# One bag-of-words vector per lemmatized document, in corpus order.
bow_matrix = [bow_vectorize(doc, vocabulary) for doc in docs_lemma]


In [23]:
print('features:',vocabulary)
# Show each lemmatized document next to its count vector.
for lemmas, counts in zip(docs_lemma, bow_matrix):
    print('"%s":'% lemmas, '\n', counts, '\n')

print('feature matrix:')
print(bow_matrix)

('features:', ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise'])
('"[\'wise\', \'people\', \'think\', \'foolish\']":', '\n', array([0., 1., 0., 0., 1., 0., 1., 0., 1.]), '\n')
('"[\'foolish\', \'foolish\', \'people\', \'think\', \'wise\', \'wise\']":', '\n', array([0., 2., 0., 0., 1., 0., 1., 0., 2.]), '\n')
('"[\'definitely\', \'wise\', \'irritates\']":', '\n', array([1., 0., 1., 0., 0., 0., 0., 0., 1.]), '\n')
('"[\'trump\', \'sure\', \'like\', \'definitely\', \'foolish\']":', '\n', array([1., 1., 0., 1., 0., 1., 0., 1., 0.]), '\n')
feature matrix:
[array([0., 1., 0., 0., 1., 0., 1., 0., 1.]), array([0., 2., 0., 0., 1., 0., 1., 0., 2.]), array([1., 0., 1., 0., 0., 0., 0., 0., 1.]), array([1., 1., 0., 1., 0., 1., 0., 1., 0.])]


### Bag of Words with CountVectorizer

In [24]:
def lemmatize(doc):
    """Lower-case and tokenize a raw document string, then lemmatize each token.

    Returns a list with one WordNet lemma per token, in token order.
    """
    lemmas = []
    for token in word_tokenize(doc.lower()):
        lemmas.append(wordnet.lemmatize(token))
    return lemmas


In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Reuse the hand-built vocabulary and the lemmatizing tokenizer so the
# result can be compared column-for-column with bow_vectorize.
count_vectorizer = CountVectorizer(stop_words=stopwords.words('english'),
                                   vocabulary=vocabulary,
                                   tokenizer=lemmatize)

# Vectorize only the first document for a side-by-side comparison.
feature_matrix = count_vectorizer.fit_transform([doc1])

In [26]:
feature_matrix.toarray()

array([[0, 1, 0, 0, 1, 0, 1, 0, 1]])

In [27]:
# Side-by-side check of the sklearn vectorizer against bow_vectorize for doc1.
print('Vectorize:',doc1)
print('Lemmatized:',docs_lemma[0])
print('Features:', vocabulary)
print('\n')
# feature_matrix is a sparse matrix, so this prints its sparse representation
# rather than a dense row of counts.
print('sklearn result',feature_matrix)
print('our result',bow_vectorize(docs_lemma[0], vocabulary))
print('\n')
print('feature matrix')
# Vectorize the whole corpus and densify it for display.
print(count_vectorizer.fit_transform(documents).todense())

('Vectorize:', 'Wise people think they are foolish')
('Lemmatized:', ['wise', 'people', 'think', 'foolish'])
('Features:', ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise'])


('sklearn result', <1x9 sparse matrix of type '<type 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>)
('our result', array([0., 1., 0., 0., 1., 0., 1., 0., 1.]))


feature matrix
[[0 1 0 0 1 0 1 0 1]
 [0 2 0 0 1 0 1 0 2]
 [1 0 1 0 0 0 0 0 1]
 [1 1 0 1 0 1 0 1 0]]


## Term Frequency (Tf)

In [28]:
def tf_vectorize(doc, vocabulary):
    """Return the term-frequency vector of a tokenized document.

    Each bag-of-words count is divided by the number of tokens in ``doc``.

    Parameters
    ----------
    doc : sequence of tokens
        The document, already tokenized; its length is the normalizer.
    vocabulary : sequence
        Ordered feature names defining the columns of the result.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(vocabulary)``. An empty ``doc`` yields
        an all-zero vector instead of the NaNs that 0/0 would produce.
    """
    bow_vector = bow_vectorize(doc, vocabulary)
    n_tokens = len(doc)
    if n_tokens == 0:
        # No tokens left (e.g. every word was a stop word): the counts are
        # already all zero, so return them rather than dividing by zero.
        return bow_vector
    return bow_vector / n_tokens

In [29]:
# One term-frequency vector per lemmatized document, in corpus order.
tf_matrix = [tf_vectorize(doc, vocabulary) for doc in docs_lemma]

In [30]:
print('features:', vocabulary)

# Show each lemmatized document next to its term-frequency vector.
for lemmas, frequencies in zip(docs_lemma, tf_matrix):
    print('"%s":'%lemmas, '\n', frequencies, '\n')

('features:', ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise'])
('"[\'wise\', \'people\', \'think\', \'foolish\']":', '\n', array([0.  , 0.25, 0.  , 0.  , 0.25, 0.  , 0.25, 0.  , 0.25]), '\n')
('"[\'foolish\', \'foolish\', \'people\', \'think\', \'wise\', \'wise\']":', '\n', array([0.        , 0.33333333, 0.        , 0.        , 0.16666667,
       0.        , 0.16666667, 0.        , 0.33333333]), '\n')
('"[\'definitely\', \'wise\', \'irritates\']":', '\n', array([0.33333333, 0.        , 0.33333333, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.33333333]), '\n')
('"[\'trump\', \'sure\', \'like\', \'definitely\', \'foolish\']":', '\n', array([0.2, 0.2, 0. , 0.2, 0. , 0.2, 0. , 0.2, 0. ]), '\n')


## Term Frequency–Inverse Document Frequency (Tf-Idf)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): unlike count_vectorizer, no tokenizer=lemmatize is passed
# here, so the raw documents go through the vectorizer's default
# tokenization -- confirm that this difference is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'),
                                   vocabulary=vocabulary)
# Fit on the whole corpus and keep a dense copy, one row per document.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents).todense()

In [32]:
tfidf_matrix

matrix([[0.        , 0.44493104, 0.        , 0.        , 0.54957835,
         0.        , 0.54957835, 0.        , 0.44493104],
        [0.        , 0.60161783, 0.        , 0.        , 0.37155886,
         0.        , 0.37155886, 0.        , 0.60161783],
        [0.55349232, 0.        , 0.70203482, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.44809973],
        [0.39278432, 0.31799276, 0.        , 0.49819711, 0.        ,
         0.49819711, 0.        , 0.49819711, 0.        ]])

In [33]:
print('features:',vocabulary)

# Show each lemmatized document next to its TF-IDF row.
for lemmas, weights in zip(docs_lemma, tfidf_matrix):
    print('"%s":'%lemmas, '\n', weights, '\n')

('features:', ['definitely', 'foolish', 'irritates', 'like', 'people', 'sure', 'think', 'trump', 'wise'])
('"[\'wise\', \'people\', \'think\', \'foolish\']":', '\n', matrix([[0.        , 0.44493104, 0.        , 0.        , 0.54957835,
         0.        , 0.54957835, 0.        , 0.44493104]]), '\n')
('"[\'foolish\', \'foolish\', \'people\', \'think\', \'wise\', \'wise\']":', '\n', matrix([[0.        , 0.60161783, 0.        , 0.        , 0.37155886,
         0.        , 0.37155886, 0.        , 0.60161783]]), '\n')
('"[\'definitely\', \'wise\', \'irritates\']":', '\n', matrix([[0.55349232, 0.        , 0.70203482, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.44809973]]), '\n')
('"[\'trump\', \'sure\', \'like\', \'definitely\', \'foolish\']":', '\n', matrix([[0.39278432, 0.31799276, 0.        , 0.49819711, 0.        ,
         0.49819711, 0.        , 0.49819711, 0.        ]]), '\n')


## Euclidean Distance Comparison

In [34]:
from sklearn.metrics.pairwise import euclidean_distances

In [35]:
bow_matrix[0]

array([0., 1., 0., 0., 1., 0., 1., 0., 1.])

In [36]:
bow_matrix[1]

array([0., 2., 0., 0., 1., 0., 1., 0., 2.])

In [37]:
# Euclidean distance between the raw count vectors of doc1 and doc2.
# reshape(1, -1) turns each 1-D vector into a single-row 2-D array.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(bow_matrix[0].reshape(1, -1), bow_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[1.41421356]]


In [38]:
# Euclidean distance between the term-frequency vectors of doc1 and doc2
# (counts divided by document length).
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(tf_matrix[0].reshape(1, -1), tf_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.16666667]]


In [39]:
# Euclidean distance between the TF-IDF rows of doc1 and doc2.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(euclidean_distances(tfidf_matrix[0].reshape(1, -1), tfidf_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.33538543]]


## Cosine Similarity Comparison

In [40]:
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
# Cosine similarity between the raw count vectors of doc1 and doc2.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(bow_matrix[0].reshape(1, -1), bow_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.9486833]]


In [42]:
# Cosine similarity between the term-frequency vectors of doc1 and doc2.
# The result equals the raw-count one: each TF vector is its count vector
# divided by the document length, and cosine similarity ignores that scaling.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(tf_matrix[0].reshape(1, -1), tf_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.9486833]]


In [43]:
# Cosine similarity between the TF-IDF rows of doc1 and doc2.
print('Compare "%s" \nwith "%s"'%(doc1, doc2))
print(cosine_similarity(tfidf_matrix[0].reshape(1, -1), tfidf_matrix[1].reshape(1, -1)))

Compare "Wise people think they are foolish" 
with "Foolish foolish people think they are wise wise"
[[0.94375831]]


# Search Engine Query Example

In [44]:
# Free-text query to be matched against the corpus.
query = 'The foolish Trump'

In [45]:
# Use transform (not fit_transform) so the query is projected with the
# vectorizer that was already fitted on `documents`.
query_vectorized = tfidf_vectorizer.transform([query]).todense()
print("Query:", query)
print("Vectorized query:", query_vectorized)

('Query:', 'The foolish Trump')
('Vectorized query:', matrix([[0.        , 0.53802897, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.84292635, 0.        ]]))


In [46]:
# Score every document against the query using its TF-IDF row; a higher
# cosine similarity means a closer match.
for doc, tf_doc in zip(documents, tfidf_matrix):
    print(doc, cosine_similarity(query_vectorized.reshape(1, -1), tf_doc.reshape(1, -1)))

('Wise people think they are foolish', array([[0.23938579]]))
('Foolish foolish people think they are wise wise', array([[0.32368782]]))
('I am definitely wise so this irritates me', array([[0.]]))
('Trump is for sure like definitely foolish', array([[0.59103279]]))


## A Final Cosine Similarity Comparison

In [47]:
# Compare the first document with each of the remaining ones, under both
# the TF and the TF-IDF weighting.
for other in range(1, len(documents)):
    print('"%s" compared with "%s"'%(documents[0], documents[other]))
    print('TF cosine similarity:', cosine_similarity(tf_matrix[0].reshape(1, -1),
                                                     tf_matrix[other].reshape(1, -1)))
    print('TF-IDF cosine similarity:', cosine_similarity(tfidf_matrix[0].reshape(1, -1),
                                                         tfidf_matrix[other].reshape(1, -1)))
    

"Wise people think they are foolish" compared with "Foolish foolish people think they are wise wise"
('TF cosine similarity:', array([[0.9486833]]))
('TF-IDF cosine similarity:', array([[0.94375831]]))
"Wise people think they are foolish" compared with "I am definitely wise so this irritates me"
('TF cosine similarity:', array([[0.28867513]]))
('TF-IDF cosine similarity:', array([[0.19937348]]))
"Wise people think they are foolish" compared with "Trump is for sure like definitely foolish"
('TF cosine similarity:', array([[0.2236068]]))
('TF-IDF cosine similarity:', array([[0.14148485]]))
