## Glossary

- bag of words: Vectors of word counts or frequencies.
- bag of n-grams: Counts of word pairs (bigrams), triplets (trigrams), and so on.
- tf-idf vectors: Word scores that better represent their importance

In [216]:
import nltk
import copy
import math

from collections import Counter, OrderedDict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from nltk.tokenize import TreebankWordTokenizer
from nlpia.data.loaders import kite_text, kite_history

In [217]:
tokenizer = TreebankWordTokenizer()

sentence = 'The faster Harry got to the store, the faster Harry, the faster, would go home.'
tokens = tokenizer.tokenize(sentence)
tokens

['The',
 'faster',
 'Harry',
 'got',
 'to',
 'the',
 'store',
 ',',
 'the',
 'faster',
 'Harry',
 ',',
 'the',
 'faster',
 ',',
 'would',
 'go',
 'home',
 '.']

In [218]:
bag_of_words = Counter(tokens)
bag_of_words.most_common(3)

[('faster', 3), ('the', 3), (',', 3)]

In [219]:
times_harry_appear = bag_of_words['Harry']
times_harry_appear

2

In [220]:
num_unique_words = len(bag_of_words)
num_unique_words

12

In [221]:
tf = times_harry_appear / num_unique_words
round(tf, 4)

0.1667

In [222]:
docs = ['The faster Harry got to the store, the faster and faster Harry would get home.',
        'Harry is hairy anf gaster than Jill.',
        'Jill is not as hairy as Harry.']

doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]
len(doc_tokens[0])

17

In [223]:
all_doc_tokens = sum(doc_tokens, [])
all_doc_tokens[:5], len(all_doc_tokens)

([',', '.', 'and', 'faster', 'faster'], 33)

In [224]:
lexicon = sorted(set(all_doc_tokens))
lexicon[:5], len(lexicon)

([',', '.', 'and', 'anf', 'as'], 20)

In [225]:
zero_vector = OrderedDict((token, 0) for token in lexicon)
document_tfidf_vectors = []

for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    
    for key, value in token_counts.items():
        docs_containing_key = 0
        for _doc in docs:
            if key in _doc:
                docs_containing_key += 1
        tf = value / len(lexicon)
        if docs_containing_key:
            idf = len(docs) / docs_containing_key
        else:
            idf = 0
        
        vec[key] = tf * idf # Why no math.log?
    document_tfidf_vectors.append(vec)

In [226]:
def cosine_sim(v_dict, w_dict):
    v = [val for val in v_dict.values()]
    w = [val for val in w_dict.values()]
    
    dot_prod = 0
    for v_i, w_i in zip(v, w):
        dot_prod += v_i * w_i
    
    # Calculate the magnitude.
    mag_1 = math.sqrt(sum([x * x for x in v]))
    mag_2 = math.sqrt(sum([x * x for x in w]))
    
    return dot_prod / (mag_1 * mag_2)

In [227]:
query = 'How long does it take to get to the store?'
query_vec = copy.copy(zero_vector)
tokens = tokenizer.tokenize(query.lower())
token_counts = Counter(tokens)

for key, value in token_counts.items():
    docs_containing_key = 0
    for _doc in docs:
        if key in _doc.lower():
            docs_containing_key += 1
    if docs_containing_key == 0:
        continue
    tf = value / len(tokens)
    idf = len(docs) / docs_containing_key
    
    query_vec[key] = tf * idf

In [228]:
for vectors in document_tfidf_vectors:
    print(cosine_sim(query_vec, vectors))

0.5163977794943223
0.0
0.0


In [229]:
vectorizer = TfidfVectorizer(min_df=1)
vectors = vectorizer.fit_transform(docs)

res = vectorizer.transform([query])
for i, vec in enumerate(vectors):
    # print('euclidean: {:.3f}'.format(euclidean_distances(res, vec)[0][0]))
    print(i, '{:.3f}'.format(cosine_similarity(res, vec)[0][0]))

0 0.515
1 0.000
2 0.000


# Kite Example

In [230]:
tokenizer = TreebankWordTokenizer()
stopwords = frozenset(nltk.corpus.stopwords.words('english'))

tokens = tokenizer.tokenize(kite_text.lower())
tokens = [word for word in tokens if word not in stopwords]
token_counts = Counter(tokens)

In [231]:
token_counts.most_common(5)

[('kite', 16), (',', 15), ('kites', 8), ('wing', 5), ('lift', 4)]

## Vectorizing

In [232]:
document_vectors = []

doc_length = len(tokens)
for key, value in token_counts.most_common():
    document_vectors.append(value / doc_length)
    
document_vectors[:3]

[0.07207207207207207, 0.06756756756756757, 0.036036036036036036]

## Topic Modelling

In [233]:
kite_intro = kite_text.lower()
intro_tokens = tokenizer.tokenize(kite_intro)
intro_total = len(intro_tokens)

kite_history = kite_history.lower()
history_tokens = tokenizer.tokenize(kite_history)
history_total = len(history_tokens)

In [234]:
intro_tf, history_tf = {}, {}

intro_counts = Counter(intro_tokens)
history_counts = Counter(history_tokens)

In [235]:
intro_tf['kite'] = intro_counts['kite'] / intro_total
f'Term Frequency of "kite" in intro is: {intro_tf["kite"]:.4f}'

'Term Frequency of "kite" in intro is: 0.0441'

In [236]:
history_tf['kite'] = history_counts['kite'] / history_total
f'Term Frequency of "kite" in history is: {history_tf["kite"]:.4f}'

'Term Frequency of "kite" in history is: 0.0202'

In [237]:
intro_tf['and'] = intro_counts['and'] / intro_total
f'Term Frequency of "and" in intro is: {intro_tf["and"]:.4f}'

'Term Frequency of "and" in intro is: 0.0275'

In [238]:
history_tf['and'] = history_counts['and'] / history_total
f'Term Frequency of "and" in history is: {history_tf["and"]:.4f}'

'Term Frequency of "and" in history is: 0.0303'