# Dictionaries

In [None]:
##import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

poses = {'n': 'noun', 's':'adj (s)', 'a':'adj', 'r':'adv'}

for synset in wn.synsets('good'):
  print('{} {}'.format(poses[synset.pos()], ', '.join([l.name() for l in synset.lemmas()])))


noun good
noun good, goodness
noun good, goodness
noun commodity, trade_good, good
adj good
adj (s) full, good
adj good
adj (s) estimable, good, honorable, respectable
adj (s) beneficial, good
adj (s) good
adj (s) good, just, upright
adj (s) adept, expert, good, practiced, proficient, skillful, skilful
adj (s) good
adj (s) dear, good, near
adj (s) dependable, good, safe, secure
adj (s) good, right, ripe
adj (s) good, well
adj (s) effective, good, in_effect, in_force
adj (s) good
adj (s) good, serious
adj (s) good, sound
adj (s) good, salutary
adj (s) good, honest
adj (s) good, undecomposed, unspoiled, unspoilt
adj (s) good
adv well, good
adv thoroughly, soundly, good


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Regular Expressions

In [None]:
import re

In [None]:

# Sample text
text = "Please contact us at support @example.com or sales@example.com."


In [None]:

# Regular expression pattern for email addresses
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'


In [None]:
# Find all matches in the text
matches = re.findall(pattern, text)

# Print the matches
print(matches)

['support@example.com', 'sales@example.com']


## Basic chatbot

In [None]:
 r = r"[^a-z]*([y]o|[h']?ello|ok|hey|(good[ ])?(morn[gin']{0,3}|"\
     r"afternoon|even[gin']{0,3}))[\s,;:]{1,3}([a-z]{1,20})"

In [None]:
re_greeting = re.compile(r, flags=re.IGNORECASE)

In [None]:
re_greeting.match('Hello Rosa')

<re.Match object; span=(0, 10), match='Hello Rosa'>

In [None]:
re_greeting.match('Hello Rosa').groups()

('Hello', None, None, 'Rosa')

In [None]:
re_greeting.match("Good morning Rosa").groups()

('Good morning', 'Good ', 'morning', 'Rosa')

In [None]:
re_greeting.match('Good evening Rosa Parks').groups()

('Good evening', 'Good ', 'evening', 'Rosa')

In [None]:
my_names = set(['rosa', 'rose', 'chatty', 'chatbot', 'bot', 'chatterbot', 'class'])

In [None]:
curt_names = set(['hal', 'you', 'u'])

In [None]:
greeter_name = 'siri'

In [None]:
match = re_greeting.match(input())

hey u


In [None]:
match.groups()

('hey', None, None, 'u')

In [None]:
if match:
  at_name  = match.groups()[-1]
  if at_name in curt_names:
    print("Good one.")
  elif at_name.lower() in my_names:
    print("Hi {}, how are you?".format(greeter_name))


Good one.


# Simple tokenizer

In [None]:
sentence = """
Many years later, as he faced the firing squad,
Colonel Aurelio Buendia was to remember that distant
afternoon that his father took him to discover ice.
"""

In [None]:
s1 = sentence.split()

['Many',
 'years',
 'later,',
 'as',
 'he',
 'faced',
 'the',
 'firing',
 'squad,',
 'Colonel',
 'Aurelio',
 'Buendia',
 'was',
 'to',
 'remember',
 'that',
 'distant',
 'afternoon',
 'that',
 'his',
 'father',
 'took',
 'him',
 'to',
 'discover',
 'ice.']

In [None]:
# texto de 500 palabras
# v1: 1x500

v1 = [1 for x in range(0, 500) if x in s1 else 0]

v1 = []
for x in range(0, 500):
  if x in s1:
    v1.append(1)
  else:
    v1.append(0)

### Problems:

- Punctuation included with words
- Repetitive words

In [None]:
# [good, bad, neutral]
# good = 0 0
# bad = 1 0
# neutral = 0 1

x1 = good
x2 = bad
x3 = neutral

x1 = [0 0]
x2 = [1 0]
x3 = [0 1]

## Let's create one-hot vectors

In [None]:
import numpy as np

In [None]:
token_sequence = str.split(sentence)

In [None]:
vocab = sorted(set(token_sequence))

In [None]:
vocab

['Aurelio',
 'Buendia',
 'Colonel',
 'Many',
 'afternoon',
 'as',
 'discover',
 'distant',
 'faced',
 'father',
 'firing',
 'he',
 'him',
 'his',
 'ice.',
 'later,',
 'remember',
 'squad,',
 'that',
 'the',
 'to',
 'took',
 'was',
 'years']

In [None]:
total_tokens = len(token_sequence)
vocab_size = len(vocab)

In [None]:
print(total_tokens, vocab_size)

26 24


In [None]:
onehot_vectors = np.zeros((total_tokens, vocab_size), int)

In [None]:
for i, word in enumerate(token_sequence):
  onehot_vectors[i, vocab.index(word)] = 1

In [None]:
print(sentence)


Many years later, as he faced the firing squad,
Colonel Aurelio Buendia was to remember that distant
afternoon that his father took him to discover ice.



In [None]:
onehot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 1, 0, 0, 0

In [None]:
import pandas as pd

df = pd.DataFrame(onehot_vectors, columns=vocab)
df

Unnamed: 0,Aurelio,Buendia,Colonel,Many,afternoon,as,discover,distant,faced,father,...,ice.,"later,",remember,"squad,",that,the,to,took,was,years
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df[df == 0] = ''

  df[df == 0] = ''


In [None]:
df

Unnamed: 0,Aurelio,Buendia,Colonel,Many,afternoon,as,discover,distant,faced,father,...,ice.,"later,",remember,"squad,",that,the,to,took,was,years
0,,,,1.0,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,1.0
2,,,,,,,,,,,...,,1.0,,,,,,,,
3,,,,,,1.0,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,1.0,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,1.0,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,1.0,,,,,,
9,,,1.0,,,,,,,,...,,,,,,,,,,


### Observations
 - one-hot vectors are very sparse
 - no loss of information, we could recover the sentence from the vector
 - English contains over 20,000 common words
 - creating a one-hot vector per sentence is not practical


## Bag of words


In [None]:
print(sentence)


Many years later, as he faced the firing squad,
Colonel Aurelio Buendia was to remember that distant
afternoon that his father took him to discover ice.



In [None]:
sentence_bow = {}
for token in sentence.split():
  sentence_bow[token] = 1
sorted(sentence_bow.items())


[('Aurelio', 1),
 ('Buendia', 1),
 ('Colonel', 1),
 ('Many', 1),
 ('afternoon', 1),
 ('as', 1),
 ('discover', 1),
 ('distant', 1),
 ('faced', 1),
 ('father', 1),
 ('firing', 1),
 ('he', 1),
 ('him', 1),
 ('his', 1),
 ('ice.', 1),
 ('later,', 1),
 ('remember', 1),
 ('squad,', 1),
 ('that', 1),
 ('the', 1),
 ('to', 1),
 ('took', 1),
 ('was', 1),
 ('years', 1)]

In [None]:
df = pd.DataFrame(pd.Series(dict([(token, 1) for token in
sentence.split()])), columns=['sent']).T

In [None]:
df

Unnamed: 0,Many,years,"later,",as,he,faced,the,firing,"squad,",Colonel,...,remember,that,distant,afternoon,his,father,took,him,discover,ice.
sent,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [None]:
sentences = """Many years later, as he faced the firing squad, \
Colonel Aurelio Buendia was to remember that distant \
afternoon that his father took him to discover ice.\n"""

sentences += """At that time Macondo was a village of twenty \
adobe houses, built on the bank of a river of clear water that \
ran along a bed of polished stones, which were white and enormous, \
like prehistoric eggs.\n"""

sentences += """The world was so recent that many things lacked \
names, and in order to indicate them it was necessary to point.\n"""

sentences += """Every year during the month of March a family of \
ragged gypsies would set up their tents near the village, \
and with a great uproar of pipes and kettledrums they would \
display new inventions."""

In [None]:
print(sentences)

Many years later, as he faced the firing squad, Colonel Aurelio Buendia was to remember that distant afternoon that his father took him to discover ice.
At that time Macondo was a village of twenty adobe houses, built on the bank of a river of clear water that ran along a bed of polished stones, which were white and enormous, like prehistoric eggs.
The world was so recent that many things lacked names, and in order to indicate them it was necessary to point.
Every year during the month of March a family of ragged gypsies would set up their tents near the village, and with a great uproar of pipes and kettledrums they would display new inventions.


In [None]:
sentences.split('\n')

['Many years later, as he faced the firing squad, Colonel Aurelio Buendia was to remember that distant afternoon that his father took him to discover ice.',
 'At that time Macondo was a village of twenty adobe houses, built on the bank of a river of clear water that ran along a bed of polished stones, which were white and enormous, like prehistoric eggs.',
 'The world was so recent that many things lacked names, and in order to indicate them it was necessary to point.',
 'Every year during the month of March a family of ragged gypsies would set up their tents near the village, and with a great uproar of pipes and kettledrums they would display new inventions.']

In [None]:
corpus = {}
for i, sent in enumerate(sentences.split('\n')):
  corpus['sent{}'.format(i)] = dict((tok, 1) for tok in sent.split())

In [None]:
corpus

{'sent0': {'Many': 1,
  'years': 1,
  'later,': 1,
  'as': 1,
  'he': 1,
  'faced': 1,
  'the': 1,
  'firing': 1,
  'squad,': 1,
  'Colonel': 1,
  'Aurelio': 1,
  'Buendia': 1,
  'was': 1,
  'to': 1,
  'remember': 1,
  'that': 1,
  'distant': 1,
  'afternoon': 1,
  'his': 1,
  'father': 1,
  'took': 1,
  'him': 1,
  'discover': 1,
  'ice.': 1},
 'sent1': {'At': 1,
  'that': 1,
  'time': 1,
  'Macondo': 1,
  'was': 1,
  'a': 1,
  'village': 1,
  'of': 1,
  'twenty': 1,
  'adobe': 1,
  'houses,': 1,
  'built': 1,
  'on': 1,
  'the': 1,
  'bank': 1,
  'river': 1,
  'clear': 1,
  'water': 1,
  'ran': 1,
  'along': 1,
  'bed': 1,
  'polished': 1,
  'stones,': 1,
  'which': 1,
  'were': 1,
  'white': 1,
  'and': 1,
  'enormous,': 1,
  'like': 1,
  'prehistoric': 1,
  'eggs.': 1},
 'sent2': {'The': 1,
  'world': 1,
  'was': 1,
  'so': 1,
  'recent': 1,
  'that': 1,
  'many': 1,
  'things': 1,
  'lacked': 1,
  'names,': 1,
  'and': 1,
  'in': 1,
  'order': 1,
  'to': 1,
  'indicate': 1,
  'the

In [None]:
df = pd.DataFrame.from_records(corpus).fillna(0).astype(int).T

In [None]:
df.shape

(4, 91)

In [None]:
df[df.columns[:25]]

Unnamed: 0,Many,years,"later,",as,he,faced,the,firing,"squad,",Colonel,...,that,distant,afternoon,his,father,took,him,discover,ice.,At
sent0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0
sent1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
sent2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
sent3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df.shape

(4, 91)

### How similar are the vectors?

- How can we determine it?


In [None]:
df.index

Index(['sent0', 'sent1', 'sent2', 'sent3'], dtype='object')

In [None]:
df.loc['sent0'].dot(df.loc['sent1'])

3

In [None]:
df.loc['sent1'].dot(df.loc['sent2'])

3

In [None]:
df.loc['sent2'].dot(df.loc['sent3'])

1

In [None]:
[(k, v) for (k, v) in (df.loc['sent0'] & df.loc['sent1']).items() if v]

[('the', 1), ('was', 1), ('that', 1)]

In [None]:
[(k, v) for (k, v) in (df.loc['sent1'] & df.loc['sent2']).items() if v]

[('was', 1), ('that', 1), ('and', 1)]

In [None]:
[(k, v) for (k, v) in (df.loc['sent2'] & df.loc['sent3']).items() if v]

[('and', 1)]

## N-grams

In [None]:
from nltk.util import ngrams
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(sentence)

In [None]:
tokens

['Many',
 'years',
 'later',
 ',',
 'as',
 'he',
 'faced',
 'the',
 'firing',
 'squad',
 ',',
 'Colonel',
 'Aurelio',
 'Buendia',
 'was',
 'to',
 'remember',
 'that',
 'distant',
 'afternoon',
 'that',
 'his',
 'father',
 'took',
 'him',
 'to',
 'discover',
 'ice',
 '.']

In [None]:
two_grams = list(ngrams(tokens,2))
[" ".join(x) for x in two_grams]

['Many years',
 'years later',
 'later ,',
 ', as',
 'as he',
 'he faced',
 'faced the',
 'the firing',
 'firing squad',
 'squad ,',
 ', Colonel',
 'Colonel Aurelio',
 'Aurelio Buendia',
 'Buendia was',
 'was to',
 'to remember',
 'remember that',
 'that distant',
 'distant afternoon',
 'afternoon that',
 'that his',
 'his father',
 'father took',
 'took him',
 'him to',
 'to discover',
 'discover ice',
 'ice .']

### Issues

- some n-grams are very rare
- rarity can make the vocabulary set grow quickly
- it might cause overfitting
- there are n-grams that don't contribute in a meaningful manner (e.g., at the, was to, as he, him to, that his)

## Stop words

In [None]:
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words_es = nltk.corpus.stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
print("Total English:", len(stop_words))
print("Total Spanish:", len(stop_words_es))

Total English: 179
Total Spanish: 313


In [None]:
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
stop_words_es[:10]

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se']

## Normalizing

### Case folding

In [None]:
tokens = ['House', 'Visitor', 'Center']
normalized_tokens = [x.lower() for x in tokens]
print(normalized_tokens)

['house', 'visitor', 'center']


### Stemming

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
' '.join([stemmer.stem(w).strip("'") for w in "dish washer's washed dishes".split()])


'dish washer wash dish'

### Lemmatization

In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
lemmatizer.lemmatize("better")

'better'

In [None]:
lemmatizer.lemmatize("better", pos="a")

'good'

In [None]:
lemmatizer.lemmatize("goods", pos="a")

'goods'

In [None]:
lemmatizer.lemmatize("goods", pos="n")

'good'

## Bag of words with frequency

In [None]:
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(sentences.lower())

In [None]:
from collections import Counter
bow = Counter(tokens)
bow

Counter({'many': 2,
         'years': 1,
         'later': 1,
         ',': 7,
         'as': 1,
         'he': 1,
         'faced': 1,
         'the': 5,
         'firing': 1,
         'squad': 1,
         'colonel': 1,
         'aurelio': 1,
         'buendia': 1,
         'was': 4,
         'to': 4,
         'remember': 1,
         'that': 5,
         'distant': 1,
         'afternoon': 1,
         'his': 1,
         'father': 1,
         'took': 1,
         'him': 1,
         'discover': 1,
         'ice.': 1,
         'at': 1,
         'time': 1,
         'macondo': 1,
         'a': 5,
         'village': 2,
         'of': 7,
         'twenty': 1,
         'adobe': 1,
         'houses': 1,
         'built': 1,
         'on': 1,
         'bank': 1,
         'river': 1,
         'clear': 1,
         'water': 1,
         'ran': 1,
         'along': 1,
         'bed': 1,
         'polished': 1,
         'stones': 1,
         'which': 1,
         'were': 1,
         'white': 1,
       

In [None]:
num_unique_words = len(bow)
key = "macondo"
n = bow[key]
tf = n/num_unique_words
round(tf,3)

0.011

### Vectorizing

In [None]:
doc_length = len(bow)
doc_vector = [round(value / doc_length,3) for value in bow.values()]

In [None]:
doc_vector

[0.022,
 0.011,
 0.011,
 0.078,
 0.011,
 0.011,
 0.011,
 0.056,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.044,
 0.044,
 0.011,
 0.056,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.056,
 0.022,
 0.078,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.044,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.022,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011,
 0.011]

## Zipf's

In [None]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
brown.words()[:10]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']

In [None]:
from collections import Counter
puncs = set((',', '.', '--', '-', '!', '?', ':', ';', '``', "''", '(', ')', '[', ']'))
word_list = (x.lower() for x in brown.words() if x not in puncs)
token_counts = Counter(word_list)
token_counts.most_common(20)

[('the', 69971),
 ('of', 36412),
 ('and', 28853),
 ('to', 26158),
 ('a', 23195),
 ('in', 21337),
 ('that', 10594),
 ('is', 10109),
 ('was', 9815),
 ('he', 9548),
 ('for', 9489),
 ('it', 8760),
 ('with', 7289),
 ('as', 7253),
 ('his', 6996),
 ('on', 6741),
 ('be', 6377),
 ('at', 5372),
 ('by', 5306),
 ('i', 5164)]

## IDF vectors

In [136]:
from nltk.util import ngrams
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

In [138]:
brown.paras()[0]


[['The',
  'Fulton',
  'County',
  'Grand',
  'Jury',
  'said',
  'Friday',
  'an',
  'investigation',
  'of',
  "Atlanta's",
  'recent',
  'primary',
  'election',
  'produced',
  '``',
  'no',
  'evidence',
  "''",
  'that',
  'any',
  'irregularities',
  'took',
  'place',
  '.']]

In [139]:
puncs = set((',', '.', '--', '-', '!', '?', ':', ';', '``', "''", '(', ')', '[', ']'))

In [140]:
par50 = []
for par in brown.paras()[:50]:
  par50 += par[0]
par50_list = (x.lower() for x in par50 if x not in puncs)


In [141]:
intro_text = ' '.join(list(par50_list))

In [142]:
par_rest = []
for par in brown.paras()[:]:
  par_rest += par[0]
par_rest_list = (x.lower() for x in par_rest if x not in puncs)

In [143]:
all_text = ' '.join(list(par_rest_list))


In [144]:
intro_tokens = tokenizer.tokenize(intro_text)
all_tokens = tokenizer.tokenize(all_text)


In [145]:
total_intro = len(intro_tokens)
total_corpus = len(all_tokens)

print(total_intro, total_corpus)

1112 279558


In [146]:
intro_counts = Counter(intro_tokens)

In [147]:
def get_tf(tf_dict, counts, total, token):
  tf_dict[token] = counts[token] / total
  print('Term Frequency of "{}" in intro is: {:.8f}'.format(token, tf_dict[token]))

In [148]:
intro_tf = {}
all_tf = {}

intro_counts = Counter(intro_tokens)
intro_tf['fulton'] = intro_counts['fulton'] / total_intro

all_counts = Counter(all_tokens)
all_tf['fulton'] = all_counts['fulton'] / total_corpus

In [149]:
print('Term Frequency of "fulton" in intro is: {:.8f}'.format(intro_tf['fulton']))
print('Term Frequency of "fulton" in all is: {:.8f}'.format(all_tf['fulton']))

Term Frequency of "fulton" in intro is: 0.00989209
Term Frequency of "fulton" in all is: 0.00003935


In [150]:
get_tf(intro_tf, intro_counts, total_intro, "and")
get_tf(all_tf, all_counts, total_corpus, "and")

Term Frequency of "and" in intro is: 0.02248201
Term Frequency of "and" in intro is: 0.02598030


## Why the difference between "fulton" and "and"?

A good way to think of a term’s inverse document frequency is this: How strange is it that this token is in this document? If a term appears in one document a lot of times, but occurs rarely in the rest of the corpus, one could assume it’s important to that document specifically.

In [151]:
get_tf(intro_tf, intro_counts, total_intro, "committee")

Term Frequency of "committee" in intro is: 0.00089928


In [152]:
get_tf(all_tf, all_counts, total_corpus, "committee")

Term Frequency of "committee" in intro is: 0.00027543


In [153]:
par_rest = []
for par in brown.paras()[50:]:
  par_rest += par[0]
par_rest_list = (x.lower() for x in par_rest if x not in puncs)

rest_text = ' '.join(list(par_rest_list))
rest_tokens = tokenizer.tokenize(rest_text)
total_rest = len(rest_tokens)


In [154]:
### Get IDF

def get_idf(idf_dict, docs_list, token):
  num_docs = len(docs_list)
  num_docs_token = 0
  for doc in docs_list:
    if token in doc:
      num_docs_token += 1
  idf_dict[token] = num_docs / num_docs_token


In [155]:
idf = {}
get_idf(idf, [intro_tokens, rest_tokens], 'fulton')
get_idf(idf, [intro_tokens, rest_tokens], 'and')
get_idf(idf, [intro_tokens, rest_tokens], 'committee')

In [156]:
idf

{'fulton': 2.0, 'and': 1.0, 'committee': 1.0}

## Get TF-IDF

In [157]:
intro_tfidf = {token: intro_tf[token]*idf[token] for token in ['fulton', 'and', 'committee']}
all_tfidf = {token: all_tf[token]*idf[token] for token in ['fulton', 'and', 'committee']}

In [158]:
intro_tfidf

{'fulton': 0.019784172661870502,
 'and': 0.022482014388489208,
 'committee': 0.0008992805755395684}

In [159]:
all_tfidf

{'fulton': 7.869565528441326e-05,
 'and': 0.025980297469576974,
 'committee': 0.0002754347934954464}

## Going back to Zipf

In [None]:
# assume you have 1,000,000 docs

total_docs = 1000000
num_cat = 1
num_dog = 10
idf_cat = total_docs/num_cat
idf_dog = total_docs/num_dog

print(idf_cat, idf_dog)

1000000.0 100000.0


Zipf’s Law showed that when you compare the frequencies of two words, like “cat” and “dog,” even if they occur a similar number of times, the more frequent word will have an exponentially higher frequency than the less frequent one.


### Get the log


In [None]:
import numpy as np
idf_cat1 = np.log10(total_docs/num_cat)
idf_dog1 = np.log10(total_docs/num_dog)
print(idf_cat1, idf_dog1)

6.0 5.0


## Moving everything to log

In [160]:
def get_logtf(tf_dict, counts, total, token):
  tf_dict[token] = np.log(counts[token]) - np.log10(total)

In [161]:
def get_logidf(idf_dict, docs_list, token):
  num_docs = len(docs_list)
  num_docs_token = 0
  for doc in docs_list:
    if token in doc:
      num_docs_token += 1
  print(num_docs,num_docs_token )
  idf_dict[token] = np.log10(num_docs) - np.log10(num_docs_token)


In [162]:
tokens = ['fulton', 'and', 'committee']

logtf_intro = {}
logtf_all = {}
for token in tokens:
  get_logtf(logtf_intro, intro_counts, total_intro, token)
  get_logtf(logtf_all, all_counts, total_corpus, token)

In [163]:
logidf = {}
for token in tokens:
  get_idf(logidf, [intro_tokens, all_tokens], token)

In [164]:
intro_logtfidf = {token: logtf_intro[token] + logidf[token] for token in tokens}
all_logtfidf = {token: logtf_all[token] + logidf[token] for token in tokens}

In [165]:
logtf_intro

{'fulton': -0.6482095144476681,
 'and': 0.17277103762216184,
 'committee': -3.0461047872460387}

In [166]:
logtf_all

{'fulton': -3.048576652006454,
 'and': 3.4440763208013427,
 'committee': -1.1026665029511404}

In [168]:
logidf

{'fulton': 1.0, 'and': 1.0, 'committee': 1.0}

In [169]:
intro_logtfidf

{'fulton': 0.35179048555233194,
 'and': 1.1727710376221618,
 'committee': -2.0461047872460387}

In [170]:
all_logtfidf

{'fulton': -2.048576652006454,
 'and': 4.444076320801343,
 'committee': -0.10266650295114044}