In [16]:
from collections import Counter
import re

from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from nltk import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Raw token counts with no normalisation: "The" and "the" are counted as
# different tokens, and the full stop "." is counted as a token too.
Counter(
    word_tokenize(
        "The cat is in the box. The cat likes the box. \n"
        "The box is over the cat."
    )
)

Counter({'The': 3,
         'cat': 3,
         'is': 2,
         'in': 1,
         'the': 3,
         'box': 3,
         '.': 3,
         'likes': 1,
         'over': 1})

In [3]:
text = (
    "The cat is in the box. The cat likes the box. \n"
    "The box is over the cat."
)

# Lowercase first so that "The" and "the" collapse into a single token, then
# keep only purely alphabetic tokens (this drops punctuation such as ".").
tokens = [w for w in word_tokenize(text.lower()) if w.isalpha()]

# Build the stop-word collection once, as a set. Calling
# stopwords.words('english') inside the comprehension condition would invoke
# it once per token and scan the resulting list linearly on every test.
english_stops = set(stopwords.words('english'))
no_stops = [t for t in tokens if t not in english_stops]

# Count once and reuse the counter for both displayed values:
# the full counts and the two most common tokens.
word_counts = Counter(no_stops)
word_counts, word_counts.most_common(2)

(Counter({'cat': 3, 'box': 3, 'likes': 1}), [('cat', 3), ('box', 3)])

In [6]:
# Toy corpus: six short movie reviews.
my_documents = [
    "The movie was about a spaceship and aliens.",
    "I really liked the movie!",
    "Awesome action scenes, but boring characters.",
    "The movie was awful! I hate alien films.",
    "Space is cool! I liked the movie.",
    "More space films, please!",
]

# Lowercase and tokenize every review. Punctuation is not filtered here,
# so ".", "!" and "," end up as tokens of their own.
tokenized_docs = [word_tokenize(review.lower()) for review in my_documents]

# Assign an integer id to every distinct token across all reviews.
dictionary = Dictionary(tokenized_docs)

# Show the token -> id mapping.
dictionary.token2id

{'.': 0,
 'a': 1,
 'about': 2,
 'aliens': 3,
 'and': 4,
 'movie': 5,
 'spaceship': 6,
 'the': 7,
 'was': 8,
 '!': 9,
 'i': 10,
 'liked': 11,
 'really': 12,
 ',': 13,
 'action': 14,
 'awesome': 15,
 'boring': 16,
 'but': 17,
 'characters': 18,
 'scenes': 19,
 'alien': 20,
 'awful': 21,
 'films': 22,
 'hate': 23,
 'cool': 24,
 'is': 25,
 'space': 26,
 'more': 27,
 'please': 28}

In [12]:
# Convert every tokenized document to its bag-of-words representation.
# `corpus` is a list with one entry per document; each entry is a list of
# (token_id, count) tuples, where token_id is the token's id in `dictionary`
# and count is how many times that token occurs in the document.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokenized_docs]

# Print one bag-of-words vector per line.
for bow in corpus:
    print(bow)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]
[(5, 1), (7, 1), (9, 1), (10, 1), (11, 1), (12, 1)]
[(0, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]
[(0, 1), (5, 1), (7, 1), (8, 1), (9, 1), (10, 1), (20, 1), (21, 1), (22, 1), (23, 1)]
[(0, 1), (5, 1), (7, 1), (9, 1), (10, 1), (11, 1), (24, 1), (25, 1), (26, 1)]
[(9, 1), (13, 1), (22, 1), (26, 1), (27, 1), (28, 1)]


#### Tf-idf with gensim

* Term frequency - inverse document frequency.
* Allows you to determine the most important words in each document
* Each corpus may have shared words beyond just stopwords
* These words should be down-weighted in importance
* Example from astronomy: "star" is a common word, but it is not as important as "neutron star" or "black hole"
* Ensures the most common words don't show up as keywords
* Keeps document-specific frequent words weighted high

$$
tfidf(t,d,D) = tf(t,d) \times idf(t,D)
$$
$$
w_{i,j} = tf_{i,j} \times \log\left(\frac{N}{df_i}\right)
$$

where 

$w_{i,j}$ is the weight of token $i$ in document $j$.  
$tf_{i,j}$ is the term frequency of token $i$ in document $j$.  
$df_i$ is the number of documents that contain token $i$.  
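$N$ is the total number of documents.

Note: with its default settings, gensim's `TfidfModel` uses a base-2 logarithm for the idf term and then normalizes each document vector to unit length, so the weights shown below are the normalized versions of $w_{i,j}$ rather than the raw products.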

In [18]:
# Fit a tf-idf model on the bag-of-words corpus.
tfidf = TfidfModel(corpus)

# Look up the weights for the second document ("I really liked the movie!");
# the result is a list of (token_id, weight) pairs.
second_doc_bow = corpus[1]
tfidf[second_doc_bow]

[(5, 0.1746298276735174),
 (7, 0.1746298276735174),
 (9, 0.1746298276735174),
 (10, 0.29853166221463673),
 (11, 0.47316148988815415),
 (12, 0.7716931521027908)]