### **PART 1. Using NLTK**

In [10]:
import nltk
text = "For those of you who have read the first edition of this book, some of the familiar case studies \
        will reappear in this edition"

In [11]:
# 1. Split an input text into tokens including words and marks such as comma, punctuation, etc...
tokens = nltk.word_tokenize(text)
print("Output: \n======\n", tokens)

Output: 
 ['For', 'those', 'of', 'you', 'who', 'have', 'read', 'the', 'first', 'edition', 'of', 'this', 'book', ',', 'some', 'of', 'the', 'familiar', 'case', 'studies', 'will', 'reappear', 'in', 'this', 'edition']


In [12]:
# 2. Get stopword list, then removing stopwords
stopwords = set(nltk.corpus.stopwords.words("english"))
print("Number of stopwords: ", len(stopwords))

tokens = [token for token in tokens if token not in stopwords]
print("Output: \n======\n", tokens)

Number of stopwords:  179
Output: 
 ['For', 'read', 'first', 'edition', 'book', ',', 'familiar', 'case', 'studies', 'reappear', 'edition']


In [13]:
# 3. Tag part-of-speech (POS) for each token in the input text
tagged_tokens = nltk.pos_tag(tokens)
print("Output: \n======\n", tagged_tokens)

Output: 
 [('For', 'IN'), ('read', 'VBN'), ('first', 'JJ'), ('edition', 'NN'), ('book', 'NN'), (',', ','), ('familiar', 'JJ'), ('case', 'NN'), ('studies', 'NNS'), ('reappear', 'VBP'), ('edition', 'NN')]


In [14]:
# 4. Identify name entities
# The input for ne_chunk() is the list of tokens that were tagged the part-of-speech
entities = nltk.chunk.ne_chunk(tagged_tokens)
print("Output: \n======\n", entities)

Output: 
 (S
  For/IN
  read/VBN
  first/JJ
  edition/NN
  book/NN
  ,/,
  familiar/JJ
  case/NN
  studies/NNS
  reappear/VBP
  edition/NN)


### **PART 2. Using Gensim to convert docs to TF-IDF**

In [15]:
from gensim import corpora, models

documents = [
    ['architecture', 'layers', 'request', 'flows'],
    ['event', 'processor', 'components'],
    ['researchers', 'struggled', 'MLPs', 'backpropagation', 'autodiff'],
    ['determine', 'variance', 'estimators'],
    ['summarize', 'hypothesis', 'tests', 'mean'],
]

In [18]:
# 1. Create the dictionary from the input document set
# Dictionary is a list of tuples, in which each tuple is a pair of word_id and word
dictionary = corpora.Dictionary(documents)
print(list(dictionary.items()))

[(0, 'architecture'), (1, 'flows'), (2, 'layers'), (3, 'request'), (4, 'components'), (5, 'event'), (6, 'processor'), (7, 'MLPs'), (8, 'autodiff'), (9, 'backpropagation'), (10, 'researchers'), (11, 'struggled'), (12, 'determine'), (13, 'estimators'), (14, 'variance'), (15, 'hypothesis'), (16, 'mean'), (17, 'summarize'), (18, 'tests')]


In [19]:
# 2. Represent each document in bag-of-word
# in each document, a word is represented by a pair of the word_id and the frequency of this word in the document
corpus = [dictionary.doc2bow(document) for document in documents]
for num, document in enumerate(corpus):
    print("document %d has %d words: " %(num, len(document)), document)

document 0 has 4 words:  [(0, 1), (1, 1), (2, 1), (3, 1)]
document 1 has 3 words:  [(4, 1), (5, 1), (6, 1)]
document 2 has 5 words:  [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]
document 3 has 3 words:  [(12, 1), (13, 1), (14, 1)]
document 4 has 4 words:  [(15, 1), (16, 1), (17, 1), (18, 1)]


In [22]:
# 3. convert bag-of-word to numeric vector using tf-idf
tfidf = models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]
for num, document in enumerate(tfidf_corpus):
    print("document %d has %d words: " %(num, len(document)), document)

document 0 has 4 words:  [(0, 0.5), (1, 0.5), (2, 0.5), (3, 0.5)]
document 1 has 3 words:  [(4, 0.5773502691896258), (5, 0.5773502691896258), (6, 0.5773502691896258)]
document 2 has 5 words:  [(7, 0.447213595499958), (8, 0.447213595499958), (9, 0.447213595499958), (10, 0.447213595499958), (11, 0.447213595499958)]
document 3 has 3 words:  [(12, 0.5773502691896258), (13, 0.5773502691896258), (14, 0.5773502691896258)]
document 4 has 4 words:  [(15, 0.5), (16, 0.5), (17, 0.5), (18, 0.5)]
