In [1]:
#meta 2/8/2022 Text Vectorization
#book: Applied Text Analysis with Python
#authors: Benjamin Bengfort, Rebecca Bilbro, Tony Ojeda

#infra: run on-prem 
#compute: my trainbox
# created env anya_nlp_experiments: Python 3.7.6, networkx 2.4 already available in default base env, added gensim
#      conda install -c conda-forge gensim
#      gensim 3.8.3

#2/8/2022 Text Vectorization
#      Tokenization
#      Frequency vectors with NLTK, Scikit-learn, Gensim

In [2]:
import nltk
import string

Refer to https://www.nltk.org/data.html

NLTK comes with many corpora, toy grammars, trained models, etc. A complete list is posted at: https://www.nltk.org/nltk_data/

To install the data, first install NLTK (see https://www.nltk.org/install.html), then use NLTK’s data downloader.

In [3]:
#prerequisite
#nltk.download('punkt')

#### book `Applied Text Analysis with Python`
Enabling language-aware data products with ML.  
repo https://github.com/anyaconda/atap

# Ch.4 Text Vectorization and Transformation Pipelines 



## Words in Space
Setup: Create a list of docs and tokenize them for vectorization examples.


In [4]:
# The corpus object
# `doc` is a single sample sentence used to try the helpers on one document at a time;
# `corpus` is the list of three short documents that the vectorization examples below run on.
doc = "The elephant sneezed at the sight of potatoes"
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]

Vectorization technique -> choice of implementation:  
- NLTK
- Scikit-Learn  
- Gensim

In [5]:
# Tokenization function - using NLTK
#desc: perform lightweight normalization, strip punctuation, set to lowercase
#return: tokens of type string
def tokenize(text):
    """Yield the stemmed, lowercased tokens of ``text``, skipping punctuation.

    The text is lowercased, split with ``nltk.word_tokenize`` and each
    remaining token is reduced with the English Snowball stemmer.

    Args:
        text: the raw document string.

    Yields:
        str: one stemmed token at a time, in document order.
    """
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        # Test character by character: `token in string.punctuation` is a
        # substring test, so multi-character punctuation tokens such as
        # '...', '--' or quote tokens like '``' were never filtered out.
        if all(char in string.punctuation for char in token):
            continue
        yield stem.stem(token)


In [6]:
#my parts - Tokenization function
# Print each stemmed token of the sample `doc`, one per line, to inspect what tokenize() yields.
for token in tokenize(doc):
    print (token)

the
eleph
sneez
at
the
sight
of
potato


## Frequency Vectors
The simplest vector encoding model is to fill in the vector with the frequency of each word as it appears in a doc.  The representation can be either a straight count or a normalized encoding (each word is weighted by the total # of words in the doc).

Formal definitions:

In [15]:
def nltk_frequency_vectorize(corpus):
    """Lazily build one token-frequency mapping per document (NLTK approach).

    Args:
        corpus: iterable of raw document strings.

    Returns:
        A ``map`` iterator producing, for each document, a ``defaultdict(int)``
        keyed by the tokens from ``tokenize`` with their occurrence counts.
    """
    from collections import defaultdict

    def count_tokens(document):
        # Missing keys start at 0, so every token can be incremented directly.
        counts = defaultdict(int)
        for term in tokenize(document):
            counts[term] = counts[term] + 1
        return counts

    return map(count_tokens, corpus)


def sklearn_frequency_vectorize(corpus):
    """Frequency-encode ``corpus`` with scikit-learn's ``CountVectorizer``.

    The vectorizer is created with its default settings and fitted on the
    given documents in the same call.

    Args:
        corpus: iterable of raw document strings.

    Returns:
        The result of ``CountVectorizer().fit_transform(corpus)``.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    return CountVectorizer().fit_transform(corpus)


def gensim_frequency_vectorize(corpus):
    """Frequency-encode ``corpus`` as Gensim bag-of-words vectors.

    Each document is tokenized with ``tokenize``, a ``gensim.corpora.Dictionary``
    is built over all token lists, and every token list is then converted
    with the dictionary's ``doc2bow``.

    Args:
        corpus: iterable of raw document strings.

    Returns:
        list: one ``doc2bow`` result per document, in corpus order.
    """
    import gensim

    # Materialize the token generators: the lists are traversed twice
    # (once to build the dictionary, once to encode).
    token_lists = []
    for text in corpus:
        token_lists.append(list(tokenize(text)))

    lexicon = gensim.corpora.Dictionary(token_lists)
    return list(map(lexicon.doc2bow, token_lists))

#### My parts

- NLTK

In [7]:
# The NLTK frequency vectorize method
from collections import defaultdict

def vectorize(doc):
    """Count how often each token from ``tokenize`` occurs in one document.

    Args:
        doc: the raw document string.

    Returns:
        defaultdict(int): token -> occurrence count, in first-seen order.
    """
    counts = defaultdict(int)
    for term in tokenize(doc):
        # A missing key defaults to 0, so no existence check is needed.
        counts[term] = counts[term] + 1
    return counts

In [8]:
#my parts
# Frequency vector of the single sample `doc`; the cell's last expression is displayed as its output.
vectorize(doc)

defaultdict(int,
            {'the': 2,
             'eleph': 1,
             'sneez': 1,
             'at': 1,
             'sight': 1,
             'of': 1,
             'potato': 1})

In [32]:
# Vectorize every document in the corpus. In Python 3 `map` returns a one-shot
# iterator, so materialize it once: `vectors` then stays usable in later cells
# instead of being left exhausted after it has been printed.
vectors = list(map(vectorize, corpus))
print(vectors)

[defaultdict(<class 'int'>, {'the': 2, 'eleph': 1, 'sneez': 1, 'at': 1, 'sight': 1, 'of': 1, 'potato': 1}), defaultdict(<class 'int'>, {'bat': 2, 'can': 1, 'see': 2, 'via': 1, 'echoloc': 1, 'the': 1, 'sight': 1, 'sneez': 1}), defaultdict(<class 'int'>, {'wonder': 1, 'she': 1, 'open': 1, 'the': 2, 'door': 1, 'to': 1, 'studio': 1})]


- Scikit-learn

In [11]:
#my parts
from sklearn.feature_extraction.text import CountVectorizer

# Fit a CountVectorizer (default settings) on the raw corpus strings and
# build the document-term matrix in one step.
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(corpus) #scipy.sparse.csr.csr_matrix
# Printing the sparse matrix lists only the non-zero entries as (row, column) count.
print(dtm)

  (0, 16)	2
  (0, 6)	1
  (0, 14)	1
  (0, 0)	1
  (0, 12)	1
  (0, 7)	1
  (0, 9)	1
  (1, 16)	1
  (1, 12)	1
  (1, 2)	1
  (1, 3)	1
  (1, 10)	2
  (1, 18)	1
  (1, 5)	1
  (1, 1)	1
  (1, 13)	1
  (2, 16)	2
  (2, 19)	1
  (2, 11)	1
  (2, 8)	1
  (2, 4)	1
  (2, 17)	1
  (2, 15)	1


In [12]:
# Dense view of the rows for the first two documents, zeros included.
dtm[0].toarray(), dtm[1].toarray()

(array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0]],
       dtype=int64),
 array([[0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 0, 1, 0]],
       dtype=int64))

In [13]:
#my parts
# Vocabulary terms of the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out()` when it exists and fall back on older installs.
# `.tolist()` turns the returned array into a plain list of str, matching the old output.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
vocab

['at',
 'bat',
 'bats',
 'can',
 'door',
 'echolocation',
 'elephant',
 'of',
 'opened',
 'potatoes',
 'see',
 'she',
 'sight',
 'sneeze',
 'sneezed',
 'studio',
 'the',
 'to',
 'via',
 'wondering']

- Gensim

In [21]:
import gensim
    
# Materialize the tokens of every document as a list, then build a Gensim
# Dictionary over those token lists.
tokenized_corpus = [list(tokenize(doc)) for doc in corpus]
id2word = gensim.corpora.Dictionary(tokenized_corpus) #class gensim.corpora.dictionary.Dictionary

In [22]:
#my parts
# Inspect the stemmed token lists, one list per corpus document.
tokenized_corpus

[['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato'],
 ['bat',
  'can',
  'see',
  'via',
  'echoloc',
  'see',
  'the',
  'bat',
  'sight',
  'sneez'],
 ['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']]

In [18]:
#my parts
# Encode each tokenized document with the dictionary's doc2bow; displayed as the cell output.
[id2word.doc2bow(doc) for doc in tokenized_corpus]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)],
 [(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)],
 [(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]

In [27]:
#my parts
# Print every (id, token) pair stored in the dictionary, to read the ids used in the bag-of-words output.
for item in id2word.items():
    print(item)

(0, 'at')
(1, 'eleph')
(2, 'of')
(3, 'potato')
(4, 'sight')
(5, 'sneez')
(6, 'the')
(7, 'bat')
(8, 'can')
(9, 'echoloc')
(10, 'see')
(11, 'via')
(12, 'door')
(13, 'open')
(14, 'she')
(15, 'studio')
(16, 'to')
(17, 'wonder')


## Xtra

In [None]:
#$xtra How does collections.defaultdict work?
# defaultdict(int) supplies int() == 0 for a key that is not present yet, so
# `d[k] += 1` works the first time a character is seen without raising KeyError.
s = 'mississippi'
d = defaultdict(int)
for k in s:
    d[k] += 1
d.items()