In [1]:
##### text vectorization

In [2]:
# https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Toy corpus: four short documents, each assembled from its individual sentences.
_doc_sentences = (
    ("This is a brown house.", "This house is big.", "The street number is 1."),
    ("This is a small house.", "This house has 1 bedroom.", "The street number is 12."),
    ("This dog is brown.", "This dog likes to play."),
    ("The dog is in the bedroom.",),
)
# Sentences of one document are separated by a single space.
corpus = [" ".join(sentences) for sentences in _doc_sentences]
corpus

['This is a brown house. This house is big. The street number is 1.',
 'This is a small house. This house has 1 bedroom. The street number is 12.',
 'This dog is brown. This dog likes to play.',
 'The dog is in the bedroom.']

### Binary Term Frequency

In [5]:
# Binary Term Frequency: each cell is 1.0 if the term occurs in the document, else 0.0.
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(
    # norm must be 'l1', 'l2' or None; None disables row normalization
    # (False is not an accepted value and is rejected by parameter validation).
    binary=True, norm=None, # tf - binary
    use_idf=False, smooth_idf=False, # idf - none
    lowercase=True, stop_words='english',
    min_df=1, max_df=1.0, max_features=None,
    ngram_range=(1, 1))
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())
# print(tv.vocabulary_) # dictionary where key is word and value is index in returned array
# from sklearn.feature_extraction import text
# print(text.ENGLISH_STOP_WORDS)
df

Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### BoW (Bag of Words) Term Frequency 

In [6]:
# BoW (Bag of Words) Term Frequency: each cell is the raw count of the term in the document.

# Note: The default regexp selects tokens of 2 or more alphanumeric characters
# (punctuation is completely ignored and always treated as a token separator).
#
# lowercase = True; stop_words = 'english'
#
# - max_df (default 1.0):
#             When building the vocabulary, ignore terms that have a document frequency
#             strictly higher than the given threshold (corpus-specific stop words).
#             If float, the parameter represents a proportion of documents; if integer, absolute counts.
# - min_df (default 1):
#           When building the vocabulary, ignore terms that have a document frequency
#           strictly lower than the given threshold.
#           If float, the parameter represents a proportion of documents; if integer, absolute counts.
# - max_features (default None):
#             If not None, build a vocabulary that only considers the top max_features
#             ordered by term frequency across the corpus.
# - ngram_range (default (1,1)):
#            The lower and upper boundary of the range of n-values for different n-grams to be extracted.
#            All values of n such that min_n <= n <= max_n will be used.

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(
    binary=False, norm=None, # tf - bow (raw counts, no normalization)
    use_idf=False, smooth_idf=False, # idf - none
    lowercase=True, stop_words='english',
    min_df=1, max_df=1.0, max_features=None,
    ngram_range=(1, 1))
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())
df

# BoW using CountVectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(
#     lowercase=True, stop_words='english',
#     min_df=1, max_df=1.0, max_features=None,
#     ngram_range=(1, 1))
# df = pd.DataFrame(cv.fit_transform(corpus).toarray(), columns=cv.get_feature_names_out())
# df

Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0
1,1.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0
2,0.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### L1-Normalized Term Frequency

In [7]:
# L1-Normalized Term Frequency: raw counts divided by the row's total count.

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(
    binary=False, norm='l1', # tf - l1 norm
    use_idf=False, smooth_idf=False, # idf - none
    lowercase=True, stop_words='english',
    min_df=1, max_df=1.0, max_features=None,
    ngram_range=(1, 1))
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())

# With norm='l1' the absolute values of every row must sum to 1
# (true for any document that keeps at least one term after stop-word removal),
# e.g. np.sum(np.abs(df.loc[0])) == 1.
assert np.allclose(df.abs().sum(axis=1), 1.0), "rows are not L1-normalized"
df

Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.0,0.166667,0.166667,0.0,0.333333,0.0,0.166667,0.0,0.0,0.166667
1,0.142857,0.142857,0.0,0.0,0.0,0.285714,0.0,0.142857,0.0,0.142857,0.142857
2,0.0,0.0,0.0,0.2,0.4,0.0,0.2,0.0,0.2,0.0,0.0
3,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0


### L2-Normalized TfIdf 

In [8]:
# L2-Normalized TfIdf: counts weighted by smoothed idf, then each row scaled to unit length.

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(
    binary=False, # tf  - bow
    use_idf=True, smooth_idf=True, # idf  - with smoothing
    norm='l2', # tfidf - l2 norm
    lowercase=True, stop_words='english',
    min_df=1, max_df=1.0, max_features=None,
    ngram_range=(1, 1))
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())

# With norm='l2' the sum of squares of every row must be 1
# (true for any document that keeps at least one term after stop-word removal),
# e.g. np.sum(np.square(df.loc[0])) == 1.
# (With norm='l1' it would instead be the absolute values that sum to 1:
#  np.sum(np.abs(df.loc[0])) == 1.)
assert np.allclose(np.square(df).sum(axis=1), 1.0), "rows are not L2-normalized"
df

Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.0,0.432291,0.340823,0.0,0.681647,0.0,0.340823,0.0,0.0,0.340823
1,0.396802,0.312843,0.0,0.0,0.0,0.625687,0.0,0.312843,0.0,0.396802,0.312843
2,0.0,0.0,0.0,0.348842,0.697684,0.0,0.442462,0.0,0.442462,0.0,0.0
3,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Second corpus: three longer passages, each about a different sense of the word "Python".
# Bracketed markers such as [1] or [19][20] are part of the passage text itself.
# NOTE: this rebinds `corpus`, replacing the four-sentence toy corpus used above.

# document1: Python, the serpent of Greek mythology
document1 = """In Greek mythology, Python (Greek: Πύθων, gen.: Πύθωνος) was the earth-dragon of 
Delphi, always represented in Greek sculpture and vase-paintings as a serpent. He presided at the 
Delphic oracle, which existed in the cult center for his mother, Gaia, "Earth," Pytho being the 
place name that was substituted for the earlier Krisa.[1] Hellenes considered the site to be the 
center of the earth, represented by a stone, the omphalos or navel, which Python guarded."""

# document2: Monty Python, the comedy group
document2 = """Monty Python (sometimes known as The Pythons)[2][3] were a British surreal comedy 
group who created the sketch comedy show Monty Python's Flying Circus, that first aired on the BBC on 
October 5, 1969. Forty-five episodes were made over four series. The Python phenomenon developed from 
the television series into something larger in scope and impact, spawning touring stage shows, films, 
numerous albums, several books, and a stage musical. The group's influence on comedy has been compared 
to The Beatles' influence on music."""

# document3: Python, the programming language
document3 = """Python is a widely used general-purpose, high-level programming language.[19][20] 
Its design philosophy emphasizes code readability, and its syntax allows programmers to express 
concepts in fewer lines of code than would be possible in languages such as C++ or Java.[21][22] 
The language provides constructs intended to enable clear programs on both a small and large scale."""

# One entry per document; row i of the TF-IDF matrix built from this list corresponds to corpus[i].
corpus = [document1, document2, document3]


# L2-normalized, idf-smoothed TF-IDF of the current `corpus` (one row per document).
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(
    binary=False, # tf  - bow
    use_idf=True, smooth_idf=True, # idf  - with smoothing
    norm='l2', # tfidf - l2 norm
    lowercase=True, stop_words='english',
    min_df=1, max_df=1.0, max_features=None,
    ngram_range=(1, 1))
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())
df

Unnamed: 0,19,1969,20,21,22,aired,albums,allows,bbc,beatles,...,substituted,surreal,syntax,television,touring,used,vase,widely,πύθων,πύθωνος
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.133161,0.0,0.0,0.0,0.0,0.0,0.133161,0.0,0.133161,0.133161
1,0.0,0.126858,0.0,0.0,0.0,0.126858,0.126858,0.0,0.126858,0.126858,...,0.0,0.126858,0.0,0.126858,0.126858,0.0,0.0,0.0,0.0,0.0
2,0.153667,0.0,0.153667,0.153667,0.153667,0.0,0.0,0.153667,0.0,0.0,...,0.0,0.0,0.153667,0.0,0.0,0.153667,0.0,0.153667,0.0,0.0


In [10]:
# For each of the three documents, show its five highest-weighted terms.
for doc_id in (0, 1, 2):
    print(f"\ndocument {doc_id}:")
    ranked_terms = df.loc[doc_id].sort_values(ascending=False)
    print(ranked_terms.head(5))


document 0:
greek          0.399484
earth          0.399484
represented    0.266323
center         0.266323
python         0.157295
Name: 0, dtype: float64

document 1:
comedy       0.380573
group        0.253715
stage        0.253715
series       0.253715
influence    0.253715
Name: 1, dtype: float64

document 2:
language       0.307333
code           0.307333
general        0.153667
programming    0.153667
programmers    0.153667
Name: 2, dtype: float64


### Word2Vec using spaCy

In [11]:
# Load the pre-trained spaCy pipeline "en_core_web_md".
# The 'parser' and 'ner' components are disabled; the cells that follow in this
# section only use word/document vectors and similarity.
import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])

In [12]:
# Embed the word 'breakfast' once, then report the vector's dimensionality
# and preview its first ten components.
breakfast_vec = nlp('breakfast').vector
print(breakfast_vec.size)
breakfast_vec[:10]

300


array([ 0.073378,  0.22767 ,  0.20842 , -0.45679 , -0.078219,  0.60196 ,
       -0.024494, -0.46798 ,  0.054627,  2.2837  ], dtype=float32)

In [13]:
# Cosine similarity between the vector representations of 'breakfast' and 'universe'
breakfast_doc, universe_doc = nlp('breakfast'), nlp('universe')
breakfast_doc.similarity(universe_doc)

0.04429254581549143

In [14]:
# A document's vector is the average of its token vectors:
# show its dimensionality and a preview of the first ten components.
doc = nlp("I like oranges that are sweet.")
doc_vec = doc.vector
print(doc_vec.size)
doc_vec[:10]

300


array([-0.13203129,  0.28281957, -0.2719    , -0.34741428,  0.08816486,
        0.07970984,  0.06203228, -0.139731  , -0.04216057,  1.8548899 ],
      dtype=float32)

In [15]:
# Similarity of two whole sentences: same structure, but a different fruit
# (oranges vs. apples) and a different taste adjective (sweet vs. sour).
doc1 = nlp("I like oranges that are sweet.")
doc2 = nlp("I like apples that are sour.")
doc1.similarity(doc2)

0.9621542455456396

### Word2Vec using gensim

In [16]:
# https://github.com/RaRe-Technologies/gensim-data
# pip install gensim

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

# api.info() looks up the gensim-data catalogue remotely, so query it once and
# reuse the result for both listings instead of calling it twice.
gensim_catalogue = api.info()
print(gensim_catalogue['models'].keys())   # names of the available pre-trained models
print(gensim_catalogue['corpora'].keys())  # names of the available corpora

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])
dict_keys(['semeval-2016-2017-task3-subtaskBC', 'semeval-2016-2017-task3-subtaskA-unannotated', 'patent-2017', 'quora-duplicate-questions', 'wiki-english-20171001', 'text8', 'fake-news', '20-newsgroups', '__testing_matrix-synopsis', '__testing_multipart-matrix-synopsis'])


In [17]:
# Load the pre-trained 'glove-twitter-25' vectors through the gensim downloader.
# These are GloVe embeddings trained on Twitter data (2B tweets, 27B tokens, 1.2M vocab).
mdl_gt25 = api.load('glove-twitter-25')



In [18]:
# Get the pre-trained embedding (word vector) of the word 'breakfast'
mdl_gt25['breakfast']

array([-1.2631e+00,  9.7467e-01,  5.8564e-01,  2.4308e-01, -4.8782e-04,
        7.8041e-01,  1.3769e+00, -1.1552e+00,  3.1112e-01,  1.8297e-01,
        2.4571e-01, -5.9838e-01, -3.4310e+00, -5.7546e-01,  7.2326e-01,
       -3.7095e-01,  1.6455e+00, -1.7549e+00, -8.2295e-01,  1.4697e-02,
       -5.2037e-01, -1.6713e-01, -9.1294e-01,  1.0092e+00,  1.2327e+00],
      dtype=float32)

In [19]:
# Get the words most similar to 'breakfast', ranked by cosine similarity of their embeddings
mdl_gt25.most_similar('breakfast')

[('dinner', 0.9736680388450623),
 ('lunch', 0.973638653755188),
 ('meal', 0.9281821250915527),
 ('cooking', 0.9191612601280212),
 ('bake', 0.9160193800926208),
 ('bbq', 0.909622073173523),
 ('coffee', 0.9091862440109253),
 ('snack', 0.9019741415977478),
 ('cake', 0.9003361463546753),
 ('food', 0.897562563419342)]

In [20]:
# Cosine similarity between the embeddings of 'breakfast' and 'universe'
mdl_gt25.similarity('breakfast','universe')

0.35903135

In [21]:
# Find the odd one out among the listed words, based on cosine similarity between their embeddings
mdl_gt25.doesnt_match(['breakfast', 'cereal', 'dinner', 'lunch'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'cereal'

In [22]:
# Load a sample corpus (first 100,000,000 bytes of plain text from Wikipedia - used for testing purposes).
# NOTE: this rebinds `corpus`, which earlier cells used for lists of raw document strings.
corpus = api.load('text8')

# Use the sample corpus to train a Word2Vec model with 25-dimensional vectors.
# gensim 4 renamed the `size` keyword to `vector_size`; passing `size` raises a TypeError there.
mdl_text8 = Word2Vec(corpus, vector_size=25)



In [23]:
# Look up the embedding the freshly trained model learned for 'breakfast'
mdl_text8.wv['breakfast']

array([ 0.98928195,  0.74085563, -0.1351101 , -0.15665665,  0.44802588,
       -0.12992509, -0.0552069 ,  0.60877985, -1.4601767 ,  0.05157738,
       -0.4879481 , -0.4341142 , -0.43766704, -0.76964897, -0.43753752,
        0.39795735, -0.19255933, -0.06765946,  0.15642244,  0.12377832,
       -0.84224975, -0.05425147, -0.6869235 , -0.6053662 , -1.0039521 ],
      dtype=float32)

In [24]:
# Get the words most similar to 'breakfast', ranked by cosine similarity of their w2v representations
mdl_text8.wv.most_similar('breakfast')

[('restaurant', 0.8712396621704102),
 ('cafe', 0.8542876243591309),
 ('caf', 0.8474255800247192),
 ('gnomes', 0.84150230884552),
 ('stuffed', 0.8392696380615234),
 ('toad', 0.8273366689682007),
 ('lemon', 0.8272871971130371),
 ('palm', 0.8271494507789612),
 ('trailer', 0.8260844945907593),
 ('lynx', 0.824597954750061)]