In [49]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import word2vec,FastText

In [14]:
### One Hot Encoding (OHE) Of Word Vectors
# In this method, each word in the vocabulary V is assigned an integer index, i(from 0 to V-1) & the vector representation for each word
# is of the length V with all 0s except 1 at the ith index for the corresponding word. It is a word-level representation.

In [15]:
sample_sentence = "the dictionary has many examples of how words are used"

def OHE(text):
  tokens = set(text.lower().split())
  length = len(tokens)
  index_map = { val:index for val,index in zip(tokens,range(length))}
  ohe_matrix = {}


  for token in tokens :
    ohe = np.zeros(length)
    ohe[index_map[token]] = 1
    ohe_matrix[token] = ohe

  return ohe_matrix


ohe_result = OHE(sample_sentence)
ohe_result

{'the': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'examples': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'used': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 'words': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]),
 'dictionary': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]),
 'are': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 'of': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]),
 'many': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 'how': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]),
 'has': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])}

In [16]:
"""
The vector representation doesn’t hold any semantic meaning,
example: If we get boy, boys & jail as 3 tokens, we know vector representation for boy & boys should be closer
but in the case of OHE, every token is at the same distance from any other token.

Sparse representation (most values 0 in presentation) can lead to computational inefficiencies.

OOV (out of vocabulary) problem.

Very high dimension as each word has a vector length of total vocab, the dimensionality curse.
"""

'\nThe vector representation doesn’t hold any semantic meaning,\nexample: If we get boy, boys & jail as 3 tokens, we know vector representation for boy & boys should be closer \nbut in the case of OHE, every token is at the same distance from any other token. \n\nSparse representation (most values 0 in presentation) can lead to computational inefficiencies.\n\nOOV (out of vocabulary) problem.\n\nVery high dimension as each word has a vector length of total vocab, the dimensionality curse.\n'

In [17]:
### Bag of Words (BoW)
# Bag Of Word is indeed just a bag of words ignoring 2 crucial things in its vector representations 1) Order of tokens in which they appear in sentence/sequence and 2) Semantic meaning of the token.
# Bag of Words is majorly based on the frequency of tokens present in a sentence & nothing else.
# It is a sentence-level representation.

In [18]:
sample_sentences = ["the dictionary has many examples of how words are used",
                    "The more you practice, the more confident you become"]

cv = CountVectorizer()
bow_rep = cv.fit_transform(sample_sentences)

print(cv.vocabulary_)
print(f"BOW Representation for {sample_sentences[0]} is {bow_rep[0].toarray()}")
print(f"BOW Representation for {sample_sentences[1]} is {bow_rep[1].toarray()}")

{'the': 11, 'dictionary': 3, 'has': 5, 'many': 7, 'examples': 4, 'of': 9, 'how': 6, 'words': 13, 'are': 0, 'used': 12, 'more': 8, 'you': 14, 'practice': 10, 'confident': 2, 'become': 1}
BOW Representation for the dictionary has many examples of how words are used is [[1 0 0 1 1 1 1 1 0 1 0 1 1 1 0]]
BOW Representation for The more you practice, the more confident you become is [[0 1 1 0 0 0 0 0 2 0 1 2 0 0 2]]


In [19]:
"""
The representation is now of fixed length irrespective of the sentence length
The representation dimension has reduced drastically compared to OHE where we would have such vector representation for just one token/word.
Though, as vocab can be huge, the representation can still be parsed.

Any sentence with the same words will have a similar presentation. Though, slight variations can make the representation drastically different.
‘I run’ & ‘they ran’ will have completely different representation

OOV (out of vocabulary) problem.
"""

' \nThe representation is now of fixed length irrespective of the sentence length\nThe representation dimension has reduced drastically compared to OHE where we would have such vector representation for just one token/word. \nThough, as vocab can be huge, the representation can still be parsed.\n\nAny sentence with the same words will have a similar presentation. Though, slight variations can make the representation drastically different. \n‘I run’ & ‘they ran’ will have completely different representation\n\nOOV (out of vocabulary) problem.\n'

In [20]:
### Bag of N-grams (BoN)
# Before this the major drawback observed was the method was missing out on the context of the words used.
# By grouping continuous words together, we may be able to capture some meaning.
# The BoW can be considered as BoN with n=1.

In [21]:
cv = CountVectorizer(ngram_range=(3,3))
sample_sentences = ["the dictionary has many examples of how words are used",
                    "The more you practice, the more confident you become"]

ngram_rep = cv.fit_transform(sample_sentences)
print(cv.vocabulary_)

print(f"N-GRAM Representation for {sample_sentences[0]} is {ngram_rep[0].toarray()}")
print(f"N-GRAM Representation for {sample_sentences[1]} is {ngram_rep[1].toarray()}")

{'the dictionary has': 10, 'dictionary has many': 1, 'has many examples': 3, 'many examples of': 5, 'examples of how': 2, 'of how words': 8, 'how words are': 4, 'words are used': 13, 'the more you': 12, 'more you practice': 7, 'you practice the': 14, 'practice the more': 9, 'the more confident': 11, 'more confident you': 6, 'confident you become': 0}
N-GRAM Representation for the dictionary has many examples of how words are used is [[0 1 1 1 1 1 0 0 1 0 1 0 0 1 0]]
N-GRAM Representation for The more you practice, the more confident you become is [[1 0 0 0 0 0 1 1 0 1 0 1 1 0 1]]


In [22]:
"""
N-GRAM representation does improve semantic understanding of the sentence & sentences with the same phrase/group of words will have a similar representation.

Though we still have a long way to go as :

We haven’t found a solution to OOVs
Sparse representation poses a big challenge
As vocab increases, the dimensionality of representation increases
"""

'\nN-GRAM representation does improve semantic understanding of the sentence & sentences with the same phrase/group of words will have a similar representation. \n\nThough we still have a long way to go as : \n\nWe haven’t found a solution to OOVs\nSparse representation poses a big challenge\nAs vocab increases, the dimensionality of representation increases\n'

In [23]:
### TF-IDF

In [33]:
tf = TfidfVectorizer()
sample_sentences = ["the dictionary has many examples of how words are used",
                    "the more you practice, the more confident you become"]

tf_rep = tf.fit_transform(sample_sentences)
print(tf.vocabulary_)
print("\n")

print(f"TfIdf Representation for {sample_sentences[0]} is {tf_rep[0].toarray()} \n")
print(f"TfIdf Representation for {sample_sentences[1]} is {tf_rep[1].toarray()}")

{'the': 11, 'dictionary': 3, 'has': 5, 'many': 7, 'examples': 4, 'of': 9, 'how': 6, 'words': 13, 'are': 0, 'used': 12, 'more': 8, 'you': 14, 'practice': 10, 'confident': 2, 'become': 1}


TfIdf Representation for the dictionary has many examples of how words are used is [[0.32433627 0.         0.         0.32433627 0.32433627 0.32433627
  0.32433627 0.32433627 0.         0.32433627 0.         0.23076793
  0.32433627 0.32433627 0.        ]] 

TfIdf Representation for the more you practice, the more confident you become is [[0.         0.27708406 0.27708406 0.         0.         0.
  0.         0.         0.55416811 0.         0.27708406 0.39429518
  0.         0.         0.55416811]]


In [34]:
"""
TF-IDF is a single float value per word that solves a very particular problem that may come in handy in text classification a lot i.e. word importance
i.e. how important a particular word is in a document/sentence.
It is a sentence-level representation

It moves with intuition than if a certain word is present in every other sentence, chances that it might not be important.
For example: is, a, the, etc. while words that are rarely found in the sentences may be of higher importance.

TF(x) : (Frequency of word ‘x’ in sentence s) / (total tokens in sentence s)
IDF(x): log(total sentences/total sentences with word ‘x’)
TF_IDF(x): TF(x) * IDF(x)

F-IDF also struggles with the same problems of handling OOVs, sparsity & big dimensions but it does capture some semantics as well.

"""

'\nTF-IDF is a single float value per word that solves a very particular problem that may come in handy in text classification a lot i.e. word importance\ni.e. how important a particular word is in a document/sentence. \nIt is a sentence-level representation\n\nIt moves with intuition than if a certain word is present in every other sentence, chances that it might not be important. \nFor example: is, a, the, etc. while words that are rarely found in the sentences may be of higher importance.\n\n'

In [34]:
### Word2Vec

In [48]:
sample_para = """Bali is predominantly a Hindu country. Bali is known for its elaborate, traditional dancing. The dancing is inspired by its Hindi beliefs. Most of the dancing portrays tales of good versus evil. To watch the dancing is a breathtaking experience. Lombok has some impressive points of interest – the majestic Gunung Rinjani is an active volcano. It is the second highest peak in Indonesia. Art is a Balinese passion. Batik paintings and carved statues make popular souvenirs. Artists can be seen whittling and painting on the streets, particularly in Ubud. It is easy to appreciate each island as an attractive tourist destination. Majestic scenery; rich culture; white sands and warm, azure waters draw visitors like magnets every year. Snorkelling and diving around the nearby Gili Islands is magnificent. Marine fish, starfish, turtles and coral reef are present in abundance. Bali and Lombok are part of the Indonesian archipelago. Bali has some spectacular temples. The most significant is the Mother Temple, Besakih. The inhabitants of Lombok are mostly Muslim with a Hindu minority. Lombok remains the most understated of the two islands. Lombok has several temples worthy of a visit, though they are less prolific. Bali and Lombok are neighbouring islands."""
text = [ word.lower().split() for word in sample_para.split(".") ]

skipgram = word2vec.Word2Vec(text, window = 2, vector_size = 100, sg = 1, min_count = 1)
cbow = word2vec.Word2Vec(text, window = 2, vector_size = 100, sg = 0, min_count = 1)

print (f"Skipgram : {skipgram.wv.most_similar('dancing', topn=5)} ")
print (f"cbow : {cbow.wv.most_similar('dancing', topn=5)} ")

Skipgram : [('art', 0.285526841878891), ('inspired', 0.2409186065196991), ('remains', 0.21204082667827606), ('visitors', 0.20760729908943176), ('of', 0.20495805144309998)] 
cbow : [('art', 0.285190224647522), ('inspired', 0.239618718624115), ('remains', 0.21129843592643738), ('visitors', 0.2056775540113449), ('of', 0.20177268981933594)] 


In [35]:
"""
Continuous Bag Of Words (CBoW)

A very popular name in the word embedding space, Word2Vec is a neural network-based model for learning word embeddings.
It does solve the 2 most critical problems: lower dimension representations & capturing the meaning of the words using the context (vicinity words).

The idea behind CBoW is to train a NN that gives context words (vicinity words) as input & predicts the target word.
Choose an even number ‘m’.
Now for a target word, we will consider its ‘m’ neighboring word on either side (left & right) & prepare training data set.
Here, given the source text, for each word, we prepare training samples where vicinity words act as features & the blue highlighted ones are the target
"""

'\nA very popular name in the word embedding space, Word2Vec is a neural network-based model for learning word embeddings. \nIt does solve the 2 most critical problems: lower dimension representations & capturing the meaning of the words using the context (vicinity words).\n\nThe idea behind CBoW is to train a NN that gives context words (vicinity words) as input & predicts the target word.\n\n'

In [None]:
"""
Skip Gram

An exact opposite to CBoW, Skip Gram tries to predict context words given a single word as input.
Select ‘m’ similar to CBoW. We will be considering m words on both sides of every word for prediction but as separate samples.
The blue token becomes the input.
Next, similar to CBoW, we have a 2-layered NN where each vicinity word is fed & the same target word is predicted against each input word.

Interestingly, we require the weight of the hidden layer that acts as word embedding. The model trained can be dumped !!
"""

In [None]:
### Global Vectors (GloVe)

In [None]:
"""
In the case of Word2Vec, only vicinity words are used to derive an embedding for a given word.
This may be restrictive in the long run as the embedding will have a limited context.
GloVe, a word-level vector representation scheme, use the entire corpus to derive embedding for a word.

GloVe uses word-word co-occurrence probabilities to incorporate the essence of the entire corpus.
As it is a global statistic(derived using the entire dataset), the embedding generated is called GloVe or Global Vectors.

A word-word co-occurrence matrix is nothing but a 2d array that stores frequency of every possible pair of word in the corpus.
It can be imagined as a correlation matrix structure with instead of correlation values,
we have frequency of the 2 words occurring together in a specified window.
we will calculate word-word cooccurrence probabilities using the above matrix.
For P(A|B) = Freq(A∩B)/Freq(B)

P(k|ice)/P(k|steam) >>1 ,k is more associated with ice. As in case of P(solid|ice)/P(solid|steam)=8.9
P(k|ice)/P(k|steam) <<1, k is more related to steam. As P(gas|ice)/P(gas|steam)=0.085
P(k|ice)/P(k|steam)≈1, k is either related to both or unrelated. As in case of water & fashion.

"""

In [None]:
### fastText

In [50]:
sample_para = """Bali is predominantly a Hindu country. Bali is known for its elaborate, traditional dancing. The dancing is inspired by its Hindi beliefs. Most of the dancing portrays tales of good versus evil. To watch the dancing is a breathtaking experience. Lombok has some impressive points of interest – the majestic Gunung Rinjani is an active volcano. It is the second highest peak in Indonesia. Art is a Balinese passion. Batik paintings and carved statues make popular souvenirs. Artists can be seen whittling and painting on the streets, particularly in Ubud. It is easy to appreciate each island as an attractive tourist destination. Majestic scenery; rich culture; white sands and warm, azure waters draw visitors like magnets every year. Snorkelling and diving around the nearby Gili Islands is magnificent. Marine fish, starfish, turtles and coral reef are present in abundance. Bali and Lombok are part of the Indonesian archipelago. Bali has some spectacular temples. The most significant is the Mother Temple, Besakih. The inhabitants of Lombok are mostly Muslim with a Hindu minority. Lombok remains the most understated of the two islands. Lombok has several temples worthy of a visit, though they are less prolific. Bali and Lombok are neighbouring islands."""
text = [ word.lower().split() for word in sample_para.split(".") ]


fasttext = FastText(vector_size=100, window=2, min_count=1)
fasttext.build_vocab(text)
fasttext.train(text, total_examples=len(text), epochs=10)

print (f"Fasttext : {fasttext.wv.most_similar('dancing', topn=5)} ")

Fasttext : [('year', 0.2896951735019684), ('painting', 0.28574812412261963), ('predominantly', 0.24354396760463715), ('by', 0.22881247103214264), ('they', 0.21968737244606018)] 


In [None]:
################################################################# END OF NOTEBOOK ############################################################